- Training vaccine twitter data: broadly testing different classifiers and parameters without final prediction.
- by Xiaoyi Yuan,
- July 2017

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=100
pd.options.display.max_colwidth=200
from matplotlib import pyplot as plt
%matplotlib inline
import datetime
import os

# Import and create dataframe

In [2]:
path ="/Users/Charlotte/Google Drive/Jaxy Project/training"
data_xy=pd.read_csv(os.path.join(path,"sample_xy_07_28.csv"))
data_ja=pd.read_csv(os.path.join(path,"sample_ja_07_28.csv"))
n_labeled_xy=807
n_labeled_ja=630

In [3]:
data_xy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 22 columns):
id              30000 non-null float64
location        19218 non-null object
country         15749 non-null object
state           12442 non-null object
zip             9582 non-null object
x               15863 non-null float64
y               15863 non-null float64
published_at    30000 non-null object
author          30000 non-null object
coords_from     15863 non-null object
mood            30000 non-null int64
retweeted_id    13036 non-null float64
response_id     1960 non-null float64
lang            30000 non-null object
text            30000 non-null object
label           780 non-null float64
autism          30000 non-null int64
measl           30000 non-null int64
mump            30000 non-null int64
vaccin          30000 non-null int64
vax             30000 non-null int64
vaxin           30000 non-null int64
dtypes: float64(6), int64(7), object(9)
memory usage: 5.0+ M

In [4]:
data_ja.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 22 columns):
id              30000 non-null float64
location        19092 non-null object
country         15616 non-null object
state           12410 non-null object
zip             9573 non-null object
x               15731 non-null float64
y               15731 non-null float64
published_at    30000 non-null object
author          30000 non-null object
coords_from     15731 non-null object
mood            30000 non-null int64
text            30000 non-null object
label           603 non-null float64
retweeted_id    13233 non-null float64
response_id     1937 non-null float64
lang            30000 non-null object
autism          30000 non-null int64
measl           30000 non-null int64
mump            30000 non-null int64
vaccin          30000 non-null int64
vax             30000 non-null int64
vaxin           30000 non-null int64
dtypes: float64(6), int64(7), object(9)
memory usage: 5.0+ M

In [5]:
#double check the number for each label. 

dfv_xy=data_xy['label'].value_counts(dropna=False)
dfv_ja=data_ja['label'].value_counts(dropna=False)

#check how many data is being read but not labeled (skipped data)
null_xy=data_xy[:n_labeled_xy]["label"].isnull().value_counts()
null_ja=data_ja[:n_labeled_ja]["label"].isnull().value_counts()

print(dfv_xy)
print(dfv_ja)
print(null_xy)
print(null_ja)

NaN     29220
 0.0      302
 1.0      271
-1.0      207
Name: label, dtype: int64
NaN     29397
 0.0      240
 1.0      213
-1.0      150
Name: label, dtype: int64
False    780
True      27
Name: label, dtype: int64
False    603
True      27
Name: label, dtype: int64


In [6]:
#get the labeled data
data_xy=data_xy[pd.notnull(data_xy["label"])]
data_ja=data_ja[pd.notnull(data_ja["label"])]

#Put xy and ja labeled data together
data=data_xy.append(data_ja)

In [7]:
#get the data with labels of either 1 or -1
data_xy_binary=data_xy[data_xy.label!= 0]
data_ja_binary=data_ja[data_ja.label!= 0]


In [8]:
print(len(data_xy_binary))
print(len(data_ja_binary))

478
363


In [9]:
data_binary=data_xy_binary.append(data_ja_binary)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1383 entries, 0 to 628
Data columns (total 22 columns):
author          1383 non-null object
autism          1383 non-null int64
coords_from     752 non-null object
country         746 non-null object
id              1383 non-null float64
label           1383 non-null float64
lang            1383 non-null object
location        891 non-null object
measl           1383 non-null int64
mood            1383 non-null int64
mump            1383 non-null int64
published_at    1383 non-null object
response_id     71 non-null float64
retweeted_id    608 non-null float64
state           601 non-null object
text            1383 non-null object
vaccin          1383 non-null int64
vax             1383 non-null int64
vaxin           1383 non-null int64
x               752 non-null float64
y               752 non-null float64
zip             450 non-null object
dtypes: float64(6), int64(7), object(9)
memory usage: 248.5+ KB


# Clean texts

In [11]:
#remove url,non-sensical words, emojis and @ (kept the content of hashtags).

import re
import string
import nltk

def clean_text(text):
#remove the emoji or other weird content (such as ðŸ‡ºðŸ‡) 
    text = ''.join(filter(lambda x: x in string.printable, text))
#remove urls and @
    text = re.sub(r"http\S+", "", text)
    text = re.sub("RT","",text)
    text = ' '.join(filter(lambda x:x[0]!="@",text.split()))
    text = ' '.join(filter(lambda x:x[0]!="&",text.split()))
    text= " ".join(list(map(lambda x:x.strip("#"),text.split()))) 
    return text

In [12]:
# replace the text with its cleaned version
data["text"]=data["text"].map(clean_text)
data_binary["text"]=data_binary["text"].map(clean_text)

In [13]:
# export a copy using today's date as the file name
'''
date=datetime.date.today()
file_name = os.path.join(path, str(date) + "." + "csv")
data.to_csv(file_name)
'''

'\ndate=datetime.date.today()\nfile_name = os.path.join(path, str(date) + "." + "csv")\ndata.to_csv(file_name)\n'

# Tokenization and modeling

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

## model 1: logistic regression



In [15]:
# before testing models, create a customed tokenizer. 
import spacy 

regexp = re.compile("(?u)\\b\\w\\w+\\b")
en_nlp=spacy.load('en')
old_tokenizer=en_nlp.tokenizer
en_nlp.tokenizer=lambda string: old_tokenizer.tokens_from_list (regexp.findall(string))

def custom_tokenizer(document):
    doc_spacy=en_nlp(document,entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]


In [16]:
#split data into training (80%) and test data (20%)
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [17]:
# run logistic regression on the whole labeled data:

pipe = make_pipeline(TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None),LogisticRegression())
param_grid={'logisticregression__C':[0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
           'tfidfvectorizer__min_df':[1,2,3,4,5]}
grid=GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is: ", grid.best_score_)
print("parameters of best cv score are: ", grid.best_params_)
print("the score on test set is: ", grid.score(X_test,y_test))

the best cv score is:  0.655515370705
parameters of best cv score are:  {'logisticregression__C': 0.01, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.671480144404


In [18]:
# run logistic regression on binary classifier data (negative and positive)

pipe = make_pipeline(TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None),LogisticRegression())
param_grid={'logisticregression__C':[0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
           'tfidfvectorizer__min_df':[1,2,3,4,5]}
grid=GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is: ", grid.best_score_)
print("parameters of best cv score are: ", grid.best_params_)
print("the score on test set is: ", grid.score(X_test_binary,y_test_binary))

the best cv score is:  0.813988095238
parameters of best cv score are:  {'logisticregression__C': 0.01, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.804733727811


In [19]:
# try CountVectorizer instead of TfidfVectorizer and see which one is the best for featuring

vect=CountVectorizer(min_df=3, stop_words="english",tokenizer=custom_tokenizer,ngram_range=(1,1)).fit(X_train)
X_train=vect.transform(X_train)
param_grid={'C':[0.1,0.01,0.001]}
grid=GridSearchCV(LogisticRegression(), param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is: ",grid.best_score_)
X_test=vect.transform(X_test)
print("the score on test set is: ",grid.score(X_test,y_test))

the best cv score is:  0.641952983725
the score on test set is:  0.631768953069


- Based on the cv score and score on test set, CountVectorizer is not performing as well as TfidfVectorizer

## model 2: Support Vector Classifier (SVC)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [21]:
#SVC on 3-classifier data

pipe = make_pipeline(TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None),LinearSVC())
param_grid={'linearsvc__C':[1, 0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
           'tfidfvectorizer__min_df':[1,2,3,4,5]}
grid =GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)

print("parameters of best cv score are: ", grid.best_params_)
print("the score on test set is: ", grid.score(X_test,y_test))

the best cv score is:  0.662748643761
parameters of best cv score are:  {'tfidfvectorizer__ngram_range': (1, 1), 'linearsvc__C': 0.001, 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.689530685921


In [22]:
#SVC on 2-classifier data
pipe = make_pipeline(TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None),LinearSVC())
param_grid={'linearsvc__C':[1, 0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
           'tfidfvectorizer__min_df':[1,2,3,4,5]}
grid =GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is: ", grid.best_score_)
print("parameters of best cv score are: ", grid.best_params_)
print("the score on test set is: ", grid.score(X_test_binary,y_test_binary))

the best cv score is:  0.815476190476
parameters of best cv score are:  {'tfidfvectorizer__ngram_range': (1, 2), 'linearsvc__C': 0.001, 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.798816568047


- Same here, in LinearSVC model, TfidfVectorizer performs better than CountVectorizer

## model 3: non-linear SVC

In [23]:
from sklearn.svm import SVC

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [25]:
# run non-linear SVC on 3-classifier data

vect = TfidfVectorizer(stop_words="english",ngram_range=(1,1),tokenizer=custom_tokenizer,norm=None)
param_grid={
            'svc__C':[0.01,0.1,1,10],
            'svc__gamma':[0.01,0.1,1,10],
            'tfidfvectorizer__min_df':[1,2,3]
}
pipe=make_pipeline(vect,SVC())
grid=GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test) )

the best cv score is : 0.644665461121
the best parameter is:  {'svc__C': 1, 'svc__gamma': 0.01, 'tfidfvectorizer__min_df': 3}
the score on the test set is:  0.635379061372


In [26]:
# run non-linear SVC on 2-classifier data

vect = TfidfVectorizer(stop_words="english",ngram_range=(1,1),tokenizer=custom_tokenizer,norm=None)
param_grid={
            'svc__C':[0.01,0.1,1,10],
            'svc__gamma':[0.01,0.1,1,10],
            'tfidfvectorizer__min_df':[1,2,3]
}
pipe=make_pipeline(vect,SVC())
grid=GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary) )

the best cv score is : 0.782738095238
the best parameter is:  {'svc__C': 1, 'svc__gamma': 0.01, 'tfidfvectorizer__min_df': 3}
the score on the test set is:  0.751479289941


- SVM does not increase the accuracy score (linear models perform better with high dimensional data?)

## model 4: K nearest neighbors

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [29]:
#run K nearest neighbor on 3-classifier data

vect =TfidfVectorizer(min_df=3, stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "kneighborsclassifier__n_neighbors":[1,5,10,15,20,30]
}
pipe=make_pipeline(vect,KNeighborsClassifier())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test) )

the best cv score is : 0.552441229656
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 4, 'kneighborsclassifier__n_neighbors': 1}
the score on the test set is:  0.595667870036


In [30]:
# run K nearest neighbor on 2-classifier data
vect =TfidfVectorizer(min_df=3, stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "kneighborsclassifier__n_neighbors":[1,5,10,15,20,30]
}
pipe=make_pipeline(vect,KNeighborsClassifier())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary) )

the best cv score is : 0.729166666667
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__min_df': 4, 'kneighborsclassifier__n_neighbors': 1}
the score on the test set is:  0.668639053254


- K nearest neighbors classifier by far has the lowest accuracy scores

## model 5: Nearest Centroid

In [31]:
from sklearn.neighbors import NearestCentroid

In [32]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [33]:
#Run nearest centroid on 3-classifier data

vect =TfidfVectorizer( stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5]
}
pipe=make_pipeline(vect,NearestCentroid())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))

the best cv score is : 0.661844484629
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 1}
the score on the test set is:  0.628158844765


In [34]:
#Run nearest centroid on 2-classifier data

vect =TfidfVectorizer( stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5]
}
pipe=make_pipeline(vect,NearestCentroid())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary))

the best cv score is : 0.787202380952
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 2}
the score on the test set is:  0.804733727811


- the result of NearestCentroid is pretty good.

## Model 6: SGD + Linear SVM/Logistic Regression

In [35]:
from sklearn.linear_model import SGDClassifier
# SGDClassifier is a linear classifiers (SVM, logistic regression, a.o.) with SGD training

In [36]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [37]:
# run SGD + Linear SVM on 3-classifier data

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
sgd = SGDClassifier(fit_intercept=True,learning_rate='optimal')
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001]
}
pipe=make_pipeline(vect,sgd)
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))

the best cv score is : 0.679023508137
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 1), 'sgdclassifier__alpha': 0.1, 'tfidfvectorizer__min_df': 2}
the score on the test set is:  0.620938628159


In [38]:
# run SGD + Linear SVM on 2-classifier data

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
sgd = SGDClassifier(fit_intercept=True,learning_rate='optimal')
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001]
}
pipe=make_pipeline(vect,sgd)
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary))

the best cv score is : 0.821428571429
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'sgdclassifier__alpha': 1, 'tfidfvectorizer__min_df': 1}
the score on the test set is:  0.804733727811


In [39]:
# run SGD + logistic regression on 3-classifier data

sgd_lr=SGDClassifier(loss= 'log',fit_intercept=True,learning_rate='optimal')
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001]
}
pipe=make_pipeline(vect,sgd_lr)
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))

the best cv score is : 0.679023508137
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 1), 'sgdclassifier__alpha': 0.1, 'tfidfvectorizer__min_df': 2}
the score on the test set is:  0.631768953069


In [40]:
# run SGD + logistic regression on 2-classifier data

sgd_lr=SGDClassifier(loss= 'log',fit_intercept=True,learning_rate='optimal')
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001]
}
pipe=make_pipeline(vect,sgd_lr)
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary))

the best cv score is : 0.825892857143
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 1), 'sgdclassifier__alpha': 0.1, 'tfidfvectorizer__min_df': 1}
the score on the test set is:  0.757396449704


- SGD + linear SVM or logistic regression provides slightly lower accuracy than without SGD

## Model 7: Naive Bayes

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2)
X_train_binary,X_test_binary, y_train_binary,y_test_binary=train_test_split(data_binary["text"], data_binary["label"], test_size=0.2)

In [43]:
# 3-classifier

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "multinomialnb__alpha":[0.1,1,10]
}
pipe=make_pipeline(vect,MultinomialNB())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)

print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))

the best cv score is : 0.660940325497
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 1, 'multinomialnb__alpha': 10}
the score on the test set is:  0.660649819495


In [44]:
#2-classifier

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "multinomialnb__alpha":[0.1,1,10]
}
pipe=make_pipeline(vect,MultinomialNB())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary))

the best cv score is : 0.809523809524
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__min_df': 1, 'multinomialnb__alpha': 10}
the score on the test set is:  0.786982248521


In [45]:
from sklearn.naive_bayes import BernoulliNB

In [46]:
#3-classifier 

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "bernoullinb__alpha":[0.1,1,10]
}
pipe=make_pipeline(vect,BernoulliNB())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))

the best cv score is : 0.649186256781
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 2), 'bernoullinb__alpha': 0.1, 'tfidfvectorizer__min_df': 1}
the score on the test set is:  0.649819494585


In [47]:
#3-classifier 

vect =TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "bernoullinb__alpha":[0.1,1,10]
}
pipe=make_pipeline(vect,BernoulliNB())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train_binary,y_train_binary)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test_binary,y_test_binary))

the best cv score is : 0.799107142857
the best parameter is:  {'tfidfvectorizer__ngram_range': (1, 1), 'bernoullinb__alpha': 1, 'tfidfvectorizer__min_df': 2}
the score on the test set is:  0.798816568047


- naive bayes is performing well on the data

## model 8: xgboost (needs tuning)

try

In [48]:
import sklearn
from xgboost import XGBClassifier

In [49]:
'''
vect =TfidfVectorizer(min_df=3,stop_words="english",tokenizer=custom_tokenizer,norm=None)
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)]}
pipe=make_pipeline(vect,XGBClassifier())
grid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")
grid.fit(X_train,y_train)
print("the best cv score is :", grid.best_score_)
print("the best parameter is: ", grid.best_params_)
print("the score on the test set is: ",grid.score(X_test,y_test))
'''

'\nvect =TfidfVectorizer(min_df=3,stop_words="english",tokenizer=custom_tokenizer,norm=None)\nparam_grid={\n    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)]}\npipe=make_pipeline(vect,XGBClassifier())\ngrid=GridSearchCV(pipe,param_grid,cv=StratifiedKFold(n_splits=10),scoring="accuracy")\ngrid.fit(X_train,y_train)\nprint("the best cv score is :", grid.best_score_)\nprint("the best parameter is: ", grid.best_params_)\nprint("the score on the test set is: ",grid.score(X_test,y_test))\n'

- This model needs tunning, I will NOT use the result of this model in the paper but leave it here for now. 

# Combining model results (not used in paper)

- The best results comes from logistic regression, linear SVC, multinomial naive bayes, and Bernolli naive bayes. Combine the result of these models and take the majority vote. 

In [70]:
# use the parameters tuned before to predict training data by 4 models (if it's 2 vs 2, then it goes by the result from linear SVC)

# create a new dataframe
combined_train = pd.DataFrame(data["text"][:650])
combined_train["label"]=data["label"][:650]

combined_test = pd.DataFrame(data["text"][651:])
combined_test["label"]=data["label"][651:]

In [71]:
combined_train.head()

Unnamed: 0,text,label
0,Are all u adults ready for your onslaught of vaccines? Say NO to Vaccine Mandate! NoMandates,-1.0
1,Kagro in the Morning: on Chapel Hill; vax roundup; King collapse; new AUMF state secrets,0.0
3,CANCER VACCINES??!!?!!?,0.0
4,Mexico's measles vaccination rate? 99%. The US'? 92%. Why immigrants aren't behind measles:,0.0
5,Vaccine stories not compelling without: aborted fetuses homosexuality autism Hitler fascism conspiracy promiscuity coverup U.N.,1.0


In [72]:
combined_test.head()

Unnamed: 0,text,label
673,"Illinois announced EMERGENCY voting on measles vaccine for all, no exemptions The vote is tomorrow. you know, we... ht",0.0
674,Author of controversial vaccination/autism study shunned by Salem via,0.0
675,funnyordie: clippership: Finally! A safe option for anti-vaxxers. antivax,1.0
676,autism vax pharma Excellent. The audacity of pharma and MD's that say they know more about the mothers child...,-1.0
677,Africa reaches 6 months w/out a case of polio. We must continue to reach every child w/ the polio vaccine. endpolio,0.0


In [77]:
# fill in the result of prediction by these models using the best parameters got above

#first, logistic regression

vect = TfidfVectorizer(ngram_range= (1,2), min_df=1, stop_words="english",tokenizer=custom_tokenizer,norm=None).fit(combined_train["text"])
train_vect = vect.transform(combined_train["text"])

logreg=LogisticRegression(C=0.01)
logreg.fit(train_vect,combined_train["label"])

test_vect=vect.transform(combined_test["text"])
logreg_result = logreg.predict(test_vect)
combined_test["logreg_result"]=logreg_result

In [81]:
# second, linear SVC

vect = TfidfVectorizer(ngram_range= (1,1), min_df=1, stop_words="english",tokenizer=custom_tokenizer,norm=None).fit(combined_train["text"])
train_vect=vect.transform(combined_train["text"])

svc = LinearSVC(C=0.001)
svc.fit(train_vect,combined_train['label'])

test_vect=vect.transform(combined_test['text'])
svc_result=svc.predict(test_vect)
combined_test["svc_result"]=svc_result

In [83]:
# third, multinomial naive bayes

vect = TfidfVectorizer(ngram_range= (1,2), min_df=1, stop_words="english",tokenizer=custom_tokenizer,norm=None).fit(combined_train["text"])
train_vect=vect.transform(combined_train["text"])

multinomialNB = MultinomialNB(alpha=10)
multinomialNB.fit(train_vect,combined_train['label'])

test_vect=vect.transform(combined_test['text'])
multinomialNB_result=multinomialNB.predict(test_vect)
combined_test["multinomialNB_result"]=multinomialNB_result

In [84]:
# last, Bernoulli naive bayes

bernoulliNB=BernoulliNB(alpha=10)
bernoulliNB.fit(train_vect,combined_train['label'])

bernoulliNB_result=bernoulliNB.predict(test_vect)
combined_test["bernoulliNB_result"]=bernoulliNB_result

In [85]:
combined_test.head(10)

Unnamed: 0,text,label,logreg_result,svc_result,multinomialNB_result,bernoulliNB_result
673,"Illinois announced EMERGENCY voting on measles vaccine for all, no exemptions The vote is tomorrow. you know, we... ht",0.0,1.0,1.0,1.0,0.0
674,Author of controversial vaccination/autism study shunned by Salem via,0.0,-1.0,-1.0,-1.0,0.0
675,funnyordie: clippership: Finally! A safe option for anti-vaxxers. antivax,1.0,1.0,1.0,1.0,0.0
676,autism vax pharma Excellent. The audacity of pharma and MD's that say they know more about the mothers child...,-1.0,-1.0,-1.0,-1.0,0.0
677,Africa reaches 6 months w/out a case of polio. We must continue to reach every child w/ the polio vaccine. endpolio,0.0,0.0,-1.0,1.0,0.0
678,The Anti-Vaccine Generation: How Movement Against Shots Got Its Start via,1.0,1.0,1.0,1.0,0.0
679,Powerful Anti-HIV Agent Can Work in a Vaccine,0.0,0.0,0.0,0.0,0.0
681,"No, only that I veered a bit from specific vaccine discussion, but you did that yourself too.",0.0,1.0,1.0,1.0,0.0
683,Ebola vaccine trial to start on volunteers in Liberia,0.0,0.0,0.0,0.0,0.0
684,How vaccinations may have opened up a new front in the GOP culture wars: (John Locher/AP),1.0,1.0,1.0,0.0,0.0


In [154]:
result=combined_test.iloc[:,2:].mode(axis=1)

In [155]:
result

Unnamed: 0,0,1
673,1.0,
674,-1.0,
675,1.0,
676,-1.0,
677,0.0,
678,1.0,
679,0.0,
681,1.0,
683,0.0,
684,1.0,


In [136]:
n=0
for n in range(len(result)):
    if np.isnan(result.iloc[n,1]):
        result.set_value(n,0,combined_test.iloc[n,3])
    n = n + 1

In [156]:
combined_test["majority_vote"]=result[0]

I don't think combining these models this way would work.