- Experimented with a few models that performed similar in vaccine_train.ipynb. 
- Use Linear SVC to predict
- By Xiaoyi Yuan
- Aug 2017

# Move the label to the full dataset
- I made the mistake when seperating data for labeling and the rest
- I should have stored them in seperate files
- Now I have to find in the full dataset which are labeled 

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=100
pd.options.display.max_colwidth=200
from matplotlib import pyplot as plt
%matplotlib inline
import datetime
import os

In [2]:
fulldata = pd.read_csv("../data/vaccination_all_wrt_full.tsv",delimiter = '\t', dtype=str)
# delete duplicates:
fulldata= fulldata.drop_duplicates(keep="first")
print("the number of rows is: ",len(fulldata))
print("the number of column is: ", len(fulldata.columns))

the number of rows is:  660983
the number of column is:  21


In [3]:
# read the sample data (those for labeling)
#unfortunately the retweeted_id and response id here are all wrong:

data_xy=pd.read_csv("sample_xy_08_31.csv",dtype=str)
data_ja=pd.read_csv("sample_ja_08_31.csv",dtype=str)
n_read_xy=855
n_read_ja=639

# those we read
read_xy = data_xy[:n_read_xy]
read_ja = data_ja[:n_read_ja]

# those we read but did not label
delete_xy = read_xy[read_xy['label'].isnull()]
delete_ja = read_ja[read_ja['label'].isnull()]

# those we read but labeled
labeled_xy = read_xy[~read_xy['label'].isnull()]
labeled_ja = read_ja[~read_ja['label'].isnull()]

print(len(delete_xy))
print(len(delete_ja))

print(len(labeled_xy))
print(len(labeled_ja))


50
41
805
598


In [4]:
# delete read but unlabeled ones
fulldata = fulldata[~fulldata['id'].isin(delete_xy['id'])]
fulldata = fulldata[~fulldata['id'].isin(delete_ja['id'])]

In [5]:
# add the labels to the fulldata set by matching id

fulldata_xy = fulldata [fulldata['id'].isin(labeled_xy['id'])]
fulldata_ja = fulldata [fulldata['id'].isin(labeled_ja['id'])]
add_label=fulldata_xy.append(fulldata_ja)

labeled=labeled_xy.append(labeled_ja)
labeled = labeled[['id','label']]

# merge 
fulldata_add_label = pd.merge(add_label,labeled,on="id",how='left')

In [6]:
# now find the labeled_xy and labeled_ja id in the fulldata and add that to fulldata
# cannot add labeled_xy/ja directed because all other columns in that data are wrong

fulldata_no_label = fulldata [~fulldata['id'].isin(add_label['id'])]
fulldata = fulldata_no_label.append(fulldata_add_label).reset_index(drop=True)

In [10]:
# output the data into a new file (i.e. data with read&unlabeled deleted and labels moved.)
fulldata.to_csv("fulldata_08_31.csv",index=False)

# seperate training and testing dataset

In [18]:
data = pd.read_csv("fulldata_08_31.csv",dtype=str)

In [19]:
labeled = data[data['label'].notnull()]
unlabeled=data[data['label'].isnull()]

labeled.to_csv("labeled_08_31.csv",index=False)
unlabeled.to_csv("unlabeled_08_31.csv",index=False)

# compare models
- vaccination_train.py test different models on previously labeled data
- data labels changed since then, so test model again
- it's demonstrated earlier (on same data) that linear models perform better, so test linear models first.
- the code following is all from vaccination_train.py

## clean texts

In [3]:
# import data
labeled = pd.read_csv("labeled_08_31.csv",dtype=str)

In [7]:
labeled['label'].value_counts()

1     640
-1    417
0     346
Name: label, dtype: int64

In [8]:
import re
import string
import nltk

def clean_text(text):
#remove the emoji or other weird content (such as ðŸ‡ºðŸ‡) 
    text = ''.join(filter(lambda x: x in string.printable, text))
#remove urls and @
    text = re.sub(r"http\S+", "", text)
    text = re.sub("RT","",text)
    text = ' '.join(filter(lambda x:x[0]!="@",text.split()))
    text = ' '.join(filter(lambda x:x[0]!="&",text.split()))
    text= " ".join(list(map(lambda x:x.strip("#"),text.split()))) 
    return text

In [9]:
# replace the text with its cleaned version
labeled["text"]=labeled["text"].map(clean_text)

## Tokenization and modeling

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [11]:
# before testing models, create a customed tokenizer. 
import spacy 

regexp = re.compile("(?u)\\b\\w\\w+\\b")
en_nlp=spacy.load('en')
old_tokenizer=en_nlp.tokenizer
en_nlp.tokenizer=lambda string: old_tokenizer.tokens_from_list (regexp.findall(string))

def custom_tokenizer(document):
    doc_spacy=en_nlp(document,entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]



In [12]:
#split data into training (80%) and test data (20%)
X_train, X_test, y_train, y_test = train_test_split(labeled["text"], labeled["label"], test_size=0.35)

In [14]:
# modeling function
def modeling_accuracy (model,param_grid):
    vect = TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None)
    pipe = make_pipeline(vect,model)
    grid = GridSearchCV (pipe, param_grid, cv=StratifiedKFold(n_splits=5),scoring="accuracy")
    grid.fit(X_train,y_train)
    print("the best cv score is: ", grid.best_score_)
    print("parameters of best cv score are: ", grid.best_params_)
    print("the score on test set is: ", grid.score(X_test,y_test)) 

## logistic regression

In [16]:
# run logistic regression on the whole labeled data:

param_grid={'logisticregression__C':[0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
           'tfidfvectorizer__min_df':[1,2,3,4,5]}

modeling_accuracy (LogisticRegression(),param_grid)

the best cv score is:  0.694840834248
parameters of best cv score are:  {'logisticregression__C': 0.1, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.743902439024


## linear SVC

In [129]:
param_grid={'linearsvc__C':[1, 0.1,0.01,0.001],
           'tfidfvectorizer__ngram_range':[(1,1),(1,2)],
           'tfidfvectorizer__min_df':[1,2]}

modeling(LinearSVC(),param_grid)

the best cv score is:  0.708013172338
parameters of best cv score are:  {'tfidfvectorizer__ngram_range': (1, 1), 'linearsvc__C': 0.001, 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.723577235772


## non-linear SVC

In [86]:
param_grid={
            'svc__C':[0.01,0.1,1,10],
            'svc__gamma':[0.01,0.1,1,10],
            'tfidfvectorizer__min_df':[1,2,3],
            'tfidfvectorizer__ngram_range':[(1,1)]}

modeling (SVC(),param_grid)

the best cv score is:  0.665873959572
parameters of best cv score are:  {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__min_df': 2, 'svc__gamma': 0.01, 'svc__C': 10}
the score on test set is:  0.61743772242


## SGD + linear SVC

In [130]:
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2)],
    'tfidfvectorizer__min_df':[1],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001],
    "sgdclassifier__fit_intercept":[True],
    "sgdclassifier__learning_rate":['optimal']}

modeling (SGDClassifier(),param_grid)

the best cv score is:  0.713501646542
parameters of best cv score are:  {'sgdclassifier__alpha': 0.1, 'tfidfvectorizer__ngram_range': (1, 2), 'sgdclassifier__learning_rate': 'optimal', 'sgdclassifier__fit_intercept': True, 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.70325203252


## SGD + logistic regression

In [91]:
param_grid={
    "tfidfvectorizer__ngram_range":[(1,1),(1,2),(1,3)],
    'tfidfvectorizer__min_df':[1,2,3,4,5],
    "sgdclassifier__alpha":[1,0.1,0.01,0.001],
    "sgdclassifier__fit_intercept":[True],
    "sgdclassifier__learning_rate":['optimal'],
    "sgdclassifier__loss":['log']}

modeling(SGDClassifier(),param_grid)

the best cv score is:  0.702734839477
parameters of best cv score are:  {'sgdclassifier__alpha': 0.1, 'sgdclassifier__loss': 'log', 'tfidfvectorizer__ngram_range': (1, 2), 'sgdclassifier__learning_rate': 'optimal', 'sgdclassifier__fit_intercept': True, 'tfidfvectorizer__min_df': 1}
the score on test set is:  0.681494661922


# predict
- linear SVC and SGD+SVC performance is very similar but I choose linear SVC because the parameters are not as sensitive and it's more interpretable. 

In [18]:
# import labeled and unlabeled data

labeled = pd.read_csv("labeled_08_31.csv",dtype=str)
unlabeled = pd.read_csv("unlabeled_08_31.csv",dtype=str)

In [108]:
pos=labeled[labeled['label']=='1']
neg=labeled[labeled['label']=='-1']
neu=labeled[labeled['label']=='0']

In [110]:
print(len(pos))
print(len(neg))
print(len(neu))

640
417
346


In [111]:
print(len(pos.groupby('author')))
print(len(neg.groupby('author')))
print(len(neu.groupby('author')))

629
303
341


In [131]:
len(labeled.groupby('author'))

1253

In [112]:
len(unlabeled)

659489

In [19]:
labeled_text=labeled["text"]
target=labeled["label"]
unlabeled_text=unlabeled["text"]

In [20]:
# clean 
labeled_text=labeled_text.map(clean_text)
unlabeled_text=unlabeled_text.map(clean_text)

In [47]:
# vectorize use the best parameter from experiments in linear SVC, i.e. n_gram and min_df
vect = TfidfVectorizer(stop_words="english",tokenizer=custom_tokenizer,norm=None, 
                                   ngram_range=(1,1),min_df=1)
vect = vect.fit(labeled_text)
labeled_vect = vect.transform(labeled_text)
unlabeled_vect = vect.transform(unlabeled_text)

In [48]:
model = LinearSVC(C=0.001)
model.fit(labeled_vect,target)

LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [53]:
labeled_vect

<1403x3180 sparse matrix of type '<class 'numpy.float64'>'
	with 11892 stored elements in Compressed Sparse Row format>

In [52]:
unlabeled_vect

<659489x3180 sparse matrix of type '<class 'numpy.float64'>'
	with 4704032 stored elements in Compressed Sparse Row format>

In [57]:
len(vect.vocabulary_)

3180

In [92]:
xval = cross_val_score(model,labeled_vect,target, cv = 5, scoring="accuracy")
print(xval.mean())
print(xval.max())
print(xval.std())

0.70709538953
0.746428571429
0.0316501516449


In [103]:
xval = cross_val_score(model,labeled_vect,target, cv = 5, scoring="f1_macro")
print(xval.mean())
print(xval.max())
print(xval.std())

0.684651607003
0.73132315713
0.0354034729516


In [102]:
xval = cross_val_score(model,labeled_vect,target, cv = 5, scoring="precision_macro")
print(xval.mean())
print(xval.max())
print(xval.std())

0.730950067556
0.764464215173
0.0275413450328


In [101]:
xval = cross_val_score(model,labeled_vect,target, cv = 5, scoring="recall_macro")
print(xval.mean())
print(xval.max())
print(xval.std())

0.669108065284
0.715448751528
0.0356397314602


In [38]:
output=model.predict(unlabeled_vect)

# insert this output back to the original unlabeled dataframe
unlabeled['label']=pd.DataFrame(output)

#output the prediction
unlabeled.to_csv("unlabeled_predicted_08_31.csv",index=False)

In [40]:
# create a new file with labeled and predicted data 
fulldata_learnt = unlabeled.append(labeled)
fulldata_learnt.to_csv("fulldata_learnt_08_31.csv",index=False)

In [17]:
(0.66312057+ 0.74021352+  0.74642857+  0.68928571+  0.69642857)/5

0.707095388

In [121]:
output=model.predict(unlabeled_vect)
unlabeled['label']=pd.DataFrame(output)

In [114]:
len(output)

659489

In [118]:
len(output[output=='1'])

390787

In [119]:
len(output[output=='-1'])

151860

In [120]:
len(output[output=='0'])

116842

In [123]:
pos=unlabeled[unlabeled['label']=='1']

In [125]:
len(pos.groupby('author'))

205854

In [126]:
neg=unlabeled[unlabeled['label']=='-1']

In [127]:
len(neg.groupby('author'))

41645

In [128]:
neu=unlabeled[unlabeled['label']=='0']

In [129]:
len(neu.groupby('author'))

66054

In [130]:
len(unlabeled.groupby('author'))

269226