In [70]:
from sklearn.cross_validation import train_test_split,cross_val_score, KFold,cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np

In [2]:
# Read the cleaned Obama tweet set, then discard every row that has a missing
# field. The row count is printed before and after the drop.
obamaData = pd.read_csv("obamaCleanedData.csv")
print(obamaData.shape[0])
obamaData = obamaData.dropna(axis=0)
print(obamaData.shape[0])

5624
5600


## Multinomial Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on raw term counts, evaluated with 10-fold cross-validation.
clf_nb = MultinomialNB()
# This vectorizer is also used by the MLP, LinearSVC and voting cells further down.
count_vectorizer = CountVectorizer()

# Labels are handled as fixed-width byte strings (at most 6 bytes each).
# Converting once up front lets every fold share the same sorted class list.
all_labels = np.asarray(obamaData['label'].values, dtype="|S6")
class_labels = np.unique(all_labels)

# KFold is the legacy sklearn.cross_validation class imported at the top of the
# notebook: it is built from the sample count and iterated directly.
k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
# Sized from the observed classes rather than hard-coded to 3x3.
confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    test_text = obamaData.iloc[test_indices]['tweet'].values
    targets = all_labels[train_indices]
    test_y = all_labels[test_indices]

    # The vocabulary is learned from the training fold only.
    counts = count_vectorizer.fit_transform(train_text)
    clf_nb.fit(counts, targets)
    predictions = clf_nb.predict(count_vectorizer.transform(test_text))

    # Passing labels= keeps every per-fold result the same shape and class
    # order, even if a fold happens to miss one of the classes.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, average=None, labels=class_labels)
    r = recall_score(test_y, predictions, average=None, labels=class_labels)
    # f1_score equals 2*p*r/(p+r) but yields 0 instead of NaN when p + r == 0.
    scores.append(f1_score(test_y, predictions, average=None, labels=class_labels))
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)
    recall.append(r)

# Accuracy is a percentage; precision/recall/F-score are per-class means over the folds.
print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))

Accuracy: 58.8214285714
10-fold: [59.642857142857146, 60.178571428571431, 57.321428571428569, 60.178571428571431, 59.285714285714285, 56.607142857142854, 59.464285714285715, 59.821428571428569, 60.714285714285715, 55.0]
Precision: [ 0.56664113  0.59764883  0.6117596 ]
Recall: [ 0.70950782  0.43947822  0.62050612]
Confusion matrix:
[[1397  332  239]
 [ 687  866  417]
 [ 381  250 1031]]
F-Score: [ 0.62978161  0.50553055  0.61549503]


## Neural Network Multi Layer Perceptron

In [51]:
from sklearn.neural_network import MLPClassifier

# Two-hidden-layer perceptron (50 and 10 units, tanh) trained with SGD and an
# adaptive learning rate; random_state is fixed so runs are repeatable.
clf_nn_MLP = MLPClassifier(solver='sgd', alpha=0.001,hidden_layer_sizes=(50, 10), learning_rate = 'adaptive', random_state=42,activation='tanh')

# Labels are handled as fixed-width byte strings (at most 6 bytes each).
# Converting once up front lets every fold share the same sorted class list.
all_labels = np.asarray(obamaData['label'].values, dtype="|S6")
class_labels = np.unique(all_labels)

# KFold is the legacy sklearn.cross_validation class imported at the top of the
# notebook: it is built from the sample count and iterated directly.
k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
# Sized from the observed classes rather than hard-coded to 3x3.
confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    test_text = obamaData.iloc[test_indices]['tweet'].values
    targets = all_labels[train_indices]
    test_y = all_labels[test_indices]

    # count_vectorizer is the instance created in the Naive Bayes cell; it is
    # refitted here so the vocabulary comes from the training fold only.
    counts = count_vectorizer.fit_transform(train_text)
    clf_nn_MLP.fit(counts, targets)
    predictions = clf_nn_MLP.predict(count_vectorizer.transform(test_text))

    # Passing labels= keeps every per-fold result the same shape and class
    # order, even if a fold happens to miss one of the classes.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, average=None, labels=class_labels)
    r = recall_score(test_y, predictions, average=None, labels=class_labels)
    # f1_score equals 2*p*r/(p+r) but yields 0 instead of NaN when p + r == 0.
    scores.append(f1_score(test_y, predictions, average=None, labels=class_labels))
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)
    recall.append(r)

# Accuracy is a percentage; precision/recall/F-score are per-class means over the folds.
print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))

Accuracy: 58.5178571429
10-fold: [58.392857142857146, 61.071428571428569, 57.142857142857146, 58.75, 58.392857142857146, 55.535714285714285, 60.714285714285715, 60.535714285714285, 58.571428571428569, 56.071428571428569]
Precision: [ 0.60942869  0.54434097  0.60740664]
Recall: [ 0.58910866  0.5589335   0.61058372]
Confusion matrix:
[[1161  533  274]
 [ 488 1101  381]
 [ 257  390 1015]]
F-Score: [ 0.59846812  0.55064503  0.60873694]




## Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 22 trees with per-bootstrap class re-weighting.
# random_state is pinned (same seed as the MLP cell) so that the reported
# metrics and the pickled model can be reproduced on a re-run.
clf_rfc = RandomForestClassifier(n_estimators=22,class_weight="balanced_subsample",random_state=42)
# This cell uses its own vectorizer instead of the shared count_vectorizer.
count_vectorizer_rfc = CountVectorizer()
# tfidf_transformer_rfc = TfidfTransformer()

# Labels are handled as fixed-width byte strings (at most 6 bytes each).
# Converting once up front lets every fold share the same sorted class list.
all_labels = np.asarray(obamaData['label'].values, dtype="|S6")
class_labels = np.unique(all_labels)

# KFold is the legacy sklearn.cross_validation class imported at the top of the
# notebook: it is built from the sample count and iterated directly.
k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
# Sized from the observed classes rather than hard-coded to 3x3.
confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    test_text = obamaData.iloc[test_indices]['tweet'].values
    targets = all_labels[train_indices]
    test_y = all_labels[test_indices]

    # The vocabulary is learned from the training fold only.
    counts_rfc = count_vectorizer_rfc.fit_transform(train_text)
#     tfidf_vector_rfc = tfidf_transformer_rfc.fit_transform(counts_rfc)

    clf_rfc.fit(counts_rfc, targets)
    test_counts_rfc = count_vectorizer_rfc.transform(test_text)
#     test_tfidf_vector_rfc = tfidf_transformer_rfc.transform(test_counts_rfc)
    predictions = clf_rfc.predict(test_counts_rfc)

    # Passing labels= keeps every per-fold result the same shape and class
    # order, even if a fold happens to miss one of the classes.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, average=None, labels=class_labels)
    r = recall_score(test_y, predictions, average=None, labels=class_labels)
    # f1_score equals 2*p*r/(p+r) but yields 0 instead of NaN when p + r == 0.
    scores.append(f1_score(test_y, predictions, average=None, labels=class_labels))
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)
    recall.append(r)

# Accuracy is a percentage; precision/recall/F-score are per-class means over the folds.
print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))


Accuracy: 56.8928571429
10-fold: [58.214285714285715, 54.285714285714285, 56.071428571428569, 63.571428571428569, 56.428571428571431, 54.285714285714285, 55.357142857142854, 55.178571428571431, 58.75, 56.785714285714285]
Precision: [ 0.58125928  0.53674571  0.59161975]
Recall: [ 0.58563363  0.53299897  0.59017593]
Confusion matrix:
[[1154  544  270]
 [ 510 1052  408]
 [ 324  358  980]]
F-Score: [ 0.5829062   0.53454612  0.59050181]


## Support Vector

In [37]:
from sklearn.svm import LinearSVC

# Linear SVM (hinge loss, L2 penalty, one-vs-rest) on raw term counts.
# random_state is pinned (same seed as the MLP cell) so re-runs give the same model.
clf_svc = LinearSVC(C=0.5,loss="hinge",multi_class="ovr",penalty="l2",random_state=42)

# Labels are handled as fixed-width byte strings (at most 6 bytes each).
# Converting once up front lets every fold share the same sorted class list.
all_labels = np.asarray(obamaData['label'].values, dtype="|S6")
class_labels = np.unique(all_labels)

# KFold is the legacy sklearn.cross_validation class imported at the top of the
# notebook: it is built from the sample count and iterated directly.
k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
# Sized from the observed classes rather than hard-coded to 3x3.
confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    test_text = obamaData.iloc[test_indices]['tweet'].values
    targets = all_labels[train_indices]
    test_y = all_labels[test_indices]

    # count_vectorizer is the instance created in the Naive Bayes cell; it is
    # refitted here so the vocabulary comes from the training fold only.
    counts = count_vectorizer.fit_transform(train_text)
    clf_svc.fit(counts, targets)
    predictions = clf_svc.predict(count_vectorizer.transform(test_text))

    # Passing labels= keeps every per-fold result the same shape and class
    # order, even if a fold happens to miss one of the classes.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, average=None, labels=class_labels)
    r = recall_score(test_y, predictions, average=None, labels=class_labels)
    # f1_score equals 2*p*r/(p+r) but yields 0 instead of NaN when p + r == 0.
    scores.append(f1_score(test_y, predictions, average=None, labels=class_labels))
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)
    recall.append(r)

# Accuracy is a percentage; precision/recall/F-score are per-class means over the folds.
print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))


Accuracy: 57.0178571429
10-fold: [57.5, 58.75, 54.821428571428569, 58.75, 56.607142857142854, 55.892857142857146, 59.285714285714285, 56.071428571428569, 56.964285714285715, 55.535714285714285]
Precision: [ 0.60145028  0.53859439  0.5705095 ]
Recall: [ 0.57897286  0.50695298  0.63418008]
Confusion matrix:
[[1141  501  326]
 [ 503  998  469]
 [ 253  355 1054]]
F-Score: [ 0.58951163  0.52140563  0.60006204]


## Voting Classifier - Naive Bayes, Neural Network, Random Forest

In [46]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the Naive Bayes, MLP and random-forest estimators
# configured in the cells above; n_jobs=-1 fits the members in parallel.
clf_voting = VotingClassifier(estimators=[('mnb', clf_nb), ('nn_mlp', clf_nn_MLP), ('rfc', clf_rfc)],
                              voting='soft',n_jobs=-1)

# Labels are handled as fixed-width byte strings (at most 6 bytes each).
# Converting once up front lets every fold share the same sorted class list.
all_labels = np.asarray(obamaData['label'].values, dtype="|S6")
class_labels = np.unique(all_labels)

# KFold is the legacy sklearn.cross_validation class imported at the top of the
# notebook: it is built from the sample count and iterated directly.
k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
# Sized from the observed classes rather than hard-coded to 3x3.
confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    test_text = obamaData.iloc[test_indices]['tweet'].values
    targets = all_labels[train_indices]
    test_y = all_labels[test_indices]

    # count_vectorizer is the instance created in the Naive Bayes cell; it is
    # refitted here so the vocabulary comes from the training fold only.
    counts = count_vectorizer.fit_transform(train_text)
    clf_voting.fit(counts, targets)
    predictions = clf_voting.predict(count_vectorizer.transform(test_text))

    # Passing labels= keeps every per-fold result the same shape and class
    # order, even if a fold happens to miss one of the classes.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, average=None, labels=class_labels)
    r = recall_score(test_y, predictions, average=None, labels=class_labels)
    # f1_score equals 2*p*r/(p+r) but yields 0 instead of NaN when p + r == 0.
    scores.append(f1_score(test_y, predictions, average=None, labels=class_labels))
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)
    recall.append(r)

# Accuracy is a percentage; precision/recall/F-score are per-class means over the folds.
print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))




Accuracy: 60.6071428571
10-fold: [60.892857142857146, 61.25, 58.214285714285715, 62.678571428571431, 60.0, 60.535714285714285, 63.035714285714285, 61.25, 62.142857142857146, 56.071428571428569]
Precision: [ 0.60802957  0.58843891  0.62194957]
Recall: [ 0.67296533  0.51257117  0.63783069]
Confusion matrix:
[[1325  397  246]
 [ 563 1009  398]
 [ 293  309 1060]]
F-Score: [ 0.63844683  0.54692454  0.62928028]


## Dump your classifiers

In [48]:
import pickle

In [50]:
# Persist each classifier to its own pickle file in the working directory.
# Each estimator is saved in whatever state its most recent fit() left it;
# the CountVectorizer objects used to build the features are NOT saved here.
classifiers_to_dump = [
    ("Obama_Multinomial_NB", clf_nb),
    # The MLP variable is spelled clf_nn_MLP where it is created; the previous
    # lower-case clf_nn_mlp is never defined and fails on a fresh kernel.
    ("Obama_Multi-Layer Perceptron", clf_nn_MLP),
    ("Obama_Random_Forest", clf_rfc),
    ("Obama_SVM", clf_svc),
    ("Obama_Weight_Voting", clf_voting),
]
for dump_path, classifier in classifiers_to_dump:
    # The context manager flushes and closes the file even if pickling raises.
    with open(dump_path, "wb") as dump_file:
        pickle.dump(classifier, dump_file)

In [65]:
# Reload the classifiers written by the dump cell above.
# SECURITY: pickle.load executes arbitrary code embedded in the file, so only
# load pickles that this notebook (or another trusted source) produced.
# Each file is opened in a context manager so its handle is closed right away.
with open("Obama_Multinomial_NB", "rb") as pickle_file:
    loaded_nb = pickle.load(pickle_file)
with open("Obama_Multi-Layer Perceptron", "rb") as pickle_file:
    loaded_nn_mlp = pickle.load(pickle_file)
with open("Obama_Random_Forest", "rb") as pickle_file:
    loaded_rfc = pickle.load(pickle_file)
with open("Obama_SVM", "rb") as pickle_file:
    loaded_svc = pickle.load(pickle_file)
with open("Obama_Weight_Voting", "rb") as pickle_file:
    loaded_voting = pickle.load(pickle_file)

In [62]:
# Build full-corpus features for the evaluation cells below.
# NOTE: both the vocabulary and the IDF weights are fitted on every row,
# including rows that the later cross-validation calls hold out.
obamaData_labels = obamaData['label']

# Raw term counts.
count_vect = CountVectorizer()
obamaData_counts = count_vect.fit_transform(obamaData['tweet'])

# TF-IDF reweighting of those counts.
tfidf_transformer = TfidfTransformer()
obamaData_tfidf = tfidf_transformer.fit_transform(obamaData_counts)

## Naive Bayes

In [71]:
# 10-fold evaluation of the reloaded Naive Bayes model on the full-corpus
# count matrix: mean fold score, confusion matrix, per-class precision/recall.
score = cross_val_score(loaded_nb, X=obamaData_counts, y=obamaData_labels, cv=10)
predicted = cross_val_predict(loaded_nb, X=obamaData_counts, y=obamaData_labels, cv=10)

# Metrics below are computed once over the pooled out-of-fold predictions.
confusion = confusion_matrix(obamaData_labels, predicted)
precision = precision_score(obamaData_labels, predicted, average=None)
recall = recall_score(obamaData_labels, predicted, average=None)

for metric in (score.mean(), confusion, precision, recall):
    print(metric)

0.583562608162
[[1367  327  274]
 [ 661  841  468]
 [ 358  244 1060]]
[ 0.5729254   0.59560907  0.58823529]
[ 0.69461382  0.42690355  0.6377858 ]


## Random Forest 

In [72]:
# 10-fold evaluation of the reloaded random forest on the full-corpus
# count matrix: mean fold score, confusion matrix, per-class precision/recall.
score = cross_val_score(loaded_rfc, X=obamaData_counts, y=obamaData_labels, cv=10)
predicted = cross_val_predict(loaded_rfc, X=obamaData_counts, y=obamaData_labels, cv=10)

# Metrics below are computed once over the pooled out-of-fold predictions.
confusion = confusion_matrix(obamaData_labels, predicted)
precision = precision_score(obamaData_labels, predicted, average=None)
recall = recall_score(obamaData_labels, predicted, average=None)

for metric in (score.mean(), confusion, precision, recall):
    print(metric)

0.561436944269
[[1138  518  312]
 [ 529 1015  426]
 [ 303  396  963]]
[ 0.57766497  0.52617937  0.56613757]
[ 0.57825203  0.51522843  0.57942238]


## Neural Network Multi Layer Perceptron

In [None]:
# 10-fold evaluation of the reloaded multi-layer perceptron on the full-corpus
# count matrix: mean fold score, confusion matrix, per-class precision/recall.
score = cross_val_score(loaded_nn_mlp, X=obamaData_counts, y=obamaData_labels, cv=10)
predicted = cross_val_predict(loaded_nn_mlp, X=obamaData_counts, y=obamaData_labels, cv=10)

# Metrics below are computed once over the pooled out-of-fold predictions.
confusion = confusion_matrix(obamaData_labels, predicted)
precision = precision_score(obamaData_labels, predicted, average=None)
recall = recall_score(obamaData_labels, predicted, average=None)

for metric in (score.mean(), confusion, precision, recall):
    print(metric)

## Support Vector Machine

In [66]:
# 10-fold evaluation of the reloaded linear SVM on the full-corpus
# count matrix: mean fold score, confusion matrix, per-class precision/recall.
score = cross_val_score(loaded_svc, X=obamaData_counts, y=obamaData_labels, cv=10)
predicted = cross_val_predict(loaded_svc, X=obamaData_counts, y=obamaData_labels, cv=10)

# Metrics below are computed once over the pooled out-of-fold predictions.
confusion = confusion_matrix(obamaData_labels, predicted)
precision = precision_score(obamaData_labels, predicted, average=None)
recall = recall_score(obamaData_labels, predicted, average=None)

for metric in (score.mean(), confusion, precision, recall):
    print(metric)

0.56927879576


## Voting Classifier - Naive Bayes, Neural Network, Random Forest

In [67]:
# 10-fold evaluation of the reloaded voting ensemble on the full-corpus
# count matrix: mean fold score, confusion matrix, per-class precision/recall.
score = cross_val_score(loaded_voting, X=obamaData_counts, y=obamaData_labels, cv=10)
predicted = cross_val_predict(loaded_voting, X=obamaData_counts, y=obamaData_labels, cv=10)

# Metrics below are computed once over the pooled out-of-fold predictions.
confusion = confusion_matrix(obamaData_labels, predicted)
precision = precision_score(obamaData_labels, predicted, average=None)
recall = recall_score(obamaData_labels, predicted, average=None)

for metric in (score.mean(), confusion, precision, recall):
    print(metric)

0.604811674873


In [34]:
# clfs = {}

# parameters_mlp = {'hidden_layer_sizes' : np.arange(5,12),'solver':['lbfgs'],'activation':['identity','logistic','tanh','relu'],
#                  'max_iter':[1500],'alpha':10.0 ** -np.arange(1,7)}
# clfs['mlpGrid'] = {'clf' : GridSearchCV(MLPClassifier(),parameters_mlp),'name':'MLP with Grid Search'}

# # # 2 SVC 
# # parameters_svc = {'kernel':['linear','poly','sigmoid','rbf'],'gamma':np.linspace(0.0,2.0,num=21),'C':np.linspace(.5,1.5,num=11)}
# # clfs['svcGrid'] = {'clf' : GridSearchCV(SVC(),parameters_svc),'name':'SVC with Grid Search'}

# # 3 Random forest
# parameters_rfc = {'n_estimators':[20,50,100],'max_depth':[4,None],'max_features':[2,4],
#                   'criterion':['gini','entropy'],'min_samples_split': [2, 3, 10],'min_samples_leaf': [1, 3, 10],
#                   'bootstrap':[True,False]}
# clfs['rfcGrid'] = {'clf': GridSearchCV(RandomForestClassifier(),parameters_rfc),'name':'Random Forest with Grid Search'}

# # 4 KNN
# parameters_knn = {'n_neighbors':np.arange(3,12),'weights':['distance','uniform']}
# clfs['knnGrid'] = {'clf': GridSearchCV(KNeighborsClassifier(),parameters_knn),'name':'KNearest Neighbor with Grid Search'}

# # 5 logistic regression
# parameters_lg = {'C':[1],'tol':[0.0001],'solver': ['newton-cg','lbfgs'], 'multi_class': ['multinomial']}
# clfs['lgGrid'] = {'clf': GridSearchCV(LogisticRegression(),parameters_lg),'name':'Logistic Regression with Grid Search'}

In [35]:
# clfs

In [None]:
# for clf in clfs:
#     clfs[clf]['score'] = cross_val_score(clfs[clf]['clf'],obamaData_tfidf,obamaData_labels,cv=10)
#     print(clfs[clf]['name'] + " : %0.4f (+/- %0.4f)" % (clfs[clf]['score'].mean(),clfs[clf]['score'].std()*2))
