In [2]:
from sklearn.cross_validation import train_test_split,cross_val_score, KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np



In [23]:
# Read the cleaned tweet file and print the row count before and after
# discarding every row that contains a missing value.
# NOTE(review): the frame is called `obamaData` although it is read from the
# Romney file; the name is kept because the following cells refer to it.
obamaData = pd.read_csv("romneyCleanedData.csv")
print(obamaData.shape[0])
obamaData = obamaData.dropna(axis=0, how="any")
print(obamaData.shape[0])

5648
5640


## Multinomial Naive Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on bag-of-words counts, scored with 10-fold
# cross-validation. Per-class precision, recall and F1 are collected for each
# fold and averaged; the confusion matrix is summed over the folds.
classifier = MultinomialNB()
count_vectorizer = CountVectorizer()

# Labels are handled as byte strings (dtype "|S6") everywhere in this cell.
# The sorted set of all classes is passed to every metric so that each fold
# yields arrays of the same length and class order, even if a fold happens to
# contain no example (and no prediction) of some class.
class_labels = np.unique(np.asarray(obamaData['label'].values, dtype="|S6"))
n_classes = len(class_labels)

k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []      # per-fold, per-class F1
accuracy = []    # per-fold accuracy, in percent
precision = []   # per-fold, per-class precision
recall = []      # per-fold, per-class recall
confusion = np.zeros((n_classes, n_classes), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    train_y = obamaData.iloc[train_indices]['label'].values

    test_text = obamaData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(obamaData.iloc[test_indices]['label'].values, dtype="|S6")

    # The vocabulary is learned from the training fold only; the test fold is
    # merely transformed with it.
    counts = count_vectorizer.fit_transform(train_text)
    targets = np.asarray(train_y, dtype="|S6")
    classifier.fit(counts, targets)
    predictions = classifier.predict(count_vectorizer.transform(test_text))

    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, labels=class_labels, average=None)
    r = recall_score(test_y, predictions, labels=class_labels, average=None)
    # f1_score defines the 0/0 case instead of yielding NaN the way the
    # hand-written 2*p*r/(p+r) does when a class has p == r == 0.
    score = f1_score(test_y, predictions, labels=class_labels, average=None)
    scores.append(score)
    accuracy.append((predictions == test_y).sum() * 100 / float(len(test_y)))
    precision.append(p)
    recall.append(r)

print('Accuracy:', np.mean(accuracy))
print('10-fold:', accuracy)
print('Precision:', np.mean(precision, axis=0))
print('Recall:', np.mean(recall, axis=0))
print('Confusion matrix:')
print(confusion)
print('F-Score:', np.mean(scores, axis=0))

Accuracy: 57.8191489362
10-fold: [56.737588652482266, 56.382978723404257, 58.865248226950357, 56.560283687943262, 58.687943262411345, 57.446808510638299, 57.446808510638299, 57.624113475177303, 59.042553191489361, 59.397163120567377]
Precision: [ 0.59904601  0.4883697   0.57072744]
Recall: [ 0.86175101  0.26892462  0.29468381]
Confusion matrix:
[[2491  306   94]
 [1086  452  141]
 [ 582  170  318]]
F-Score: [ 0.70667681  0.34641231  0.38779276]


## Neural Network: Multi-Layer Perceptron

In [25]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on bag-of-words counts, scored with 10-fold
# cross-validation. Uses the `count_vectorizer` object created in an earlier
# cell; it is re-fitted on every training fold below.
clf_nn_MLP = MLPClassifier(
    solver='sgd',
    alpha=0.001,
    hidden_layer_sizes=(50, 10),
    learning_rate='adaptive',
    random_state=42,
    activation='tanh',
)

# Labels are handled as byte strings (dtype "|S6") everywhere in this cell.
# The sorted set of all classes is passed to every metric so that each fold
# yields arrays of the same length and class order, even if a fold happens to
# contain no example (and no prediction) of some class.
class_labels = np.unique(np.asarray(obamaData['label'].values, dtype="|S6"))
n_classes = len(class_labels)

k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []      # per-fold, per-class F1
accuracy = []    # per-fold accuracy, in percent
precision = []   # per-fold, per-class precision
recall = []      # per-fold, per-class recall
confusion = np.zeros((n_classes, n_classes), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    train_y = obamaData.iloc[train_indices]['label'].values

    test_text = obamaData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(obamaData.iloc[test_indices]['label'].values, dtype="|S6")

    # The vocabulary is learned from the training fold only; the test fold is
    # merely transformed with it.
    counts = count_vectorizer.fit_transform(train_text)
    targets = np.asarray(train_y, dtype="|S6")
    clf_nn_MLP.fit(counts, targets)
    predictions = clf_nn_MLP.predict(count_vectorizer.transform(test_text))

    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, labels=class_labels, average=None)
    r = recall_score(test_y, predictions, labels=class_labels, average=None)
    # f1_score defines the 0/0 case instead of yielding NaN the way the
    # hand-written 2*p*r/(p+r) does when a class has p == r == 0.
    score = f1_score(test_y, predictions, labels=class_labels, average=None)
    scores.append(score)
    accuracy.append((predictions == test_y).sum() * 100 / float(len(test_y)))
    precision.append(p)
    recall.append(r)

print('Accuracy:', np.mean(accuracy))
print('10-fold:', accuracy)
print('Precision:', np.mean(precision, axis=0))
print('Recall:', np.mean(recall, axis=0))
print('Confusion matrix:')
print(confusion)
print('F-Score:', np.mean(scores, axis=0))

Accuracy: 58.3510638298
10-fold: [57.978723404255319, 55.851063829787236, 59.397163120567377, 58.51063829787234, 58.865248226950357, 60.106382978723403, 58.156028368794324, 58.865248226950357, 56.914893617021278, 58.865248226950357]
Precision: [ 0.65714026  0.45965937  0.52448887]
Recall: [ 0.73810091  0.42211797  0.41573595]
Confusion matrix:
[[2134  558  199]
 [ 765  710  204]
 [ 350  273  447]]
F-Score: [ 0.69495753  0.43971839  0.46251148]




## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on bag-of-words counts, scored with 10-fold cross-validation.
# Uses the `count_vectorizer` object created in an earlier cell; it is
# re-fitted on every training fold below.
# random_state is fixed so that re-running the cell reproduces the same
# forest (the MLP cell is seeded the same way).
clf_rfc = RandomForestClassifier(
    n_estimators=22,
    class_weight="balanced_subsample",
    random_state=42,
)

# Labels are handled as byte strings (dtype "|S6") everywhere in this cell.
# The sorted set of all classes is passed to every metric so that each fold
# yields arrays of the same length and class order, even if a fold happens to
# contain no example (and no prediction) of some class.
class_labels = np.unique(np.asarray(obamaData['label'].values, dtype="|S6"))
n_classes = len(class_labels)

k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []      # per-fold, per-class F1
accuracy = []    # per-fold accuracy, in percent
precision = []   # per-fold, per-class precision
recall = []      # per-fold, per-class recall
confusion = np.zeros((n_classes, n_classes), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    train_y = obamaData.iloc[train_indices]['label'].values

    test_text = obamaData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(obamaData.iloc[test_indices]['label'].values, dtype="|S6")

    # The vocabulary is learned from the training fold only; the test fold is
    # merely transformed with it.
    counts = count_vectorizer.fit_transform(train_text)
    targets = np.asarray(train_y, dtype="|S6")
    clf_rfc.fit(counts, targets)
    predictions = clf_rfc.predict(count_vectorizer.transform(test_text))

    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, labels=class_labels, average=None)
    r = recall_score(test_y, predictions, labels=class_labels, average=None)
    # f1_score defines the 0/0 case instead of yielding NaN the way the
    # hand-written 2*p*r/(p+r) does when a class has p == r == 0.
    score = f1_score(test_y, predictions, labels=class_labels, average=None)
    scores.append(score)
    accuracy.append((predictions == test_y).sum() * 100 / float(len(test_y)))
    precision.append(p)
    recall.append(r)

print('Accuracy:', np.mean(accuracy))
print('10-fold:', accuracy)
print('Precision:', np.mean(precision, axis=0))
print('Recall:', np.mean(recall, axis=0))
print('Confusion matrix:')
print(confusion)
print('F-Score:', np.mean(scores, axis=0))


Accuracy: 57.2872340426
10-fold: [57.269503546099294, 56.028368794326241, 59.574468085106382, 57.269503546099294, 57.269503546099294, 57.092198581560282, 55.851063829787236, 58.333333333333336, 58.156028368794324, 56.028368794326241]
Precision: [ 0.61085424  0.45061684  0.59155306]
Recall: [ 0.7975384   0.34154242  0.3265924 ]
Confusion matrix:
[[2305  471  115]
 [ 981  574  124]
 [ 489  229  352]]
F-Score: [ 0.69161312  0.38815871  0.42008199]


## Support Vector Machine (LinearSVC)

In [28]:
from sklearn.svm import LinearSVC

# Linear support vector classifier (one-vs-rest, hinge loss, L2 penalty) on
# bag-of-words counts, scored with 10-fold cross-validation. Uses the
# `count_vectorizer` object created in an earlier cell; it is re-fitted on
# every training fold below.
# random_state is fixed so that re-running the cell gives the same fit
# (the MLP cell is seeded the same way).
clf_svc = LinearSVC(
    C=0.5,
    loss="hinge",
    multi_class="ovr",
    penalty="l2",
    random_state=42,
)

# Labels are handled as byte strings (dtype "|S6") everywhere in this cell.
# The sorted set of all classes is passed to every metric so that each fold
# yields arrays of the same length and class order, even if a fold happens to
# contain no example (and no prediction) of some class.
class_labels = np.unique(np.asarray(obamaData['label'].values, dtype="|S6"))
n_classes = len(class_labels)

k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []      # per-fold, per-class F1
accuracy = []    # per-fold accuracy, in percent
precision = []   # per-fold, per-class precision
recall = []      # per-fold, per-class recall
confusion = np.zeros((n_classes, n_classes), dtype=int)
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    train_y = obamaData.iloc[train_indices]['label'].values

    test_text = obamaData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(obamaData.iloc[test_indices]['label'].values, dtype="|S6")

    # The vocabulary is learned from the training fold only; the test fold is
    # merely transformed with it.
    counts = count_vectorizer.fit_transform(train_text)
    targets = np.asarray(train_y, dtype="|S6")
    clf_svc.fit(counts, targets)
    predictions = clf_svc.predict(count_vectorizer.transform(test_text))

    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    p = precision_score(test_y, predictions, labels=class_labels, average=None)
    r = recall_score(test_y, predictions, labels=class_labels, average=None)
    # f1_score defines the 0/0 case instead of yielding NaN the way the
    # hand-written 2*p*r/(p+r) does when a class has p == r == 0.
    score = f1_score(test_y, predictions, labels=class_labels, average=None)
    scores.append(score)
    accuracy.append((predictions == test_y).sum() * 100 / float(len(test_y)))
    precision.append(p)
    recall.append(r)

print('Accuracy:', np.mean(accuracy))
print('10-fold:', accuracy)
print('Precision:', np.mean(precision, axis=0))
print('Recall:', np.mean(recall, axis=0))
print('Confusion matrix:')
print(confusion)
print('F-Score:', np.mean(scores, axis=0))


Accuracy: 56.4893617021
10-fold: [58.156028368794324, 53.01418439716312, 56.560283687943262, 57.446808510638299, 56.205673758865245, 55.673758865248224, 54.432624113475178, 57.092198581560282, 58.687943262411345, 57.624113475177303]
Precision: [ 0.6469665   0.43993353  0.47752713]
Recall: [ 0.72060285  0.38570518  0.42253144]
Confusion matrix:
[[2083  560  248]
 [ 786  648  245]
 [ 352  263  455]]
F-Score: [ 0.6814942   0.41021898  0.44733372]


In [32]:
# Read the "_with_sampling" variant of the cleaned file and display how many
# rows carry each label value.
new_data = pd.read_csv("cleaned files/romneyCleanedData_with_sampling.csv")
new_data["label"].value_counts()

-1    2893
 1    2069
 0    1680
Name: label, dtype: int64