In [1]:
from sklearn.cross_validation import train_test_split,cross_val_score, KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
import pandas as pd
import numpy as np



In [11]:
obamaData = pd.read_csv("romneyCleanedData_with_sampling.csv")
romneyData = pd.read_csv("romneyCleanedData.csv")
print(len(obamaData))
print(len(romneyData))
obamaData.dropna(axis=0,inplace=True)
romneyData.dropna(axis=0,inplace=True)
print(len(obamaData))
print(len(romneyData))

6642
5648
6629
5640


## Multinomial Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
count_vectorizer = CountVectorizer(stop_words='english', max_features=1000, analyzer = 'word', ngram_range = (1,2) )
tfidf_transformer = TfidfTransformer(use_idf=True)


k_fold = KFold(n=len(romneyData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
confusion = np.array([[0, 0, 0],[0, 0, 0], [0, 0, 0]])
for train_indices, test_indices in k_fold:
    train_text = romneyData.iloc[train_indices]['tweet'].values
    train_y = romneyData.iloc[train_indices]['label'].values

    test_text = romneyData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(romneyData.iloc[test_indices]['label'].values, dtype="|S6")

    counts = count_vectorizer.fit_transform(train_text)
    tfidf_vectors = tfidf_transformer.fit_transform(counts)
    targets = np.asarray(train_y, dtype="|S6")
    classifier.fit(tfidf_vectors, targets)
    
    test_counts = count_vectorizer.transform(test_text)
    predictions = classifier.predict(tfidf_transformer.transform(test_counts))

    confusion+= confusion_matrix(test_y, predictions)
    p=precision_score(test_y, predictions, average=None)
    r=recall_score(test_y, predictions, average=None)
    score = 2*p*r/(p+r)
    scores.append(score)
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)  
    recall.append(r)  

print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))

Accuracy: 57.8723404255
10-fold: [58.51063829787234, 55.49645390070922, 57.269503546099294, 56.737588652482266, 59.219858156028366, 59.574468085106382, 56.914893617021278, 58.51063829787234, 58.333333333333336, 58.156028368794324]
Precision: [ 0.58595014  0.50396283  0.6291821 ]
Recall: [ 0.88945615  0.2350478   0.27562808]
Confusion matrix:
[[2571  254   66]
 [1180  395  104]
 [ 637  135  298]]
F-Score: [ 0.70638855  0.3202973   0.38245343]


In [18]:
from sklearn.neural_network import MLPClassifier
clf_nn_MLP = MLPClassifier(solver='sgd', alpha=0.001,hidden_layer_sizes=(50, 10), learning_rate = 'adaptive', random_state=42,activation='tanh')

k_fold = KFold(n=len(obamaData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
confusion = np.array([[0, 0, 0],[0, 0, 0], [0, 0, 0]])
for train_indices, test_indices in k_fold:
    train_text = obamaData.iloc[train_indices]['tweet'].values
    train_y = obamaData.iloc[train_indices]['label'].values

    test_text = obamaData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(obamaData.iloc[test_indices]['label'].values, dtype="|S6")

    counts = count_vectorizer.fit_transform(train_text)
    tfidf_vectors = tfidf_transformer(counts)
    targets = np.asarray(train_y, dtype="|S6")
    clf_nn_MLP.fit(tfidf_vectors, targets)
    
    test_counts = count_vectorizer.transform(test_text)
    predictions = clf_nn_MLP.predict(tfidf_transformer.transform(test_counts))

    confusion+= confusion_matrix(test_y, predictions)
    p=precision_score(test_y, predictions, average=None)
    r=recall_score(test_y, predictions, average=None)
    score = 2*p*r/(p+r)
    scores.append(score)
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)  
    recall.append(r)  

print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))

Accuracy: 64.6100987455
10-fold: [62.594268476621416, 65.460030165912514, 64.25339366515837, 68.929110105580691, 61.085972850678736, 61.990950226244344, 63.951734539969834, 65.309200603318246, 67.571644042232279, 64.954682779456192]
Precision: [ 0.66977097  0.47877125  0.71055946]
Recall: [ 0.69719294  0.37623404  0.79457718]
Confusion matrix:
[[2016  525  350]
 [ 731  632  316]
 [ 264  160 1635]]
F-Score: [ 0.68290638  0.42001473  0.74929254]




In [17]:
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier(n_estimators=22,class_weight="balanced_subsample")

# count_vectorizer_rfc = CountVectorizer(stop_words='english', max_features=500, analyzer = 'word', ngram_range = (1,2) )
# tfidf_transformer_rfc = TfidfTransformer(use_idf=True)


k_fold = KFold(n=len(romneyData), n_folds=10)
scores = []
accuracy = []
precision = []
recall = []
confusion = np.array([[0, 0, 0],[0, 0, 0], [0, 0, 0]])
for train_indices, test_indices in k_fold:
    train_text = romneyData.iloc[train_indices]['tweet'].values
    train_y = romneyData.iloc[train_indices]['label'].values

    test_text = romneyData.iloc[test_indices]['tweet'].values
    test_y = np.asarray(romneyData.iloc[test_indices]['label'].values, dtype="|S6")

    counts_rfc = count_vectorizer.fit_transform(train_text)
    tfidf_vectors_rfc = tfidf_transformer.transform(counts_rfc)
    targets = np.asarray(train_y, dtype="|S6")
    
    test_counts_rfc = count_vectorizer.transform(test_text)
    test_tfidf_vector_rfc = tfidf_transformer.transform(test_counts)
    clf_rfc.fit(tfidf_vectors_rfc, targets)
    predictions = clf_rfc.predict(test_tfidf_vector_rfc)

    confusion+= confusion_matrix(test_y, predictions)
    p=precision_score(test_y, predictions, average=None)
    r=recall_score(test_y, predictions, average=None)
    score = 2*p*r/(p+r)
    scores.append(score)
    accuracy.append((predictions==test_y).sum()*100/float(len(test_y)))
    precision.append(p)  
    recall.append(r)  

print('Accuracy:', sum(accuracy)/len(accuracy))
print('10-fold:', accuracy)
print('Precision:', sum(precision)/len(precision))
print('Recall:', sum(recall)/len(recall))
print('Confusion matrix:')
print(confusion)
print('F-Score:', sum(scores)/len(scores))

Accuracy: 36.0106382979
10-fold: [33.156028368794324, 39.893617021276597, 38.120567375886523, 33.51063829787234, 34.574468085106382, 31.914893617021278, 30.851063829787233, 33.865248226950357, 29.609929078014183, 54.609929078014183]
Precision: [ 0.51206908  0.30779092  0.21015976]
Recall: [ 0.34617797  0.51220918  0.16216677]
Confusion matrix:
[[ 995 1460  436]
 [ 556  860  263]
 [ 361  533  176]]
F-Score: [ 0.39786725  0.37543236  0.18034424]
