In [21]:
import pandas as pd
import nltk
import re
import time
from sklearn import *

In [22]:
def dataClean(raw_tweets):
    clean_tweets = []
    for tweet in raw_tweets:
        tweet = tweet.lower()
        tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',tweet) #remove urls
        tweet = re.sub(r'@\w+', r'', tweet) #removes username
        tweet = re.sub(r'<[^<]+?>', r'', tweet) # removes HTML tags
        tweet = re.sub(r'[<>!#@$:.,%\?-]+', r'', tweet)
        words = tweet.split()
        tweet = ' '.join([w for w in words if not w in nltk.corpus.stopwords.words("english")])
        ps = nltk.stem.PorterStemmer()
        stemmedTweet = [ps.stem(word) for word in tweet.split(" ")]
        stemmedTweet = " ".join(stemmedTweet)
        tweet = str(stemmedTweet)
        tweet = tweet.replace("'", "")
        tweet = tweet.replace("\"","")
        clean_tweets.append(tweet)
    return clean_tweets

In [23]:
# load data from sheets into dataframes

trainingFile = "data/train.xlsx"
df_obama = pd.read_excel(trainingFile,sheet_name='Obama')
df_romney = pd.read_excel(trainingFile,sheet_name='Romney')

In [24]:
df_obama.head()

Unnamed: 0,date,time,Anootated tweet,Class,Your class
0,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,
1,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,
2,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1,
3,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2,
4,2012-10-16 00:00:00,09:50:08-05:00,@Hollivan @hereistheanswer Youre missing the ...,0,


In [25]:
df_obama.count()

date               7186
time               7185
Anootated tweet    7197
Class              7193
Your class            0
dtype: int64

In [26]:
a = pd.Series(df_obama['Class'],dtype='category')

In [27]:
a.cat.categories.tolist()

[-1, 0, 1, 2, '-1', '0', '1', '2', 'irrelevant', 'irrevelant']

In [28]:
# Removing all other categories 
df_obama = df_obama[(df_obama['Class'].isin((1,0,-1)))]

In [29]:
df_obama.count()

date               5467
time               5466
Anootated tweet    5471
Class              5471
Your class            0
dtype: int64

In [35]:
# vectorize with tf-idf
def vectorize(train_tweets,test_tweets):
    vec = feature_extraction.text.TfidfVectorizer(min_df = 0.00125, max_df = 0.7, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= 'word', ngram_range=(1,5),lowercase=True)
    train_vectors = vec.fit_transform(train_tweets)
    test_vectors = vec.transform(test_tweets)
    return train_vectors, test_vectors
    

In [36]:
df_romney= df_romney[(df_romney['Class'].isin((1,0,-1)))]

In [37]:
# creating lists for raw tweets and classes
obama_raw = df_obama['Anootated tweet'].tolist()
obama_class_train = df_obama['Class'].tolist()
romney_raw = df_romney['Anootated tweet'].tolist()
romney_class_train = df_romney['Class'].tolist()

In [38]:
romney_tweets = dataClean(romney_raw) #romney tweets cleaning
obama_tweets = dataClean(obama_raw) #obama tweets cleaning

In [42]:
testingFile = "data/test.xlsx"
df_obama_test = pd.read_excel(testingFile,sheetname='Obama')
df_romney_test = pd.read_excel(testingFile,sheetname='Romney')

#Removing the mixed class and the !!! class

df_obama_test = df_obama_test[(df_obama_test['Class'].isin((1,-1,0)))]
df_romney_test = df_romney_test[(df_romney_test['Class'].isin((1,-1,0)))]

#creating lists for raw tweets and classes

obama_tweets_raw_test = df_obama_test['Anootated tweet']
obama_class_test = df_obama_test['Class']
romney_tweets_raw_test = df_romney_test['Anootated tweet']
romney_class_test = df_romney_test['Class']

obama_tweets_raw_test = obama_tweets_raw_test.tolist()
romney_tweets_raw_test = romney_tweets_raw_test.tolist()
obama_class_train_test = obama_class_test.tolist()
romney_class_train_test = romney_class_test.tolist()

romney_tweets_test = dataClean(romney_tweets_raw_test) #romney tweets cleaning
obama_tweets_test = dataClean(obama_tweets_raw_test) #obama tweets cleaning


In [43]:
obama_tweets_vectors,obama_tweets_vectors_test = vectorize(obama_tweets,obama_tweets_test)
romney_tweets_vectors,romney_tweets_vectors_test = vectorize(romney_tweets,romney_tweets_test)

In [44]:
obama_tweets_vectors

<5471x1521 sparse matrix of type '<class 'numpy.float64'>'
	with 34296 stored elements in Compressed Sparse Row format>

In [45]:
obama_tweets_vectors_test

<1951x1521 sparse matrix of type '<class 'numpy.float64'>'
	with 11979 stored elements in Compressed Sparse Row format>

In [46]:
romney_tweets_vectors

<5648x1434 sparse matrix of type '<class 'numpy.float64'>'
	with 38058 stored elements in Compressed Sparse Row format>

In [47]:
romney_tweets_vectors_test

<1900x1434 sparse matrix of type '<class 'numpy.float64'>'
	with 12123 stored elements in Compressed Sparse Row format>

In [None]:
# def train(estimator):
#     start_time = time.clock()
#     prediction = model_selection.cross_val_predict(estimator, obam)


In [53]:
from imblearn.over_sampling import SMOTE
from scipy import sparse

obama_tweets_vectors = sparse.csr_matrix(obama_tweets_vectors)
sm = SMOTE(random_state=101)
obama_tweets_vectors, obama_class_train = sm.fit_sample(obama_tweets_vectors, obama_class_train)

romney_tweets_vectors = sparse.csr_matrix(romney_tweets_vectors)
sm = SMOTE(random_state=100)
romney_tweets_vectors, romney_class_train = sm.fit_sample(romney_tweets_vectors, romney_class_train)

In [56]:
clfs = []
def computation_test(classification):
    j = 0
    start_time = time.clock()
    clfs.append(classification)
    clf_use = clfs[j]
    
    # for obama
    clf_use.fit(obama_tweets_vectors, obama_class_train)
    preds = clf_use.predict(obama_tweets_vectors_test.toarray())
    accScore = metrics.accuracy_score(obama_class_train_test, preds)
    
    labels = [1,-1]
    precision = metrics.precision_score(obama_class_train_test,preds,average=None,labels=labels)
    recall = metrics.recall_score(obama_class_train_test,preds,average=None,labels=labels)
    f1Score = metrics.f1_score(obama_class_train_test,preds,average=None,labels=labels)
    print(classification);print("Obama: \nOverall Acurracy: ",accScore,"\n")
    
    lbl = ['positive', 'negative']
    for i in range(2):
        print("Precision of %s class: %f" %(lbl[i],precision[i]))
        print("Recall of %s class: %f" %(lbl[i],recall[i]))
        print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
        
    #romney
    #preds = model_selection.cross_val_predict(clf, romney_tweets_vectors, romney_class_train, cv=10)
    clf_use.fit(romney_tweets_vectors, romney_class_train)
    preds = clf_use.predict(romney_tweets_vectors_test.toarray())
    accScore = metrics.accuracy_score(romney_class_train_test,preds)
    labels = [1,-1]
    precision = metrics.precision_score(romney_class_train_test,preds,average=None,labels=labels)
    recall = metrics.recall_score(romney_class_train_test,preds,average=None,labels=labels)
    f1Score = metrics.f1_score(romney_class_train_test,preds,average=None,labels=labels)
    print("Romney:\nOverall Acurracy: ",accScore,"\n")
    lbl = ['positive', 'negative']
    for i in range(2):
        print("Precision of %s class: %f" %(lbl[i],precision[i]))
        print("Recall of %s class: %f" %(lbl[i],recall[i]))
        print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
    end_time = time.clock()
    print("Total time taken: %0.2f seconds \n\n"%(end_time-start_time))
    j=j+1

In [57]:
models = [naive_bayes.BernoulliNB(),svm.SVC(kernel='rbf', gamma=0.58, C=0.81),tree.DecisionTreeClassifier(random_state=0),ensemble.RandomForestClassifier(criterion='entropy', n_jobs = 10),linear_model.LogisticRegression(),linear_model.SGDClassifier(),ensemble.GradientBoostingClassifier()]

j = 0
for each in models:
    computation_test(each)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Obama: 
Overall Acurracy:  0.5335725269092773 

Precision of positive class: 0.528505
Recall of positive class: 0.589347
F1-Score of positive class: 0.557271 

Precision of negative class: 0.572263
Recall of negative class: 0.569767
F1-Score of negative class: 0.571012 

Romney:
Overall Acurracy:  0.5573684210526316 

Precision of positive class: 0.436782
Recall of positive class: 0.690909
F1-Score of positive class: 0.535211 

Precision of negative class: 0.695107
Recall of negative class: 0.577083
F1-Score of negative class: 0.630620 

Total time taken: 0.13 seconds 


SVC(C=0.81, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.58, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Obama: 
Overall Acurracy:  0.5335725269092773 

Precision of positive class: 0.528505
Recall of positive class: 0.589347
F1-Sco

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
