Import Necessary Packages

In [10]:
import pandas as pd
import numpy as np
import re
import math
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from scipy import sparse
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

Preprocess Data

In [22]:
def formatTweets(fileLocation):
    dataFrameVals = pd.read_excel(fileLocation)
    p = SnowballStemmer("english")
    stopWords = set(stopwords.words("english"))
    tweetList=dataFrameVals['Tweet']
    classList=dataFrameVals['Class']
    formattedTweets = []
    classes = []
    for record,label in zip(tweetList,classList):
        if(isinstance(label, str) != True):
            if (label !=2 and math.isnan(label) !=True):
                regexHTMLRemove = re.compile(r'<.*?>')
                tempString = regexHTMLRemove.sub(' ',str(record))                
                #Replace all URLS
                tempString = re.sub(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',' ',tempString)
                #get all A to Z
                tempString = re.sub(r"[^A-Za-z ]+", ' ', tempString) 
                listOfWords = tempString.split()
                newList = []
                for word in listOfWords:
                    if word not in stopWords:
                        newList.append(p.stem(word))
                tempString = " ".join(str(x) for x in newList)              
                formattedTweets.append(tempString)
                classes.append(label)              
    return formattedTweets,classes

Enter File Location

In [23]:
fileLocation = r"C:\Users\Shvetha\Desktop\Data Mining\Project\Project 2\Data\OBR.xlsx"
tweets,classLabels=formatTweets(fileLocation)

insidious!<e>mitt romney</e>'s bain helped philip morris get u.s. high schoolers <a>hooked on cigarettes</a> http://t.co/nmkufcuq via @huffpostpol
.@wardbrenda @shortwave8669 @allanbourdius you mean like <e>romney </e><a>cheated in primary</a>?
<e>mitt romney</e> still doesn't <a>believe</a> that we <a>have a black president</a>.
<e>romney</e>'s <a>tax plan</a> deserves a 2nd look because he has a secret one that's different than the one <a>he's been lying about</a>http://t.co/arvfpq7w
hope <e>romney</e> debate prepped w/ the same people as last time.
want to know how <e>mitt romney</e> is going to be able to <a>cut by $5 trillon dollars</a>? go here it explains everything: http://t.co/t8jyt5rh
if <e>romney</e> wins the <a>presidential election</a>, the<a> worlds really ending this year.<a>
<e>romney</e>'s <a>12 million jobs scam </a>reminds me of rip torn selling pennies in the movies. sick puppy. http://t.co/kfaoshrm
<e>mitt #romney</e> <a>said</a> that <a> catching osama bin laden</

Format Data to feed the model

In [24]:
countVector = CountVectorizer()
XTrainCount = countVector.fit_transform(tweets)
XTrainTF = TfidfTransformer(use_idf=False).fit(XTrainCount).transform(XTrainCount)
tfidf_transformer = TfidfTransformer()
XTrainTFIDF = tfidf_transformer.fit_transform(XTrainCount)

Split into 10 Folds and evaluate

In [25]:
kf = KFold(n_splits=10)
sumf_1=0
sumf1=0
sumprec1=0
sumprec_1=0
sumrec1=0
sumrec_1=0
avgerror=0
for train,test in kf.split(XTrainTFIDF):
    predicted=[]
    trainset=[]
    testset=[]
    truevalues=[]
    trainlabels=[]
    sumerror=0
    for i in train:
        trainset.append(XTrainTFIDF[i].toarray()[0])
        trainlabels.append(classLabels[i])
    trainmatrix=sparse.csc_matrix(np.asarray(trainset))
    for i in test:
        testset.append(XTrainTFIDF[i].toarray()[0])
        truevalues.append(classLabels[i])
    testmatrix=sparse.csc_matrix(np.asarray(testset))
    clf = MultinomialNB().fit(np.asarray(trainset),np.asarray(trainlabels))
    predicted = clf.predict(np.asarray(testset))
    for i in range(len(predicted)):
        if(predicted[i]!=truevalues[i]):
            sumerror=sumerror+1
    avgerror=avgerror+float(sumerror)/len(test)
    sumprec_1=sumprec_1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[0][0]
    sumprec1=sumprec1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[0][1]
    sumrec_1=sumrec_1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[1][0]
    sumrec1=sumrec1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[1][1]
    sumf_1=sumf_1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[2][0]
    sumf1=sumf1+precision_recall_fscore_support(predicted,truevalues,labels=[-1,1])[2][1]
print("Average Precision for Class 1",sumprec1/10)
print("Average Recall for Class 1",sumrec1/10)
print("Average Precision for Class -1",sumprec_1/10)
print("Average Recall for Class -1",sumrec_1/10)
print("Average FScore for Class 1",sumf1/10)
print("Average FScore for Class -1",sumf_1/10)
print("Average Error",avgerror/10)

Average Precision for Class 1 0.102733430788
Average Recall for Class 1 0.682321560726
Average Precision for Class -1 0.959222379992
Average Recall for Class -1 0.544518556895
Average FScore for Class 1 0.173522309705
Average FScore for Class -1 0.681363886893
Average Error 0.4594326241134752
