# Load Data

Import Libraries

In [1]:
#Code : Imports
import pandas as pd
from pandas.io.json import json_normalize
import zipfile
import json
from nltk.tokenize import TweetTokenizer
import warnings; warnings.simplefilter('ignore')


In [2]:
#Code to Load Tweet Data in a DataFrame
datafolder ='Data/'
jsonfolder='/JSON/'
zippedfilepath = 'JSON.zip'
if zippedfilepath:
    zippedFolder = zipfile.ZipFile(datafolder + zippedfilepath, 'r')
    tweetjsonfiles = zippedFolder.infolist()
else:
    tweetjsonfiles = os.listdir(datafolder + jsonfolder)

tweetsDF = pd.DataFrame()
for tweetfile in tweetjsonfiles:
    if zippedfilepath:
        currjson = json.loads(zippedFolder.open(tweetfile).read())    
    else:
        currjson = json.loads(open(datafolder + jsonfolder + tweetfile).read())    
    currtweetDF = json_normalize(currjson)
    tweetsDF = tweetsDF.append(currtweetDF)

tweetsDF.index = range(len(tweetsDF.index))


Find Sentiment of Each Tweet Based on Number of Likes

In [3]:
#Code to set Sentiment Class for each tweet
def getClass(fav):
    if fav<=4:
        return "NEG"
    elif fav>10:
        return "POS"
    else:
        return "NEU"
tweetsDF['sentiment'] = tweetsDF['favorite_count'].apply(getClass)

# Extract Features and Labels for Text Classifcation

Get Tweets in English Language

In [4]:
#Code: Filter Languages
englishfilter = tweetsDF['lang']=='en'
eng_tweets = tweetsDF[englishfilter]


Functions for extracting features from text

In [5]:
#Helper Functions for Text Feature Extraction

def ProcessTextNormal(tweet_text):
    tokens = tweet_text.replace('\n','').split()
    txt_features = []
    for t in tokens:
        tfinal = t.lower()
        
        #RemoveURLS and Username
        if tfinal[0]=='@':
            pass
        else:
            #Handle HTS
            if tfinal[0]=='#':
                tfinal = tfinal[1:]
            txt_features.append(tfinal)
    return txt_features

def tokenize_tweet(tweet_text):
    tknzr = TweetTokenizer(strip_handles=True, preserve_case= False, reduce_len=True)
    tweet_tokens = tknzr.tokenize(tweet_text)
    #print(tweet_tokens)
    temptokens= []  
    for t in tweet_tokens:
        if len(t)==1 or t[0:5]=='https':
            pass
        elif t[0]=='#':
            tf= t[1:]
            temptokens.append(tf)
        else:
            temptokens.append(t)
    return " ".join(temptokens)
 
    
def tokenize_tweets_tweets(tweet_texts):
    tknzr = TweetTokenizer(strip_handles=True, preserve_case= False, reduce_len=True)
    tweet_tokens = [tknzr.tokenize(tweet_txt) for tweet_txt in tweet_texts]
    tweet_tokens_final = []
    for tokens in tweet_tokens:
        temptokens =[]
        for t in tokens:
            if len(t)==1 or t[0:5]=='https':
                pass
            elif t[0]=='#':
                tf= " "+ t[1:]
                temptokens.append(tf)
            else:
                temptokens.append(t)
        tweet_tokens_final.append(temptokens)
    return tweet_tokens_final


# TestCode
# X =candidate_data['full_text'].tolist()[0:1]
# X
# tokenize_tweets_tweets(X)

In [6]:
#Code : Extract Text Features for POS and Neg Classes
tknzr = TweetTokenizer()
normal_features = ['full_text','sentiment']
candidate_data = eng_tweets[normal_features]
classfilter = candidate_data.sentiment.isin(['POS','NEG'])
candidate_data = candidate_data[classfilter]
candidate_data.index = range(len(candidate_data.index))
candidate_data['text_features'] = candidate_data['full_text'].apply(tokenize_tweet)
#candidate_data.info()

Experimental Evaluation 

In [7]:
# Import Classifiers and Helper Librries to Evaluate
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import classification_report as ClfRep
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

classifiers = {
                'MNB' :  {'obj': MultinomialNB(), 'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0}  ,        
                'NN' :  {'obj': MLPClassifier(), 'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0} ,         
                'SGD' :  {'obj': SGDClassifier(loss='hinge', penalty='l2',\
                                 alpha=1e-3, random_state=42, max_iter=5, tol=None),\
                                'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0}   ,       
                'LogReg' :  {'obj': LogisticRegression(random_state=0), 'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0}  ,        
                'SVC' :  {'obj': LinearSVC(), 'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0}  ,        
                 'RF' :  {'obj':  RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), 'accuracy':0, 'prec':0, 'rec':0, 'fmeasure':0}      
              
              }

In [8]:
# Code to Run Experiments
def Evaluate(clfIN, X_train, X_test, y_train, y_test,labels, vectorizer):
    model_pipeline = Pipeline([
                                ('vectorizer', vectorizer),
                                ('clf', clfIN)
                            ])
    trained_model = model_pipeline.fit(X_train.values,y_train)
    pred = trained_model.predict(X_test.values)
    return accuracy_score(y_test,pred), precision_score(y_test,pred),\
                            recall_score(y_test,pred),f1_score(y_test,pred)

def RunExp(classifiers, Xcol, labelsIN, vectorizer):
    labels = LabelEncoder()
    X =  Xcol
    y =  labels.fit_transform(labelsIN)
    kfolds = KFold(n_splits=10) 
    
    
    
    for clfkey,valkey in classifiers.items():
        accuracy = 0
        precision = 0
        recall = 0 
        fmsr = 0
        folds = 1
        for train_idx, test_idx in kfolds.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            acc, prec, rec, fmeasure = Evaluate(valkey['obj'], X_train, X_test, y_train, y_test,labels, vectorizer)
            accuracy += acc
            precision += prec
            recall += rec
            fmsr += fmeasure
            folds+=1

        valkey['accuracy']= float(accuracy/folds)
        valkey['prec']= float(precision/folds)
        valkey['rec']= float(recall/folds)
        valkey['fmeasure']= float(fmsr/folds)

        print([clfkey, valkey['prec'],valkey['rec'],valkey['fmeasure'],valkey['accuracy']])


    
    

In [9]:
RunExp(classifiers, candidate_data['text_features'], candidate_data['sentiment'], CountVectorizer())

['MNB', 0.39696969696969703, 0.08454078601137426, 0.13024986709197234, 0.7641261691894603]
['NN', 0.343939393939394, 0.10192046515575925, 0.14235865144956053, 0.7560414269275029]
['SGD', 0.41505312868949235, 0.1337832102537985, 0.18597989163830642, 0.7397834233277272]
['LogReg', 0.3939393939393939, 0.07384559884559884, 0.12055167055167054, 0.7607034315895075]
['SVC', 0.34632034632034636, 0.0968699601052542, 0.1437721369539551, 0.7468649493965951]
['RF', 0.0, 0.0, 0.0, 0.762945914844649]


# Improve Accuracy

Used different Vectorizers

In [10]:
# See Results on TFIDF Vectorizer
RunExp(classifiers, candidate_data['text_features'], candidate_data['sentiment'], TfidfVectorizer())

['MNB', 0.18181818181818182, 0.018668831168831168, 0.03342245989304813, 0.7652769171756514]
['NN', 0.3075757575757576, 0.11517804091333503, 0.15621146530237437, 0.7525596766103095]
['SGD', 0.45454545454545453, 0.06816378066378066, 0.11358774914924648, 0.7699241686583459]
['LogReg', 0.0, 0.0, 0.0, 0.762945914844649]
['SVC', 0.36363636363636365, 0.07384559884559884, 0.11919191919191921, 0.7676226726859638]
['RF', 0.0, 0.0, 0.0, 0.762945914844649]


In [11]:
# # W2V
# import gensim
# model = gensim.models.Word2Vec(candidate_data['text_features'].values, size=100)
# w2v = dict(zip(model.wv.index2word, model.wv.syn0))

# # Code taken from http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
# class MeanEmbeddingVectorizer(object):
#     def __init__(self, word2vec):
#         self.word2vec = word2vec
#         # if a text is empty we should return a vector of zeros
#         # with the same dimensionality as all the other vectors
#         self.dim = len(word2vec.itervalues().next())

#     def fit(self, X, y):
#         return self

#     def transform(self, X):
#         return np.array([
#             np.mean([self.word2vec[w] for w in words if w in self.word2vec]
#                     or [np.zeros(self.dim)], axis=0)
#             for words in X
#         ])


    
# RunExp(classifiers, candidate_data['text_features'], candidate_data['sentiment'], MeanEmbeddingVectorizer(w2v))

Features Add

In [12]:
def PreprocessWord(word):
    pass    

#candidate_data['text_features'].tolist()


In [13]:
#pop_features = ['retweet_count', 'favorite_count']
#user_features = ['user.verified' ,'user.friends_count','user.followers_count','user.listed_count']
