In [9]:
# Importing essential libraries
import pandas as pd
import re
from pandas import DataFrame
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
# NOTE(review): module-level LinearRegression instance created in the import
# cell; nothing in the visible cells references `regressor` -- confirm it is
# still needed before removing it.
regressor = LinearRegression()
from collections import Counter
import matplotlib.pyplot as plt
#Import Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
def lemmatize(word):
    """Return the lemma of ``word`` as produced by ``WordNetLemmatizer``.

    A new lemmatizer instance is created on every call and ``word`` is
    passed to it with no other options.
    """
    return WordNetLemmatizer().lemmatize(word=word)


In [11]:
def stem(word):
    """Return the Porter stem of ``word``.

    Uses NLTK's ``PorterStemmer``. The class is imported inside the function
    because the notebook's import cell does not bring any stemmer into scope,
    so the cell works on a fresh kernel.

    Parameters
    ----------
    word : str
        Token to stem.

    Returns
    -------
    str
        Whatever ``PorterStemmer.stem`` returns for ``word``.
    """
    from nltk.stem import PorterStemmer

    return PorterStemmer().stem(word=word)

In [12]:
def pre(path="Sentiment Analysis Dataset.csv"):
    """Load the sentiment dataset and return cleaned texts with their labels.

    Parameters
    ----------
    path : str, optional
        File to read. It is parsed as tab-separated, latin-1 encoded text.
        Defaults to the file name this notebook has always used.

    Returns
    -------
    tuple
        ``(texts, labels)``: ``texts`` is a list with one cleaned, lemmatized
        string per row of ``SentimentText``; ``labels`` is the ``Sentiment``
        column as a pandas Series.

    Side effects
    ------------
    Prints the per-label row counts.
    """
    df = pd.read_csv(path, sep="\t", index_col=False, encoding='latin-1', low_memory=False)
    print(df.groupby(df['Sentiment']).count())

    y = df['Sentiment']
    # Rows without text are read as float NaN, which has no .split();
    # treat them as empty strings instead of crashing further down.
    x = df['SentimentText'].fillna('').astype(str)
    # regex=True is explicit because recent pandas treats the pattern as a
    # literal string by default. Raw strings avoid invalid-escape warnings,
    # and the dot after "www" is escaped so only real "www." prefixes match.
    x = x.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    # Everything except letters, digits, '-', '_', '*' and '.' becomes a space.
    x = x.str.replace(r'[^a-zA-Z0-9-_*.]', ' ', regex=True)
    # Splitting on a single space (not arbitrary whitespace) keeps runs of
    # spaces intact when the words are joined back together.
    x_check = [" ".join(lemmatize(word) for word in sentence.split(" ")) for sentence in x]
    return x_check, y

In [13]:
def labelEncoding(y):
    """Encode the labels in ``y`` with a freshly created ``LabelEncoder``.

    The encoder is fitted on ``y`` and the transformed labels are returned;
    the encoder itself is not kept.
    """
    return LabelEncoder().fit_transform(y)

In [14]:
def countVectorizer(x):
    """Fit a word-level ``CountVectorizer`` on ``x`` and return the result.

    Parameters
    ----------
    x : iterable of str
        Documents to vectorize.

    Returns
    -------
    The value returned by ``CountVectorizer.fit_transform(x)``. The fitted
    vectorizer is not returned, so it cannot be reused on other documents.
    """
    # NLTK corpus file ids are lowercase; 'English' only resolves on
    # case-insensitive filesystems.
    stop_list = list(stopwords.words('english'))
    vect = CountVectorizer(
        analyzer='word',
        encoding='utf-8',
        # An integer min_df must be >= 1; 1 keeps every term that occurs at
        # all, which is what min_df=0 was asking for.
        min_df=1,
        ngram_range=(1, 1),
        lowercase=True,
        strip_accents='ascii',
        # scikit-learn documents stop_words as 'english', a list or None;
        # recent releases reject a set.
        stop_words=stop_list,
    )
    return vect.fit_transform(x)

In [15]:
def tfidfVectorizer(x):
    """Fit a word-level ``TfidfVectorizer`` on ``x`` and return the result.

    Parameters
    ----------
    x : iterable of str
        Documents to vectorize.

    Returns
    -------
    The value returned by ``TfidfVectorizer.fit_transform(x)``. The fitted
    vectorizer is not returned, so it cannot be reused on other documents.
    """
    # NLTK corpus file ids are lowercase; 'English' only resolves on
    # case-insensitive filesystems.
    stop_list = list(stopwords.words('english'))
    vect = TfidfVectorizer(
        analyzer='word',
        encoding='utf-8',
        # An integer min_df must be >= 1; 1 keeps every term that occurs at
        # all, which is what min_df=0 was asking for.
        min_df=1,
        ngram_range=(1, 1),
        lowercase=True,
        strip_accents='ascii',
        # scikit-learn documents stop_words as 'english', a list or None;
        # recent releases reject a set.
        stop_words=stop_list,
    )
    return vect.fit_transform(x)

In [16]:
def splitTestTrain(X_vec, y_encoded, test_size=0.5, random_state=0):
    """Split features and labels into train and test parts.

    Parameters
    ----------
    X_vec : array-like
        Feature matrix.
    y_encoded : array-like
        Labels aligned with ``X_vec``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.5, the value this
        function previously hard-coded.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 0, as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_vec, y_encoded, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [17]:
def plotPreRec(naiveBayesRecall, naiveBayesPrecision, svmRecall, svmPrecision, randomForestRecall, randomForestPrecision, logisticRegressionRecall, logisticRegressionPrecision, sgdRecall, sgdPrecision):
    """Plot one precision-versus-recall point per classifier.

    Each classifier contributes a single marker at (recall, precision). Both
    axes are fixed to the range 0..1 and the legend lists the classifiers in
    the order MNB, SVM, RF, LR, SGD. The figure is shown immediately.
    """
    # (recall, precision, matplotlib format string), in legend order.
    points = (
        (naiveBayesRecall, naiveBayesPrecision, 'ro'),
        (svmRecall, svmPrecision, 'ms'),
        (randomForestRecall, randomForestPrecision, 'yo'),
        (logisticRegressionRecall, logisticRegressionPrecision, 'go'),
        (sgdRecall, sgdPrecision, 'xb-'),
    )
    for recall, precision, fmt in points:
        plt.plot([recall], [precision], fmt)

    plt.axis([0, 1, 0, 1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall comparison plot')
    plt.legend(['MNB', 'SVM', 'RF', 'LR', 'SGD'], loc='upper left')
    plt.show()

In [18]:
def plotAcuuracyComaprisonGraph(naiveBayesFMeasure, svmFMeasure, randomForestFMeasure, logisticRegressionFMeasure, sgdFMeasure, accuracies=None):
    """Show two bar charts comparing the five classifiers.

    The first chart is titled 'Accuracy Comparison Plot', the second
    'F Measure Comparison Plot'. Bars are ordered MNB, SVC, RF, LR, SGD.

    Parameters
    ----------
    naiveBayesFMeasure, svmFMeasure, randomForestFMeasure,
    logisticRegressionFMeasure, sgdFMeasure : float
        Bar heights for the F-measure chart.
    accuracies : sequence of 5 numbers, optional
        Bar heights for the accuracy chart, in the same classifier order.
        When omitted, the fixed values this function has always plotted are
        used; those values do not depend on the other arguments.

    Raises
    ------
    ValueError
        If ``accuracies`` is given but does not contain exactly five values.
    """
    cl = ('MNB', 'SVC', 'RF', 'LR', 'SGD')
    y_pos = np.arange(len(cl))

    if accuracies is None:
        # Legacy hard-coded numbers, kept as the default so existing calls
        # render exactly the chart they did before.
        accuracies = [77.2682926829, 79.0243902439, 76.8780487805, 80.6829268293, 75.3170731707]
    else:
        accuracies = list(accuracies)
        if len(accuracies) != len(cl):
            raise ValueError("accuracies must contain one value per classifier (%d)" % len(cl))

    f_measures = [naiveBayesFMeasure, svmFMeasure, randomForestFMeasure, logisticRegressionFMeasure, sgdFMeasure]

    for heights, alpha, title in (
        (accuracies, 0.5, 'Accuracy Comparison Plot'),
        (f_measures, 1.0, 'F Measure Comparison Plot'),
    ):
        plt.bar(y_pos, heights, align='center', alpha=alpha)
        plt.xticks(y_pos, cl)
        plt.title(title)
        plt.show()

In [19]:
def applyNaiveBayesClassifier(X_train, y_train, X_test, y_test):
    """Train a Multinomial Naive Bayes model and report its test metrics.

    The classifier is fitted on the training split. The mean and standard
    deviation of its 10-fold ``cross_val_score`` on that split are printed
    as percentages, followed by test accuracy, macro precision, macro recall
    and F measure.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    # Model Training: Multinomial Naive Bayes
    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(X_train, y_train)
    model_accuracies = cross_val_score(estimator=mnb_classifier,
                                       X=X_train, y=y_train, cv=10)
    print("Model Accuracies Mean", model_accuracies.mean()*100)
    print("Model Accuracies Standard Devision", model_accuracies.std()*100)

    # Model Testing: Multinomial Naive Bayes
    y_pred = mnb_classifier.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_mnb = precision_score(y_test, y_pred, average='macro')
    recall_mnb = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_mnb + recall_mnb
    f_mnb = 2*(precision_mnb*recall_mnb)/denominator if denominator else 0.0
    print("Multinomial Naive Bayes Classifier Test Accuracy: ", test_accuracy*100)
    print("Multinomial Naive Bayes Classifier Test Precision: ", precision_mnb*100)
    print("Multinomial Naive Bayes Classifier Test Recall: ", recall_mnb*100)
    print("Multinomial Naive Bayes Classifier Test F measure: ", f_mnb*100)
    return precision_mnb, recall_mnb, f_mnb

In [20]:
def applySVMClassifier(X_train, y_train, X_test, y_test):
    """Train a linear-kernel SVC and report its test metrics.

    The classifier is fitted on the training split. The mean and standard
    deviation of its 10-fold ``cross_val_score`` on that split are printed
    as percentages, followed by test accuracy, macro precision, macro recall
    and F measure.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    # Model Training: SVMs
    svc_classifier = SVC(kernel='linear', random_state=0)
    svc_classifier.fit(X_train, y_train)
    model_accuracies = cross_val_score(estimator=svc_classifier,
                                       X=X_train, y=y_train, cv=10)
    print("Model Accuracies Mean", model_accuracies.mean()*100)
    print("Model Accuracies Standard Devision", model_accuracies.std()*100)

    # Model Testing: SVMs
    y_pred = svc_classifier.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_SVC = precision_score(y_test, y_pred, average='macro')
    recall_SVC = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_SVC + recall_SVC
    f_SVC = 2*(precision_SVC * recall_SVC) / denominator if denominator else 0.0
    print("SVCs Test Accuracy: ", test_accuracy*100)
    print("SVCs Test Precision: ", precision_SVC*100)
    print("SVCs Test Recall: ", recall_SVC*100)
    print("SVCs Test F measure: ", f_SVC*100)
    return precision_SVC, recall_SVC, f_SVC

In [21]:
def applyRandomForestClassifier(X_train, y_train, X_test, y_test):
    """Train a random forest and report its test metrics.

    The forest uses 100 trees, the entropy criterion, balanced class weights
    and ``random_state=1``. The mean and standard deviation of its 5-fold
    ``cross_val_score`` on the training split are printed as percentages,
    followed by test accuracy, macro precision, macro recall and F measure.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    # Model Training: Random Forests Classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, class_weight="balanced",
                                           criterion='entropy', random_state=1)
    rf_classifier.fit(X_train, y_train)
    model_accuracies = cross_val_score(estimator=rf_classifier,
                                       X=X_train, y=y_train, cv=5)
    print("Model Accuracies Mean", model_accuracies.mean()*100)
    print("Model Accuracies Standard Devision", model_accuracies.std()*100)

    # Model Testing: Random Forests Classifier
    y_pred = rf_classifier.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_RF = precision_score(y_test, y_pred, average='macro')
    recall_RF = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_RF + recall_RF
    f_RF = 2*(precision_RF * recall_RF) / denominator if denominator else 0.0
    print("Random Forests Test Accuracy: ", test_accuracy*100)
    print("Random Forests Test Precision: ", precision_RF*100)
    print("Random Forests Test Recall: ", recall_RF*100)
    print("Random Forests Test F measure: ", f_RF*100)
    return precision_RF, recall_RF, f_RF

In [22]:
def applyLogisticRegressionClassifier(X_train, y_train, X_test, y_test):
    """Train an L2-penalised logistic regression and report its test metrics.

    The model (``C=1``) is fitted on the training split; test accuracy, macro
    precision, macro recall and F measure are printed as percentages.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    lr = LogisticRegression(penalty='l2', C=1)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_LR = precision_score(y_test, y_pred, average='macro')
    recall_LR = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_LR + recall_LR
    f_LR = 2*(precision_LR * recall_LR) / denominator if denominator else 0.0
    print("LogisticRegression_classifier Accuracy percent:", test_accuracy*100)
    print("LogisticRegression_classifier Precision percent:", precision_LR*100)
    print("LogisticRegression_classifier Recall percent:", recall_LR*100)
    print("LogisticRegression_classifier F measure:", f_LR*100)
    return precision_LR, recall_LR, f_LR

In [23]:
def applySGDClassifier(X_train, y_train, X_test, y_test):
    """Train an ``SGDClassifier`` and report its test metrics.

    The model is fitted on the training split with a fixed ``random_state``
    so repeated runs give the same numbers, matching the seeded sibling
    classifiers in this notebook. Test accuracy, macro precision, macro
    recall and F measure are printed as percentages.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    SGDClassifier_classifier = SGDClassifier(random_state=0)
    SGDClassifier_classifier.fit(X_train, y_train)

    y_pred = SGDClassifier_classifier.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_SGD = precision_score(y_test, y_pred, average='macro')
    recall_SGD = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_SGD + recall_SGD
    f_SGD = 2*(precision_SGD * recall_SGD) / denominator if denominator else 0.0
    print("SGD_classifier Accuracy percent:", test_accuracy*100)
    print("SGD_classifier Precision percent:", precision_SGD*100)
    print("SGD_classifier Recall percent:", recall_SGD*100)
    # This line reports the F measure, not a recall figure.
    print("SGD_classifier F measure:", f_SGD*100)
    return precision_SGD, recall_SGD, f_SGD

In [24]:
def applyDecisionTreeClassifier(X_train, y_train, X_test, y_test):
    """Train a decision tree (``random_state=0``) and report its test metrics.

    The model is fitted on the training split; test accuracy, macro
    precision, macro recall and F measure are printed as percentages under a
    decision-tree label.

    Returns
    -------
    tuple
        ``(precision, recall, f_measure)`` as fractions. The F measure is the
        harmonic mean of macro precision and macro recall, or 0.0 when both
        are zero.
    """
    Decision_Tree_CLF = DecisionTreeClassifier(random_state=0)
    Decision_Tree_CLF.fit(X_train, y_train)

    y_pred = Decision_Tree_CLF.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    precision_DT = precision_score(y_test, y_pred, average='macro')
    recall_DT = recall_score(y_test, y_pred, average='macro')
    # precision + recall is zero when no class has a correct prediction;
    # report 0.0 instead of dividing by zero.
    denominator = precision_DT + recall_DT
    f_DT = 2*(precision_DT * recall_DT) / denominator if denominator else 0.0
    # Labels name this classifier so its output cannot be mistaken for the
    # SGD results printed by applySGDClassifier.
    print("DecisionTree_classifier Accuracy percent:", test_accuracy*100)
    print("DecisionTree_classifier Precision percent:", precision_DT*100)
    print("DecisionTree_classifier Recall percent:", recall_DT*100)
    print("DecisionTree_classifier F measure:", f_DT*100)
    return precision_DT, recall_DT, f_DT

In [25]:
def resolveOverSampling(X_train, y_train, X_test, y_test):
    """Over-sample both splits with SMOTE and return the resampled data.

    Returns
    -------
    tuple
        ``(x_train_res, y_train_res, x_test_res, y_test_res)``, in the same
        order as the parameters so the result can be passed straight to the
        ``apply*Classifier`` functions. Previously the resampled arrays were
        computed and then discarded.
    """
    # NOTE(review): SMOTE is not imported in the visible cells; presumably it
    # comes from imblearn.over_sampling -- confirm the import exists.
    # The old `ratio=1.0` keyword is no longer accepted by current
    # imbalanced-learn. The default strategy balances the classes, which is
    # what ratio=1.0 requested for a two-class target.
    sm = SMOTE(random_state=0)
    # fit_sample was renamed to fit_resample; use whichever this version has.
    resample = getattr(sm, "fit_resample", None) or sm.fit_sample
    x_train_res, y_train_res = resample(X_train, y_train)
    # NOTE(review): resampling the test split means metrics are computed on
    # synthetic samples; consider returning X_test / y_test unchanged.
    x_test_res, y_test_res = resample(X_test, y_test)
    return x_train_res, y_train_res, x_test_res, y_test_res
    
def resolveUnderSampling(X_train, y_train, X_test, y_test):
    """Under-sample both splits with RandomUnderSampler and return the result.

    Returns
    -------
    tuple
        ``(x_train_res, y_train_res, x_test_res, y_test_res)``, in the same
        order as the parameters so the result can be passed straight to the
        ``apply*Classifier`` functions. Previously the resampled arrays were
        computed and then discarded.
    """
    # NOTE(review): RandomUnderSampler is not imported in the visible cells;
    # presumably it comes from imblearn.under_sampling -- confirm the import
    # exists.
    rus = RandomUnderSampler(random_state=42)
    # fit_sample was renamed to fit_resample; use whichever this version has.
    resample = getattr(rus, "fit_resample", None) or rus.fit_sample
    x_train_res, y_train_res = resample(X_train, y_train)
    # NOTE(review): resampling the test split changes the class mix the
    # metrics are computed on; consider returning X_test / y_test unchanged.
    x_test_res, y_test_res = resample(X_test, y_test)
    return x_train_res, y_train_res, x_test_res, y_test_res

In [26]:
def plotLabels(y):
    """Show a bar chart of how often each encoded label occurs in ``y``.

    ``y`` is encoded with ``labelEncoding``, the encoded values are counted,
    and the counts are drawn with pandas' bar plot before the figure is
    shown.
    """
    label_counts = Counter(labelEncoding(y))
    # Built with the labels as both columns and index, then every row whose
    # label appears in columns[1:] is dropped, leaving the first row only.
    frame = pd.DataFrame(label_counts, index=label_counts.keys())
    frame = frame.drop(frame.columns[1:])
    frame.plot(kind='bar')
    plt.show()