In [1]:
# Render inline matplotlib figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style sheet to every subsequent plot.
plt.style.use('fivethirtyeight')

In [2]:
# Load the tweet data; later cells read its 'Unnamed: 0', 'sentiment' and 'text' columns.
df = pd.read_csv('clean_tweets.csv')

In [3]:
# Remove the leftover 'Unnamed: 0' column (presumably a saved index - confirm).
# errors='ignore' makes the cell safe to re-run once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Recode the sentiment labels: 0 stays 0 and 4 becomes 1. The identity entry
# 1 -> 1 keeps already-recoded rows intact if this cell is executed again;
# without it a re-run would turn every positive label into NaN. Any other
# value still maps to NaN, exactly as before.
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1, 1: 1})

In [4]:
# Drop, in place, every row that contains a missing value - this includes rows
# whose sentiment label was left as NaN by the recoding above.
df.dropna(inplace=True)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Predictor is the tweet text alone; target is the recoded sentiment label.
x, y = df['text'], df['sentiment']

# First cut: hold out 2% of the rows, leaving 98% for training.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=0.02, random_state=42
)

# Second cut: divide the held-out rows evenly into validation and test sets.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=0.5, random_state=42
)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time

In [8]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
null_accuracy = 0  # module-level placeholder; accuracy_summary computes and uses its own local value
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, evaluate it on a held-out split and print a report.

    The model accuracy is compared against the *null accuracy*: the accuracy
    of always predicting the most frequent class found in ``y_test``. The
    baseline is computed from the label counts, so it is correct for any
    label values and any number of classes (not only labels 0/1).

    Parameters
    ----------
    pipeline : estimator
        Object whose ``fit(x, y)`` returns the fitted estimator and which
        exposes ``predict(x)``.
    x_train, y_train
        Training inputs and labels passed to ``pipeline.fit``.
    x_test, y_test
        Evaluation inputs and their true labels.

    Returns
    -------
    tuple
        ``(accuracy, train_test_time)`` - the accuracy on ``x_test`` and the
        wall-clock seconds spent on fitting plus predicting.

    Raises
    ------
    ValueError
        If ``y_test`` contains no labels, in which case no baseline exists.
    """
    true_labels = np.asarray(y_test)
    if true_labels.size == 0:
        raise ValueError("y_test is empty; cannot compute null accuracy or evaluate the model")

    # Share of the most frequent class = accuracy of a constant majority-class guess.
    _, class_counts = np.unique(true_labels, return_counts=True)
    null_accuracy = float(class_counts.max() / class_counts.sum())

    # Time fitting and prediction together, as reported in the summary line.
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0

    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print("Null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("Accuracy: {0:.2f}%".format(accuracy*100))
    # The differences below are in percentage points.
    if accuracy>null_accuracy:
        print("Model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy==null_accuracy:
        print("Model has the same accuracy as null accuracy")
    else:
        print("Model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("Train and test time: {0:.2f}s".format(train_test_time))
    print("-"*50)
    print(report)
    return accuracy, train_test_time

In [10]:
# Shared default vectorizer and classifier for the helper functions defined below.
cvec, lr = CountVectorizer(), LogisticRegression()
# Vocabulary sizes to sweep: 10,000 up to and including 100,000, in steps of 10,000.
n_features = np.arange(10000, 100001, 10000)

In [11]:
def nfeature_accuracy_checker(vectorizer = cvec, n_features = n_features, stop_words = None, 
                              ngram_range = (1,1), classifier = lr):
    """Measure validation accuracy for a range of vocabulary sizes.

    For each value in ``n_features`` the vectorizer is reconfigured in place
    (``stop_words``, ``max_features``, ``ngram_range``), chained with
    ``classifier`` in a Pipeline, and scored with ``accuracy_summary`` using
    the notebook-level ``x_train`` / ``y_train`` / ``x_validation`` /
    ``y_validation`` splits.

    Returns a list of ``(n, accuracy, seconds)`` tuples, one per vocabulary
    size, in the order given.
    """
    print(classifier, "\n")

    def _score_for(size):
        # Reconfigure the shared vectorizer, then fit and evaluate one pipeline.
        vectorizer.set_params(stop_words=stop_words, max_features=size, ngram_range=ngram_range)
        pipe = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])
        print("Validation result for {0} features".format(size))
        score, elapsed = accuracy_summary(pipe, x_train, y_train, x_validation, y_validation)
        return (size, score, elapsed)

    return [_score_for(size) for size in n_features]

In [12]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

In [13]:
# Each display label is written next to the estimator it describes and the
# pairs are then unzipped into the two parallel lists used by later cells.
names, classifiers = map(list, zip(*[
    ('Ridge Classifier', RidgeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Perceptron', Perceptron()),
    ('Passive-Agressive Classifier', PassiveAggressiveClassifier()),
    ('Stochastic Gradient Descent', SGDClassifier()),
    ('LinearSVC', LinearSVC()),
    # L1-penalised LinearSVC selects features, an L2-penalised LinearSVC classifies.
    ('L1 based LinearSVC', Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty='l1', dual=False))),
        ('classification', LinearSVC(penalty='l2')),
    ])),
    ('KNN', KNeighborsClassifier()),
    ('Nearest Centroid', NearestCentroid()),
    ('Multinomial NB', MultinomialNB()),
    ('Bernoulli NB', BernoulliNB()),
    ('Adaboost', AdaBoostClassifier()),
]))

In [14]:
# Materialise the (name, classifier) pairs. A bare zip object is a one-shot
# iterator: once a function has looped over it, every later loop sees nothing.
zipped_clf = list(zip(names, classifiers))

In [15]:
def classifier_comparator(vectorizer = cvec, n_features=10000, stop_words=None, ngram_range=(1,1), classifier=zipped_clf):
    """Fit and score several classifiers on the same bag-of-words features.

    Parameters
    ----------
    vectorizer
        Vectorizer reconfigured in place with ``stop_words``, ``ngram_range``
        and ``max_features=n_features`` before any model is fitted.
    n_features, stop_words, ngram_range
        Settings forwarded to ``vectorizer.set_params``.
    classifier
        Iterable of ``(name, estimator)`` pairs. It is copied into a list
        first, so a one-shot iterable (for example a ``zip`` object) is
        consumed by this call.

    Returns
    -------
    list
        ``(name, accuracy, seconds)`` tuples, one per pair, in input order.
        Each model is scored with ``accuracy_summary`` on the notebook-level
        ``x_train`` / ``y_train`` / ``x_validation`` / ``y_validation`` splits.
    """
    import warnings

    # Copy the pairs up front so that an already-exhausted iterator is noticed
    # instead of silently producing an empty result.
    named_classifiers = list(classifier)
    if not named_classifiers:
        warnings.warn(
            "classifier_comparator received no (name, classifier) pairs; if a zip "
            "object was passed (or is the default) it may already have been "
            "consumed by an earlier call",
            RuntimeWarning,
            stacklevel=2,
        )

    result = []
    vectorizer.set_params(stop_words=stop_words, ngram_range=ngram_range, max_features=n_features)
    for n, c in named_classifiers:
        pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', c)])
        print('Validation result for {}'.format(n), c)
        clf_accuracy, ttime = accuracy_summary(pipeline, x_train, y_train, x_validation, y_validation)

        result.append((n, clf_accuracy, ttime))
    return result

In [16]:
# Compare all classifiers on unigram-to-trigram counts capped at 80,000 features.
# NOTE: slow - the per-classifier timings recorded below add up to a little over two hours.
trgram_comparison = classifier_comparator(n_features=80000, ngram_range=(1,3))

Validation result for Ridge Classifier RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)
Null accuracy: 50.18%
Accuracy: 81.86%
Model is 31.67% more accurate than null accuracy
Train and test time: 580.60s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      8013
           1       0.81      0.84      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for Logistic Regression LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   rando



Null accuracy: 50.18%
Accuracy: 82.38%
Model is 32.19% more accurate than null accuracy
Train and test time: 611.43s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      8013
           1       0.82      0.83      0.83      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for Perceptron Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
Null accuracy: 50.18%
Accuracy: 75.76%
Model is 25.57% more accurate than null accuracy
Train and test time: 750.20s
--------------------------------------------------
              prec



Null accuracy: 50.18%
Accuracy: 82.06%
Model is 31.88% more accurate than null accuracy
Train and test time: 827.04s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      8013
           1       0.81      0.84      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for L1 based LinearSVC Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss='squared_hinge',
                                      



Null accuracy: 50.18%
Accuracy: 82.14%
Model is 31.95% more accurate than null accuracy
Train and test time: 1220.14s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      8013
           1       0.81      0.84      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for KNN KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Null accuracy: 50.18%
Accuracy: 71.87%
Model is 21.69% more accurate than null accuracy
Train and test time: 1968.04s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.75      0.66      0.70      8013
           1 

In [17]:
# Display the collected (classifier name, validation accuracy, seconds) tuples.
trgram_comparison

[('Ridge Classifier', 0.8185632867789816, 580.59814620018),
 ('Logistic Regression', 0.8237615081104779, 611.4296820163727),
 ('Perceptron', 0.757562472599737, 750.1953251361847),
 ('Passive-Agressive Classifier', 0.7593160894344586, 186.06019163131714),
 ('Stochastic Gradient Descent', 0.8138660988288344, 183.67948579788208),
 ('LinearSVC', 0.8206300494770464, 827.0392413139343),
 ('L1 based LinearSVC', 0.82138159954907, 1220.1411337852478),
 ('KNN', 0.7187323855451869, 1968.039529800415),
 ('Nearest Centroid', 0.6378154944573182, 287.850460767746),
 ('Multinomial NB', 0.7973319972443164, 251.4556438922882),
 ('Bernoulli NB', 0.7937621344022046, 259.98261547088623),
 ('Adaboost', 0.7023235423060061, 540.1976461410522)]

In [19]:
# Tabulate the comparison results, one row per classifier.
# NOTE(review): this rebinds `df`, which until this cell held the tweet data;
# re-running earlier cells that read df['text'] / df['sentiment'] afterwards
# will fail. Consider a distinct name for the results table.
labels = ['Classifier', 'Accuracy', 'Time']
df = pd.DataFrame(trgram_comparison, columns = labels)

In [20]:
# Save the results table; with to_csv defaults the row index is written as an
# extra, unnamed first column.
df.to_csv('trigramwithcountvectoriser.csv')