In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'
# Apply the 'fivethirtyeight' matplotlib style sheet to all subsequent plots.
plt.style.use('fivethirtyeight')

In [3]:
# Load the tweet dataset from the working directory.
# NOTE(review): presumably the output of an earlier cleaning step; later cells
# expect 'Unnamed: 0', 'text' and 'sentiment' columns.
df = pd.read_csv('clean_tweets.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index column written by an
# earlier `to_csv`; confirm). `errors='ignore'` makes the cell safe to re-run
# after the column has already been removed.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Recode the sentiment label from {0, 4} to {0, 1}. Including 1 -> 1 keeps the
# mapping idempotent: with the bare {0: 0, 4: 1} mapping, re-running this cell
# turned every already-recoded positive label into NaN, and the following
# dropna() then silently discarded all positive tweets.
# Any other label value still becomes NaN, exactly as before.
df['sentiment'] = df['sentiment'].map({0: 0, 1: 1, 4: 1})

In [5]:
# Discard, in place, every row that has a missing value in any column
# (this includes rows whose sentiment label was not covered by the mapping above).
df.dropna(inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Model input is the tweet text only; the target is the sentiment label.
x = df['text']
y = df['sentiment']

# First hold out 2% of the rows, keeping the remaining 98% for training.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y,
    test_size=0.02,
    random_state=42,
)

# Then cut the hold-out in half: one half for validation, the other for the final test.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test,
    test_size=0.5,
    random_state=42,
)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import FeatureHasher
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
# Module-level placeholder kept for backward compatibility; accuracy_summary
# computes its own local `null_accuracy` and never reads or writes this one.
null_accuracy = 0
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, evaluate it on a held-out set and print a short report.

    The report shows the null accuracy (the score obtained by always
    predicting the most frequent class of ``y_test``), the model accuracy,
    the difference between the two, the elapsed fit + predict time and the
    scikit-learn classification report.

    Parameters
    ----------
    pipeline : estimator exposing ``fit(X, y)`` and ``predict(X)``
    x_train, y_train : training inputs and labels, passed to ``pipeline.fit``
    x_test, y_test : evaluation inputs and labels

    Returns
    -------
    (accuracy, train_test_time) : tuple
        Accuracy on ``x_test`` and the wall-clock seconds spent fitting and
        predicting.

    Raises
    ------
    ValueError
        If ``y_test`` is empty, since no accuracy can be computed.
    """
    if len(y_test) == 0:
        raise ValueError("accuracy_summary needs a non-empty test set")

    # Null accuracy = share of the majority class. Counting every label (rather
    # than only label 0) gives the same number for 0/1 targets and stays correct
    # for other label encodings or more than two classes.
    _, class_counts = np.unique(np.asarray(y_test), return_counts=True)
    null_accuracy = class_counts.max() / class_counts.sum()

    # Time fitting and prediction together.
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0

    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print("Null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("Accuracy: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print("Model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print("Model has the same accuracy as null accuracy")
    else:
        print("Model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("Train and test time: {0:.2f}s".format(train_test_time))
    print("-"*50)
    print(report)
    return accuracy, train_test_time

In [10]:
# Candidate vocabulary sizes: 10,000 up to 100,000 in steps of 10,000.
n_features = np.arange(1, 11) * 10000

# Default vectorizer and classifier used by the feature-count sweep.
tfidf = TfidfVectorizer()
lr = LogisticRegression()

In [11]:
def nfeature_accuracy_checker(vectorizer = tfidf, n_features = n_features, stop_words = None, 
                              ngram_range = (1,1), classifier = lr):
    """Sweep over vocabulary sizes and score the classifier at each one.

    For every value in ``n_features`` the vectorizer is reconfigured in place
    (``stop_words``, ``max_features``, ``ngram_range``), chained with
    ``classifier`` in a Pipeline, fitted on the global training split and
    scored on the global validation split via ``accuracy_summary``.

    Returns a list of ``(max_features, accuracy, train_test_time)`` tuples,
    one per swept value and in the same order.
    """
    print(classifier, "\n")
    sweep_scores = []
    for feature_cap in n_features:
        # set_params mutates the vectorizer that was passed in (or the shared default).
        vectorizer.set_params(stop_words=stop_words,
                              max_features=feature_cap,
                              ngram_range=ngram_range)
        steps = [('vectorizer', vectorizer), ('classifier', classifier)]
        checker_pipeline = Pipeline(steps)
        print("Validation result for {0} features".format(feature_cap))
        score, elapsed = accuracy_summary(checker_pipeline,
                                          x_train, y_train,
                                          x_validation, y_validation)
        sweep_scores.append((feature_cap, score, elapsed))
    return sweep_scores

In [12]:
%%time
# Run the feature-count sweep with every default: TF-IDF unigrams, no stop-word
# removal, logistic regression. `%%time` reports the wall time of the whole cell.
print("RESULT FOR UNIGRAM\n")
feature_result_ug_t = nfeature_accuracy_checker()

RESULT FOR UNIGRAM

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 

Validation result for 10000 features




Null accuracy: 50.18%
Accuracy: 79.58%
Model is 29.39% more accurate than null accuracy
Train and test time: 84.98s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      8013
           1       0.79      0.80      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 20000 features




Null accuracy: 50.18%
Accuracy: 79.86%
Model is 29.68% more accurate than null accuracy
Train and test time: 83.25s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      8013
           1       0.79      0.80      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 30000 features




Null accuracy: 50.18%
Accuracy: 79.90%
Model is 29.72% more accurate than null accuracy
Train and test time: 98.99s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      8013
           1       0.79      0.80      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 40000 features




Null accuracy: 50.18%
Accuracy: 79.99%
Model is 29.81% more accurate than null accuracy
Train and test time: 101.29s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 50000 features




Null accuracy: 50.18%
Accuracy: 80.05%
Model is 29.86% more accurate than null accuracy
Train and test time: 93.24s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 60000 features




Null accuracy: 50.18%
Accuracy: 80.03%
Model is 29.85% more accurate than null accuracy
Train and test time: 96.67s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 70000 features




Null accuracy: 50.18%
Accuracy: 80.06%
Model is 29.87% more accurate than null accuracy
Train and test time: 108.02s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 80000 features




Null accuracy: 50.18%
Accuracy: 80.04%
Model is 29.86% more accurate than null accuracy
Train and test time: 99.93s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 90000 features




Null accuracy: 50.18%
Accuracy: 80.11%
Model is 29.92% more accurate than null accuracy
Train and test time: 99.97s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Validation result for 100000 features




Null accuracy: 50.18%
Accuracy: 80.08%
Model is 29.89% more accurate than null accuracy
Train and test time: 112.20s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      8013
           1       0.80      0.81      0.80      7954

    accuracy                           0.80     15967
   macro avg       0.80      0.80      0.80     15967
weighted avg       0.80      0.80      0.80     15967

Wall time: 16min 18s


In [13]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

In [14]:
# Each display label is declared next to its estimator so the two cannot drift
# out of step; `names` and `classifiers` are then derived as parallel lists.
_labelled_classifiers = [
    ('Ridge Classifier', RidgeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Perceptron', Perceptron()),
    ('Passive-Agressive Classifier', PassiveAggressiveClassifier()),
    ('Stochastic Gradient Descent', SGDClassifier()),
    ('LinearSVC', LinearSVC()),
    # L1-penalised LinearSVC selects features, then an L2 LinearSVC classifies.
    ('L1 based LinearSVC', Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty='l1', dual=False))),
        ('classification', LinearSVC(penalty='l2'))
    ])),
    ('KNN', KNeighborsClassifier()),
    ('Nearest Centroid', NearestCentroid()),
    ('Multinomial NB', MultinomialNB()),
    ('Bernoulli NB', BernoulliNB()),
    ('Adaboost', AdaBoostClassifier()),
]
names = [label for label, _estimator in _labelled_classifiers]
classifiers = [_estimator for label, _estimator in _labelled_classifiers]

In [15]:
# Pair each label with its estimator. The pairs are stored in a list because a
# bare zip() is a one-shot iterator in Python 3: after a single pass it is empty,
# so any second use (e.g. a second comparison run) would see no classifiers.
zipped_clf = list(zip(names, classifiers))
# Fresh vectorizer for the classifier comparison.
tfidf = TfidfVectorizer()

In [16]:
def classifier_comparator(vectorizer = tfidf, n_features=10000, stop_words=None, ngram_range=(1,1), classifier=None):
    """Fit and score several classifiers on the same vectorized features.

    Parameters
    ----------
    vectorizer : text vectorizer; reconfigured in place through ``set_params``
    n_features : value used for the vectorizer's ``max_features``
    stop_words, ngram_range : forwarded to the vectorizer
    classifier : iterable of ``(name, estimator)`` pairs, or ``None`` (default)
        to use every entry of the module-level ``names`` / ``classifiers`` lists.

    Returns
    -------
    list of ``(name, accuracy, train_test_time)`` tuples, one per classifier,
    as measured by ``accuracy_summary`` on the global validation split.
    """
    # The previous default was a zip object created once at definition time.
    # A zip can only be iterated once, so every call after the first silently
    # compared nothing and returned []. Build the pairing per call instead.
    if classifier is None:
        classifier = zip(names, classifiers)
    # Materialise so a caller-supplied one-shot iterator is read exactly once.
    named_classifiers = list(classifier)

    result = []
    vectorizer.set_params(stop_words=stop_words, ngram_range=ngram_range, max_features=n_features)
    for n, c in named_classifiers:
        pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', c)])
        print('Validation result for {}'.format(n), c)
        clf_accuracy, ttime = accuracy_summary(pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n, clf_accuracy, ttime))
    return result

In [17]:
# Compare the default classifier list on TF-IDF features built from unigrams and
# bigrams (ngram_range=(1,2)), with the vocabulary capped at 90,000 terms.
bigram_comparison = classifier_comparator(n_features=90000, ngram_range=(1,2))

Validation result for Ridge Classifier RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)
Null accuracy: 50.18%
Accuracy: 82.29%
Model is 32.10% more accurate than null accuracy
Train and test time: 177.09s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      8013
           1       0.81      0.84      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for Logistic Regression LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   rando



Null accuracy: 50.18%
Accuracy: 82.43%
Model is 32.24% more accurate than null accuracy
Train and test time: 178.62s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      8013
           1       0.82      0.83      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for Perceptron Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
Null accuracy: 50.18%
Accuracy: 76.39%
Model is 26.20% more accurate than null accuracy
Train and test time: 113.19s
--------------------------------------------------
              prec



Null accuracy: 50.18%
Accuracy: 82.26%
Model is 32.08% more accurate than null accuracy
Train and test time: 838.67s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      8013
           1       0.83      0.81      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for L1 based LinearSVC Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss='squared_hinge',
                                      



Null accuracy: 50.18%
Accuracy: 82.41%
Model is 32.22% more accurate than null accuracy
Train and test time: 989.29s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      8013
           1       0.83      0.81      0.82      7954

    accuracy                           0.82     15967
   macro avg       0.82      0.82      0.82     15967
weighted avg       0.82      0.82      0.82     15967

Validation result for KNN KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Null accuracy: 50.18%
Accuracy: 62.55%
Model is 12.37% more accurate than null accuracy
Train and test time: 1872.75s
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.69      0.45      0.55      8013
           1  