In [1]:
import sys
!{sys.executable} -m pip install -U imbalanced-learn

/bin/sh: 1: {sys.executable}: not found


In [1]:
import sys
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

pd.options.display.max_colwidth = 100

## Loading the data

In [2]:
pos_df = pd.read_csv("./CR/txt/pos.tok", sep="\n", header=None)
neg_df = pd.read_csv("./CR/txt/neg.tok", sep="\n", header=None)
pos_df.columns = ["text"]
pos_df["sentiment"] = 1
neg_df.columns = ["text"]
neg_df["sentiment"] = 0

In [3]:
pos_df

Unnamed: 0,text,sentiment
0,im a more happier person after discovering the i/p button ! .,1
1,"but , if you 're looking for my opinion of the apex dvd player , i love it ! .",1
2,it practically plays almost everything you give it .,1
3,for the price it is a well spent investment ! .,1
4,"this is by far the nicest one , in so many ways .",1
...,...,...
2400,"i installed , activated and registered nis 2004 without issue .",1
2401,"so far , the anti-spam feature seems to be very good .",1
2402,i downloaded a trial version of computer associates ex firewall and antivirus and fell in love w...,1
2403,i did not have any of the installation problems that people highlighted here .,1


In [4]:
df = pos_df.append(neg_df, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
df

Unnamed: 0,text,sentiment
0,"these tasks are faster and easier than in prior versions , and i found the menu to be much more ...",1
1,"apparently , t-mobile is heavily back-logged and can 't keep up with demands .",0
2,"the auto-focus performs well , but i love having the 12 optional scene modes - they are dummy-pr...",1
3,"the card basically installed itself and the "" install wizard "" on the cd that came with the rout...",1
4,"( it sure does look nice , though ! . ) i recommend this to any music lover anywhere : the quali...",1
...,...,...
3766,2 ) storage capacity,1
3767,you can 't do that with a sony t610 ( which i traded for a 6600 ) .,0
3768,so much packed in a small case and very affordable ! .,1
3769,this router was a huge disappointment .,0


In [6]:
ct = df['sentiment'].value_counts().reset_index()
ct.columns = ['sentiment', 'count']
print(ct)

# Too many positives, have to remove some

# count_1 = 0
# for index, row in df.iterrows():
#     if row['sentiment'] == 1:
#         count_1 += 1
#     if row['sentiment'] == 1 and count_1 > 1366:
#         df = df.drop(index)

# ct1 = df['sentiment'].value_counts().reset_index()
# ct1.columns = ['sentiment', 'count']
# print(ct1)

   sentiment  count
0          1   2405
1          0   1366


https://stanford.edu/~jurafsky/slp3/4.pdf

https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [7]:
def train_cross(X, Y, estimator):
    scores = cross_validate(estimator, X, Y, cv=10, scoring=('accuracy', 'f1', 'precision', 'recall'), return_estimator=True)
    estimators = scores["estimator"]
    best_accuracy = 0.0
    best_f1 = 0.0
    best = None

    line = "Accuracy: "
    for i in range(len(scores["test_accuracy"])):
        s = scores["test_accuracy"][i]
        line += "\t %0.2f" % s
        if best_accuracy < s:
            best = estimators[i]
            best_accuracy = s
    print(line)

    line = "F1:\t"
    for s in scores["test_f1"]:
        line += "\t %0.2f" % s
        if best_f1 < s:
            best_f1 = s
    print(line)
    print()
    print("Best test accuracy: %f" % best_accuracy)
    print("Best test f1: %f\n" % best_f1)
    y_pred = best.predict(X)

    print(classification_report(Y, y_pred, target_names=["Negative", "Positive"]))


# A Naïve Bayes classifier with add-1 smoothing using binary bag-of-words features

In [8]:
feature = df['text'].to_numpy()
labels = df['sentiment'].to_numpy()

binBagCount = CountVectorizer(max_features=1000, binary=True)
binBagFeatures = binBagCount.fit_transform(feature)

binBagNGramVect = CountVectorizer(ngram_range=(1, 2), max_features=1000, binary=True) # ngram_range=(1, 2) is what makes it ngram
binBagNGramFeatures = binBagNGramVect.fit_transform(feature)# A Naïve Bayes classifier with add-1 smoothing using binary bag-of-words features

In [9]:
nb = MultinomialNB(alpha=1.0) # alpha=1.0 is for add-one smoothing

train_cross(binBagFeatures, labels, nb)  

Accuracy: 	 0.77	 0.81	 0.78	 0.79	 0.78	 0.79	 0.80	 0.80	 0.76	 0.76
F1:		 0.82	 0.86	 0.82	 0.84	 0.83	 0.84	 0.85	 0.84	 0.81	 0.81

Best test accuracy: 0.814815
Best test f1: 0.855372

              precision    recall  f1-score   support

    Negative       0.78      0.75      0.76      1366
    Positive       0.86      0.88      0.87      2405

    accuracy                           0.83      3771
   macro avg       0.82      0.81      0.82      3771
weighted avg       0.83      0.83      0.83      3771



# A Naïve Bayes classifier with add-1 smoothing using binary bag-of-ngrams features (with unigrams and bigrams)

In [10]:
nb = MultinomialNB(alpha=1.0) # alpha=1.0 is for add-one smoothing

train_cross(binBagNGramFeatures, labels, nb)

Accuracy: 	 0.78	 0.81	 0.79	 0.80	 0.77	 0.80	 0.79	 0.79	 0.75	 0.74
F1:		 0.83	 0.84	 0.83	 0.84	 0.82	 0.84	 0.84	 0.83	 0.80	 0.80

Best test accuracy: 0.809524
Best test f1: 0.844898

              precision    recall  f1-score   support

    Negative       0.75      0.78      0.77      1366
    Positive       0.87      0.85      0.86      2405

    accuracy                           0.83      3771
   macro avg       0.81      0.82      0.81      3771
weighted avg       0.83      0.83      0.83      3771



# Logistic Regression classifier with L2 regularization (and default parameters) using binary bag-of-words features

In [11]:
clf = LogisticRegression(random_state=0, solver='liblinear', 
                             multi_class='ovr', penalty='l2')

train_cross(binBagFeatures, labels, clf)

Accuracy: 	 0.79	 0.80	 0.78	 0.78	 0.78	 0.76	 0.77	 0.80	 0.76	 0.75
F1:		 0.84	 0.84	 0.84	 0.83	 0.83	 0.82	 0.82	 0.85	 0.82	 0.81

Best test accuracy: 0.800532
Best test f1: 0.847251

              precision    recall  f1-score   support

    Negative       0.87      0.79      0.83      1366
    Positive       0.89      0.93      0.91      2405

    accuracy                           0.88      3771
   macro avg       0.88      0.86      0.87      3771
weighted avg       0.88      0.88      0.88      3771



# Logistic Regression classifier with L2 regularization using binary bag-of-ngrams features (with unigrams and bigrams)

In [12]:
clf = LogisticRegression(random_state=0, solver='liblinear', 
                             multi_class='ovr', penalty='l2')

train_cross(binBagNGramFeatures, labels, clf)

Accuracy: 	 0.73	 0.80	 0.78	 0.80	 0.78	 0.77	 0.76	 0.77	 0.74	 0.74
F1:		 0.80	 0.85	 0.83	 0.85	 0.83	 0.83	 0.81	 0.82	 0.80	 0.80

Best test accuracy: 0.801587
Best test f1: 0.846626

              precision    recall  f1-score   support

    Negative       0.87      0.80      0.83      1366
    Positive       0.89      0.93      0.91      2405

    accuracy                           0.88      3771
   macro avg       0.88      0.86      0.87      3771
weighted avg       0.88      0.88      0.88      3771



## Final Conclusion

Given the classification reports from each one of the four exercises, we can see that all of them have very similar accuracy and F1-measure (all accuracy around 0.8 and F1 around 0.85). However, the Naïve Bayes classifier with add-1 smoothing using binary bag-of-words features (first question) seems to have the highest accuracy of 0.815 and F1-measure of 0.855.