# NLP: Sentiment classification for a movie data set

# Loading data 

In [1]:
import pandas as pd

In [2]:
import json
from collections import Counter


# Read the raw file; only its first line is parsed as the JSON document.
with open("ressources/json_pol",encoding="utf-8") as f:
    data = f.readlines()
    json_data = json.loads(data[0])

# Each split is a sequence of [text, polarity] pairs.
train = json_data["train"]
test = json_data["test"]

# Label distribution per split (the polarity is the second item of each pair).
counter_train = Counter(label for _, label in train)
counter_test = Counter(label for _, label in test)


def _print_split_summary(header, reviews, label_counts):
    """Print the size, per-label counts and first example of one split."""
    print(header, len(reviews))
    print("----> # of positive : ", label_counts[1])
    print("----> # of negative : ", label_counts[0])
    print("")
    print(reviews[0])
    print("")


# Quick check: same report for both splits.
_print_split_summary("Number of train reviews : ", train, counter_train)
_print_split_summary("Number of test reviews : ", test, counter_test)

Number of train reviews :  25000
----> # of positive :  12500
----> # of negative :  12500

["The undoubted highlight of this movie is Peter O'Toole's performance. In turn wildly comical and terribly terribly tragic. Does anybody do it better than O'Toole? I don't think so. What a great face that man has!<br /><br />The story is an odd one and quite disturbing and emotionally intense in parts (especially toward the end) but it is also oddly touching and does succeed on many levels. However, I felt the film basically revolved around Peter O'Toole's luminous performance and I'm sure I wouldn't have enjoyed it even half as much if he hadn't been in it.", 1]

Number of test reviews :  25000
----> # of positive :  12500
----> # of negative :  12500

['Although credit should have been given to Dr. Seuess for stealing the story-line of "Horton Hatches The Egg", this was a fine film. It touched both the emotions and the intellect. Due especially to the incredible performance of seven year old 

In [3]:
# Separate every (text, polarity) pair into parallel lists of texts and labels.

# TRAIN split
corpus = [review for review, _ in train]       # X_train
classes = [label for _, label in train]        # y_train

# TEST split
test_corpus = [review for review, _ in test]   # X_test
true = [label for _, label in test]            # y_test

# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words variants to compare, keyed by a short label.
# Each value is a list whose first item is the vectorizer
# (eval_preprocessing reads index 0 of each list).
dic_bagOfWords = {
    label: [CountVectorizer(**options)]
    for label, options in (
        ("default", {}),
        ("stopwords", {"stop_words": 'english'}),
        ("rm frequent", {"max_df": 0.9}),
        ("rm rare", {"min_df": 0.1}),
        ("bigram", {"ngram_range": (2, 2)}),
        ("uni and bigram", {"ngram_range": (1, 2)}),
    )
}

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


def eval_preprocessing(train_corpus=corpus, test_corpus=test_corpus, train_classes=classes, test_classes=true, dic_levels=dic_bagOfWords, lr_max_iter=100):
    """Compare vectorizer settings by the test accuracy of three classifiers.

    For every entry of ``dic_levels`` the vectorizer is fitted on
    ``train_corpus``; a multinomial Naive Bayes, a logistic regression and a
    linear SVM are then trained on the resulting matrix and scored on
    ``test_corpus`` with ``accuracy_score``. One line per classifier is
    printed for each entry.

    Note: the default arguments are the module-level objects as they were
    when this function was defined; rebinding those names later does not
    change the defaults.

    Parameters
    ----------
    train_corpus, test_corpus : sequences of raw texts.
    train_classes, test_classes : labels aligned with the corpora.
    dic_levels : dict mapping a title to a list whose first item is a
        vectorizer. The dict and its lists are NOT modified, so the same
        dict can be evaluated repeatedly (the vectorizers themselves are
        re-fitted on each call).
    lr_max_iter : ``max_iter`` passed to ``LogisticRegression``. The default
        of 100 equals scikit-learn's own default; raise it if the solver
        emits a ConvergenceWarning.

    Returns
    -------
    dict
        New dict ``{title: [vectorizer, acc_nb, acc_lr, acc_svm]}`` in the
        iteration order of ``dic_levels``.
    """
    results = {}
    for title, vecto in dic_levels.items():
        vectorizer = vecto[0]

        # The vocabulary is learned from the training texts only and then
        # reused, unchanged, to encode the test texts.
        X = vectorizer.fit_transform(train_corpus)
        X_test = vectorizer.transform(test_corpus)

        # (display name, fresh estimator) in the order of the returned scores.
        classifiers = [
            ("Naïve Bayes", MultinomialNB()),
            ("Logistic Regression", LogisticRegression(random_state=0, solver='lbfgs', n_jobs=-1, max_iter=lr_max_iter)),
            ("SVM", LinearSVC(random_state=0, tol=1e-5)),
        ]

        accuracies = []
        for _, clf in classifiers:
            clf.fit(X, train_classes)
            accuracies.append(accuracy_score(test_classes, clf.predict(X_test)))

        # Report once all three models are scored so the lines stay grouped.
        for (name, _), acc in zip(classifiers, accuracies):
            print(f"{name} accuracy for {title}: {acc}")

        # Build a new list instead of extending vecto: mutating the caller's
        # (and default) dict made every re-run append three more scores.
        results[title] = [vectorizer, *accuracies]

    return results

# Evaluating Bag of Words

In [6]:
# Score every vectorizer setting of dic_bagOfWords on the default train/test data.
eval_bagOfWords = eval_preprocessing(dic_levels=dic_bagOfWords)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for default: 0.81356
Logistic Regression accuracy for default: 0.86392
SVM accuracy for default: 0.84576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for stopwords: 0.81968
Logistic Regression accuracy for stopwords: 0.85776
SVM accuracy for stopwords: 0.83468


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for rm frequent: 0.81484
Logistic Regression accuracy for rm frequent: 0.86264
SVM accuracy for rm frequent: 0.8448


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for rm rare: 0.71804
Logistic Regression accuracy for rm rare: 0.77152
SVM accuracy for rm rare: 0.76488


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for bigram: 0.87016
Logistic Regression accuracy for bigram: 0.88028
SVM accuracy for bigram: 0.87576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for uni and bigram: 0.85692
Logistic Regression accuracy for uni and bigram: 0.89644
SVM accuracy for uni and bigram: 0.8912


In [8]:
# One column per vectorizer setting; rows are the vectorizer object followed
# by the accuracy of each classifier.
eval_bow = pd.DataFrame(
    eval_bagOfWords,
    index=pd.Index(["model", 'naiveBayes', 'logisticRegression', 'SVM'], name='index'),
)
eval_bow

Unnamed: 0_level_0,default,stopwords,rm frequent,rm rare,bigram,uni and bigram
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
model,CountVectorizer(),CountVectorizer(stop_words='english'),CountVectorizer(max_df=0.9),CountVectorizer(min_df=0.1),"CountVectorizer(ngram_range=(2, 2))","CountVectorizer(ngram_range=(1, 2))"
naiveBayes,0.81356,0.81968,0.81484,0.71804,0.87016,0.85692
logisticRegression,0.86392,0.85776,0.86264,0.77152,0.88028,0.89644
SVM,0.84576,0.83468,0.8448,0.76488,0.87576,0.8912


In [10]:
# Second round of bag-of-words variants, adding trigram-based n-gram ranges.
# Each value is a list whose first item is the vectorizer
# (eval_preprocessing reads index 0 of each list).
dic_bagOfWords_v2 = {
    label: [CountVectorizer(**options)]
    for label, options in (
        ("default", {}),
        ("rm frequent", {"max_df": 0.9}),
        ("bigram", {"ngram_range": (2, 2)}),
        ("uni and bigram", {"ngram_range": (1, 2)}),
        ('trigram', {"ngram_range": (3, 3)}),
        ('uni to trigram', {"ngram_range": (1, 3)}),
        ('bi to trigram', {"ngram_range": (2, 3)}),
    )
}

In [11]:
# Run the same evaluation over the settings in dic_bagOfWords_v2; the corpus
# and label arguments fall back to eval_preprocessing's defaults.
eval_bagOfWords_v2 = eval_preprocessing(dic_levels=dic_bagOfWords_v2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for default: 0.81356
Logistic Regression accuracy for default: 0.86392
SVM accuracy for default: 0.84576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for rm frequent: 0.81484
Logistic Regression accuracy for rm frequent: 0.86264
SVM accuracy for rm frequent: 0.8448


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for bigram: 0.87016
Logistic Regression accuracy for bigram: 0.88028
SVM accuracy for bigram: 0.87576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for uni and bigram: 0.85692
Logistic Regression accuracy for uni and bigram: 0.89644
SVM accuracy for uni and bigram: 0.8912




Naïve Bayes accuracy for trigram: 0.87284
Logistic Regression accuracy for trigram: 0.84256
SVM accuracy for trigram: 0.8434


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for uni to trigram: 0.87296
Logistic Regression accuracy for uni to trigram: 0.89828
SVM accuracy for uni to trigram: 0.89624


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Naïve Bayes accuracy for bi to trigram: 0.88032
Logistic Regression accuracy for bi to trigram: 0.87948
SVM accuracy for bi to trigram: 0.88128


In [14]:
# One column per vectorizer setting; rows are the vectorizer object followed
# by the accuracy of each classifier.
eval_bow_v2 = pd.DataFrame(
    eval_bagOfWords_v2,
    index=pd.Index(["model", 'naiveBayes', 'logisticRegression', 'SVM'], name='index'),
)
eval_bow_v2

Unnamed: 0_level_0,default,rm frequent,bigram,uni and bigram,trigram,uni to trigram,bi to trigram
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
model,CountVectorizer(),CountVectorizer(max_df=0.9),"CountVectorizer(ngram_range=(2, 2))","CountVectorizer(ngram_range=(1, 2))","CountVectorizer(ngram_range=(3, 3))","CountVectorizer(ngram_range=(1, 3))","CountVectorizer(ngram_range=(2, 3))"
naiveBayes,0.81356,0.81484,0.87016,0.85692,0.87284,0.87296,0.88032
logisticRegression,0.86392,0.86264,0.88028,0.89644,0.84256,0.89828,0.87948
SVM,0.84576,0.8448,0.87576,0.8912,0.8434,0.89624,0.88128
