In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
#-------------------------------
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report

In [2]:
import spacy
nlp = spacy.load('en_core_web_lg', disable = ["tagger", "parser", "ner"])

In [3]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

In [15]:
train_data['sentiment'].value_counts()

negative    12500
positive    12500
Name: sentiment, dtype: int64

In [10]:
print(train_data[12500:12505].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                            review & sentiment \\
\midrule
 Bromwell High is a cartoon comedy. It ran at t... &  positive \\
 Homelessness (or Houselessness as George Carli... &  positive \\
 Brilliant over-acting by Lesley Ann Warren. Be... &  positive \\
 This is easily the most underrated film inn th... &  positive \\
 This is not the typical Mel Brooks film. It wa... &  positive \\
\bottomrule
\end{tabular}



# Using CountVectorizer

### CountVectorizer + MultinomialNB

In [4]:
pipe_count_mnb = Pipeline([("vectorizer", CountVectorizer()), ("mnb", MultinomialNB())], verbose=True)
pipe_count_mnb.fit(train_data['review'], train_data['sentiment'])
pred_count_mnb = pipe_count_mnb.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_count_mnb, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  10.6s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.1s
              precision    recall  f1-score   support

    negative       0.78      0.88      0.82     12500
    positive       0.86      0.75      0.80     12500

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000



### CountVectorizer + LogisticRegression

In [8]:
pipe_count_logreg = Pipeline([("vectorizer", CountVectorizer()), ("logreg", LogisticRegression(random_state=1234))], verbose=True)
pipe_count_logreg.fit(train_data['review'], train_data['sentiment'])
pred_count_logreg = pipe_count_logreg.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_count_logreg, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  11.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Pipeline] ............ (step 2 of 2) Processing logreg, total=   6.2s
              precision    recall  f1-score   support

    negative       0.86      0.87      0.87     12500
    positive       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



### CountVectorizer + LinearSVC

In [12]:
pipe_count_SVC = Pipeline([("vectorizer", CountVectorizer()), ("SVC", LinearSVC(random_state=1234))], verbose=True)
pipe_count_SVC.fit(train_data['review'], train_data['sentiment'])
pred_count_SVC = pipe_count_SVC.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_count_SVC, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   9.8s
[Pipeline] ............... (step 2 of 2) Processing SVC, total=   9.0s
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85     12500
    positive       0.85      0.84      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# Using TfidfVectorizer

### TfidfVectorizer + MultinomialNB

In [16]:
pipe_tfidf_mnb = Pipeline([("vectorizer", TfidfVectorizer()), ("mnb", MultinomialNB())], verbose=True)
pipe_tfidf_mnb.fit(train_data['review'], train_data['sentiment'])
pred_tfidf_mnb = pipe_tfidf_mnb.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_tfidf_mnb, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  10.1s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.1s
              precision    recall  f1-score   support

    negative       0.79      0.89      0.84     12500
    positive       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



#### TfidfVectorizer + LogisticRegression

In [17]:
pipe_tfidf_logreg = Pipeline([("vectorizer", TfidfVectorizer()), ("logreg", LogisticRegression(random_state=1234))], verbose=True)
pipe_tfidf_logreg.fit(train_data['review'], train_data['sentiment'])
predC_tfidf_logreg = pipe_tfidf_logreg.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predC_tfidf_logreg, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  10.4s
[Pipeline] ............ (step 2 of 2) Processing logreg, total=   3.4s
              precision    recall  f1-score   support

    negative       0.88      0.88      0.88     12500
    positive       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



### TfidfVectorizer + SVC

In [18]:
pipe_tfidf_SVC = Pipeline([("vectorizer", TfidfVectorizer()), ("SVC", LinearSVC(random_state=1234))], verbose=True)
pipe_tfidf_SVC.fit(train_data['review'], train_data['sentiment'])
pred_tfidf_SVC = pipe_tfidf_SVC.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_tfidf_SVC, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   9.4s
[Pipeline] ............... (step 2 of 2) Processing SVC, total=   0.8s
              precision    recall  f1-score   support

    negative       0.87      0.89      0.88     12500
    positive       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



# Using MeanSentenceVectorizer

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
class MeanSentenceVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def tokenizer(self, sentence):
        doc = nlp(sentence)
        preprocessed = [token.text for token in doc]
        return preprocessed
    
    def transform(self, X):
        return np.array(
            [np.mean([nlp.vocab[word].vector for word in self.tokenizer(sentence)], axis=0) for sentence in tqdm(X)]
            )

### MeanSentenceVectorizer + LogisticRegression

In [32]:
pipe_msv_logreg = Pipeline([("vectorizer", MeanSentenceVectorizer()), ("logreg", LogisticRegression(random_state=1234))], verbose=True)
pipe_msv_logreg.fit(train_data['review'], train_data['sentiment'])
pred_ms_logreg = pipe_msv_logreg.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_msv_logreg, target_names = train_data['sentiment'].unique()))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 1.3min
[Pipeline] ............ (step 2 of 2) Processing logreg, total=   1.7s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


              precision    recall  f1-score   support

    negative       0.84      0.86      0.85     12500
    positive       0.85      0.84      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



### MeanSentenceVectorizer + SVC

In [33]:
pipe_msv_SVC = Pipeline([("vectorizer", MeanSentenceVectorizer()), ("SVC", LinearSVC(random_state=1234))], verbose=True)
pipe_msv_SVC.fit(train_data['review'], train_data['sentiment'])
pred_msv_SVC = pipe_msv_SVC.predict(test_data['review'])

print(classification_report(test_data['sentiment'], pred_msv_SVC, target_names = train_data['sentiment'].unique()))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 1.2min
[Pipeline] ............... (step 2 of 2) Processing SVC, total=  10.7s


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


              precision    recall  f1-score   support

    negative       0.85      0.86      0.85     12500
    positive       0.86      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# Using DistilBERT

In [34]:
# code for processing sentence to sentence embedding
""" 
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

###############################
# Slice the data for trials
r = int(1/2 * len(train_data))

tr_d = pd.concat([train_data[:r], train_data[12500:(12500+r)]])
te_d = pd.concat([test_data[:r], test_data[12500:(12500+r)]])
###############################

def embed_sentences(X, maxlen=512):
    # tokenize sentences
    tokenized = []
    for sentence in tqdm(X, desc='Tokenizing sentences'):
        token_vec = tokenizer(sentence, return_tensors="tf", truncation=True, padding='max_length', max_length=maxlen)['input_ids']
        tokenized.append(token_vec)
    tokenized = tf.convert_to_tensor(tf.squeeze(tokenized))

    # embedding sentences
    vecs = []
    batches = list(tf.split(tokenized, 5000))
    for batch in tqdm(batches, desc='Processing sentences'):
        lhs = model(batch).last_hidden_state[:,0,:]
        vecs.append(lhs)
    return np.concatenate(vecs)

X_train = embed_sentences(tr_d['review'])
np.save('X_train', X_train)

X_test = embed_sentences(te_d['review'])
np.save('X_test', X_test)
 """

' \nfrom transformers import DistilBertTokenizer, TFDistilBertModel\nimport tensorflow as tf\n\ntokenizer = DistilBertTokenizer.from_pretrained(\'distilbert-base-uncased\')\nmodel = TFDistilBertModel.from_pretrained(\'distilbert-base-uncased\')\n\n###############################\n# Slice the data for trials\nr = int(1/2 * len(train_data))\n\ntr_d = pd.concat([train_data[:r], train_data[12500:(12500+r)]])\nte_d = pd.concat([test_data[:r], test_data[12500:(12500+r)]])\n###############################\n\ndef embed_sentences(X, maxlen=512):\n    # tokenize sentences\n    tokenized = []\n    for sentence in tqdm(X, desc=\'Tokenizing sentences\'):\n        token_vec = tokenizer(sentence, return_tensors="tf", truncation=True, padding=\'max_length\', max_length=maxlen)[\'input_ids\']\n        tokenized.append(token_vec)\n    tokenized = tf.convert_to_tensor(tf.squeeze(tokenized))\n\n    # embedding sentences\n    vecs = []\n    batches = list(tf.split(tokenized, 5000))\n    for batch in tqdm(b

In [36]:
# load vectors
X_train, X_test = np.load('X_train.npy'),  np.load('X_test.npy')

### DistilBERT + LogisticRegression

In [44]:
db_logreg = LogisticRegression(random_state=1234).fit(X_train, train_data['sentiment'])
pred_db_logreg = db_logreg.predict(X_test)
print(classification_report(test_data['sentiment'], pred_db_logreg, target_names = train_data['sentiment'].unique()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85     12500
    positive       0.85      0.84      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



### DistilBERT + SVC

In [45]:
db_SVC = LinearSVC(random_state=1234).fit(X_train, train_data['sentiment'])
pred_db_SVC = db_SVC.predict(X_test)
print(classification_report(test_data['sentiment'], pred_db_SVC, target_names = train_data['sentiment'].unique()))

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85     12500
    positive       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [50]:
acc = {'MultinomialNB': [0.82, 0.83, '-', '-'],
        'LogisticRegression': [0.86, 0.88, 0.85, 0.85],
        'LinearSVC':[0.85, 0.88, 0.85, 0.85]}
acc_df = pd.DataFrame(acc, columns = ['MultinomialNB', 'LogisticRegression', 'LinearSVC'],
                        index=['CountVectorizer', 'TfidfVectorizer', 'MeanSentenceVectorizer', 'DistilBERT'])

In [51]:
acc_df

Unnamed: 0,MultinomialNB,LogisticRegression,LinearSVC
CountVectorizer,0.82,0.86,0.85
TfidfVectorizer,0.83,0.88,0.88
MeanSentenceVectorizer,-,0.85,0.85
DistilBERT,-,0.85,0.85
