In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
#-------------------------------
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

In [2]:
import spacy
nlp = spacy.load('en_core_web_lg', disable = ["tagger", "parser", "ner"])

In [3]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

In [4]:
train_data.head()

Unnamed: 0,review,sentiment
0,Story of a man who has unnatural feelings for ...,negative
1,Airport '77 starts as a brand new luxury 747 p...,negative
2,This film lacked something I couldn't put my f...,negative
3,"Sorry everyone,,, I know this is supposed to b...",negative
4,When I was little my parents took me along to ...,negative


In [5]:
test_data.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,negative
1,This is an example of why the majority of acti...,negative
2,"First of all I hate those moronic rappers, who...",negative
3,Not even the Beatles could write songs everyon...,negative
4,Brass pictures (movies is not a fitting word f...,negative


In [6]:
pipe = Pipeline([("vectorizer", TfidfVectorizer()), ("mnb", MultinomialNB())], verbose=True)
pipe.fit(train_data['review'], train_data['sentiment'])
predClass = pipe.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   9.9s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.1s
              precision    recall  f1-score   support

    negative       0.79      0.89      0.84     12500
    positive       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [7]:
pipe2 = Pipeline([("vectorizer", TfidfVectorizer()), ("logreg", SGDClassifier(loss = 'log', random_state=1234))], verbose=True)
pipe2.fit(train_data['review'], train_data['sentiment'])
predClass2 = pipe2.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass2, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  10.1s
[Pipeline] ............ (step 2 of 2) Processing logreg, total=   0.4s
              precision    recall  f1-score   support

    negative       0.88      0.87      0.88     12500
    positive       0.87      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [8]:
from sklearn.svm import LinearSVC
pipe3 = Pipeline([("vectorizer", TfidfVectorizer()), ("SVM", LinearSVC(random_state=1234))], verbose=True)
pipe3.fit(train_data['review'], train_data['sentiment'])
predClass3 = pipe3.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass3, target_names = train_data['sentiment'].unique()))

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  10.0s
[Pipeline] ............... (step 2 of 2) Processing SVM, total=   0.9s
              precision    recall  f1-score   support

    negative       0.87      0.89      0.88     12500
    positive       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [149]:
from sklearn.base import BaseEstimator, TransformerMixin
class MeanSentenceVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def tokenizer(self, sentence):
        doc = nlp(sentence)
        preprocessed = [token.text for token in doc]
        return preprocessed
    
    def transform(self, X):
        return np.array(
            [np.mean([nlp.vocab[word].vector for word in self.tokenizer(sentence)], axis=0) for sentence in tqdm(X)]
            )

In [150]:
pipe4 = Pipeline([("vectorizer", MeanSentenceVectorizer()), ("SVM", LinearSVC(random_state=1234))], verbose=True)
pipe4.fit(train_data['review'], train_data['sentiment'])
predClass4 = pipe4.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass4, target_names = train_data['sentiment'].unique()))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 1.2min
[Pipeline] ............... (step 2 of 2) Processing SVM, total=   9.9s


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))


              precision    recall  f1-score   support

    negative       0.85      0.86      0.85     12500
    positive       0.86      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# Using tensorflow:

In [11]:
import tensorflow as tf

In [12]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [13]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [102]:
###############################
# Slice the data for trials
r = int(1/2 * len(train_data))

tr_d = pd.concat([train_data[:r], train_data[12500:(12500+r)]])
te_d = pd.concat([test_data[:r], test_data[12500:(12500+r)]])
###############################

In [154]:
# preprocess Y
y_train = np.array([0 if sent == 'negative' else 1 for sent in tqdm(tr_d['sentiment'])])
y_test = np.array([0 if sent == 'negative' else 1 for sent in tqdm(te_d['sentiment'])])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [104]:
# preprocess X
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )

In [105]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(np.array(tr_d['review']), y_train, test_size=0.2, random_state=1234)

In [106]:
longest_text = max(X_train, key=len)
num_words = len(longest_text.split(' '))
num_words

1839

In [107]:
vectorize_layer = TextVectorization(standardize=custom_standardization, output_sequence_length = num_words)
vectorize_layer.adapt(X_train)

In [108]:
x_train = vectorize_layer(X_train)
x_val = vectorize_layer(X_val)

In [111]:
num_words = len(vectorize_layer.get_vocabulary())
embd_dim = 100

In [112]:
# build model
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.regularizers import l2

np.random.seed(seed=12345)

model = Sequential()

model.add(Embedding(num_words, embd_dim, input_length=x_train.shape[1]))
model.add(Dropout(0.5))
model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dense(1, activation="sigmoid"))


""" model.add(Dense(
    300, activation='relu', input_shape = (X_train.shape[1],), 
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model.add(Dropout(0.3))

model.add(
    Dense(100, activation='relu',
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid')) """


model.compile(loss = binary_crossentropy, optimizer = Adam(learning_rate=0.01), metrics=['accuracy'])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1839, 100)         9941500   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1839, 100)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 611, 128)          89728     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total params: 10,047,869
Trainable params: 10,047,869
Non-trainable params: 0
__________________________________________

In [113]:
history = model.fit(x_train, y_train, batch_size = 125, epochs = 10, verbose = 1, validation_data = (x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [114]:
x_test = vectorize_layer(np.array(te_d['review']))
model.evaluate(x_test, y_test, batch_size = 125)



[0.705986499786377, 0.8583599925041199]

In [151]:
msv = MeanSentenceVectorizer()

In [152]:
# preprocess Y
y_train = np.array([0 if sent == 'negative' else 1 for sent in tqdm(tr_d['sentiment'])])
y_test = np.array([0 if sent == 'negative' else 1 for sent in tqdm(te_d['sentiment'])])
tr_d_msv = msv.transform(tr_d['review'])
te_d_msv = msv.transform(te_d['review'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [155]:
X_train, X_val, y_train, y_val = train_test_split(tr_d_msv, y_train, test_size=0.2, random_state=1234)

In [171]:
np.random.seed(seed=12345)

model2 = Sequential()

model2.add(Dense(300, activation="relu", input_shape=(300,),
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(300, activation="relu",
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(300, activation="relu",
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(1, activation="sigmoid"))

model2.compile(loss = binary_crossentropy, optimizer = Adam(learning_rate=0.01), metrics=['accuracy'])

model2.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_56 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_57 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_58 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_59 (Dense)             (None, 1)                 301       
Total params: 271,201
Trainable params: 271,201
Non-trainable params: 0
_________________________________________________________________


In [172]:
history = model2.fit(X_train, y_train, batch_size = 125, epochs = 100, verbose = 2, validation_data = (X_val, y_val))

Epoch 1/100
160/160 - 3s - loss: 0.4559 - accuracy: 0.7876 - val_loss: 0.3712 - val_accuracy: 0.8384
Epoch 2/100
160/160 - 1s - loss: 0.3769 - accuracy: 0.8385 - val_loss: 0.3543 - val_accuracy: 0.8544
Epoch 3/100
160/160 - 1s - loss: 0.3595 - accuracy: 0.8485 - val_loss: 0.3509 - val_accuracy: 0.8482
Epoch 4/100
160/160 - 1s - loss: 0.3560 - accuracy: 0.8460 - val_loss: 0.3949 - val_accuracy: 0.8290
Epoch 5/100
160/160 - 1s - loss: 0.3664 - accuracy: 0.8404 - val_loss: 0.3626 - val_accuracy: 0.8410
Epoch 6/100
160/160 - 1s - loss: 0.3465 - accuracy: 0.8528 - val_loss: 0.3633 - val_accuracy: 0.8418
Epoch 7/100
160/160 - 1s - loss: 0.3521 - accuracy: 0.8481 - val_loss: 0.3439 - val_accuracy: 0.8548
Epoch 8/100
160/160 - 1s - loss: 0.3405 - accuracy: 0.8535 - val_loss: 0.3451 - val_accuracy: 0.8530
Epoch 9/100
160/160 - 1s - loss: 0.3428 - accuracy: 0.8535 - val_loss: 0.3497 - val_accuracy: 0.8518
Epoch 10/100
160/160 - 1s - loss: 0.3404 - accuracy: 0.8561 - val_loss: 0.3371 - val_accura

In [173]:
X_test = te_d_msv
model2.evaluate(X_test, y_test, batch_size = 125)



[0.39255258440971375, 0.8492000102996826]