In [25]:
import pandas as pd
import numpy as np
import re

from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

In [26]:
df = pd.read_csv('data/IMDB Dataset.csv')

In [27]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
df.shape

(50000, 2)

In [29]:
def preprocess_text(data):
    text = re.sub(r'[^a-zA-Z]', ' ', data)
    tokens = [word.lower() for word in text.split()]
    clean_text = ' '.join(tokens)
    return clean_text

In [30]:
df['review'] = df['review'].apply(preprocess_text)

In [31]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


In [34]:
def df_to_corpus(df):
    corpus = []
    for i in range(df.shape[0]):
        text = df['review'][i]
        corpus.append(simple_preprocess(text))

    return corpus

In [35]:
corpus = df_to_corpus(df=df)

In [37]:
len(corpus)

50000

In [38]:
from gensim.models import Word2Vec

In [39]:
w2v = Word2Vec(corpus, vector_size=150, epochs=50)

In [40]:
w2v.corpus_count

50000

In [41]:
w2v.wv.key_to_index

{'the': 0,
 'and': 1,
 'of': 2,
 'to': 3,
 'is': 4,
 'br': 5,
 'it': 6,
 'in': 7,
 'this': 8,
 'that': 9,
 'was': 10,
 'as': 11,
 'movie': 12,
 'for': 13,
 'with': 14,
 'but': 15,
 'film': 16,
 'you': 17,
 'on': 18,
 'not': 19,
 'he': 20,
 'are': 21,
 'his': 22,
 'have': 23,
 'one': 24,
 'be': 25,
 'all': 26,
 'at': 27,
 'they': 28,
 'by': 29,
 'an': 30,
 'who': 31,
 'so': 32,
 'from': 33,
 'like': 34,
 'there': 35,
 'or': 36,
 'just': 37,
 'her': 38,
 'out': 39,
 'about': 40,
 'if': 41,
 'has': 42,
 'what': 43,
 'some': 44,
 'good': 45,
 'can': 46,
 'when': 47,
 'more': 48,
 'very': 49,
 'she': 50,
 'up': 51,
 'no': 52,
 'time': 53,
 'my': 54,
 'even': 55,
 'would': 56,
 'which': 57,
 'only': 58,
 'story': 59,
 'really': 60,
 'see': 61,
 'their': 62,
 'had': 63,
 'me': 64,
 'well': 65,
 'we': 66,
 'were': 67,
 'than': 68,
 'much': 69,
 'bad': 70,
 'get': 71,
 'been': 72,
 'other': 73,
 'do': 74,
 'people': 75,
 'great': 76,
 'will': 77,
 'also': 78,
 'into': 79,
 'because': 80,
 'how'

In [42]:
def avg_word2vec(tokens):
    word_embeddings = [w2v.wv[word] for word in tokens if word in w2v.wv.index_to_key]
    avg = np.mean(word_embeddings, axis=0)

    return avg if word_embeddings else np.zeros(w2v.vector_size)

In [46]:
from tqdm import tqdm

In [50]:
final_embeddings = []
for i in tqdm(range(len(corpus))):
    final_embeddings.append(avg_word2vec(corpus[i]))

100%|██████████| 50000/50000 [16:29<00:00, 50.53it/s]


In [51]:
X = np.array(final_embeddings)
y = df['sentiment'].map({'positive' : 1, 'negative' : 0})

In [52]:
X.shape

(50000, 150)

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [55]:
models = {
    'lor' : LogisticRegression(),
    'svc' : SVC(),
    'rf' : RandomForestClassifier()
}

for name,mod in models.items():
    model = mod
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print('=' * 50)
    print(name)
    print('Training Evaluation')
    print('-' * 50)

    print(precision_score(y_train, y_pred_train))
    print(recall_score(y_train, y_pred_train))
    print(accuracy_score(y_train, y_pred_train))

    print('-' * 50)
    print('Testing Evaluation')
    print('-' * 50)

    print(precision_score(y_test, y_pred_test))
    print(recall_score(y_test, y_pred_test))
    print(accuracy_score(y_test, y_pred_test))
    

lor
Training Evaluation
--------------------------------------------------
0.8721688400823611
0.8758256274768824
0.8743714285714286
--------------------------------------------------
Testing Evaluation
--------------------------------------------------
0.8760537407797682
0.8764000527078667
0.8747333333333334
svc
Training Evaluation
--------------------------------------------------
0.8883411670296916
0.890126931250359
0.8896857142857143
--------------------------------------------------
Testing Evaluation
--------------------------------------------------
0.8788990825688073
0.8836473843721175
0.8795333333333333
rf
Training Evaluation
--------------------------------------------------
1.0
1.0
1.0
--------------------------------------------------
Testing Evaluation
--------------------------------------------------
0.8272339324970904
0.8429305573856898
0.8314666666666667


# **Creating Neural Network Architecture**

In [80]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

In [128]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [129]:
model.summary()

In [130]:
tb_callback = TensorBoard(log_dir='log/', histogram_freq=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [131]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, callbacks=[tb_callback, early_stopping])

Epoch 1/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7969 - loss: 0.4272 - val_accuracy: 0.8671 - val_loss: 0.3113
Epoch 2/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8676 - loss: 0.3130 - val_accuracy: 0.8694 - val_loss: 0.3065
Epoch 3/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8724 - loss: 0.3056 - val_accuracy: 0.8719 - val_loss: 0.3022
Epoch 4/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8741 - loss: 0.3018 - val_accuracy: 0.8729 - val_loss: 0.3008
Epoch 5/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8784 - loss: 0.2903 - val_accuracy: 0.8770 - val_loss: 0.2951
Epoch 6/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8780 - loss: 0.2904 - val_accuracy: 0.8756 - val_loss: 0.2918
Epoch 7/50
[1m1

<keras.src.callbacks.history.History at 0x264886d41a0>

In [183]:
text = 'I appreciate the movie hard work'

In [184]:
text = preprocess_text(text)

In [185]:
tokens = simple_preprocess(text)

In [186]:
avg_embedding = avg_word2vec(tokens)

In [187]:
prediction = model.predict(avg_embedding.reshape(1,-1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [188]:
def class_returner(prediciton):
    if prediciton >= 0.6:
        print('Positive')

    elif prediciton < 0.4:
        print('Negative')

    else:
        print('Neutral')

In [189]:
class_returner(prediction)

Positive


In [192]:
model.save('models/model_file.h5')

