In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')
import sweetviz as sv

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
print('sentiment`s distribution')
df['sentiment'].value_counts()

sentiment`s distribution


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
df['sentiment'] = df['sentiment'].map({'positive' : 1 , 'negative' : 0 })

In [6]:
print (df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB
None
                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
ps = PorterStemmer()

In [10]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ' , text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

df['clean_review'] = df['review'].apply(clean_text)
print (df['clean_review'].head())

0    one review mention watch oz episod hook right ...
1    wonder littl product br br film techniqu unass...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
Name: clean_review, dtype: object


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [17]:
MAX_WORDS = 10000
MAX_LEN = 100
tokenizer = Tokenizer(num_words = MAX_WORDS , oov_token = '<unk>')
tokenizer.fit_on_texts(df['clean_review'])

sequences = tokenizer.texts_to_sequences(df['clean_review'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')


In [21]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)
print(X_train.shape)
X_test.shape

(40000, 100)


(10000, 100)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [23]:
EMBEDDING_DIM = 128

model = Sequential()

model.add(Embedding(MAX_WORDS ,EMBEDDING_DIM , input_length = MAX_LEN))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [24]:
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 29ms/step - accuracy: 0.6012 - loss: 0.6566 - val_accuracy: 0.5466 - val_loss: 0.6839
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 31ms/step - accuracy: 0.6944 - loss: 0.5889 - val_accuracy: 0.7262 - val_loss: 0.5477
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 30ms/step - accuracy: 0.8292 - loss: 0.4186 - val_accuracy: 0.8322 - val_loss: 0.4007
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 27ms/step - accuracy: 0.8677 - loss: 0.3376 - val_accuracy: 0.8661 - val_loss: 0.3215
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 28ms/step - accuracy: 0.9125 - loss: 0.2414 - val_accuracy: 0.8615 - val_loss: 0.3377


In [25]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Loss: 0.3377
Accuracy: 0.8615


In [27]:
import mlflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

MAX_WORDS = 10000
MAX_LEN = 100
EMBEDDING_DIM = 128

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(df['clean_review'])
sequences = tokenizer.texts_to_sequences(df['clean_review'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)

with mlflow.start_run(run_name="LSTM_Sentiment_Baseline"):
    mlflow.log_param("max_words", MAX_WORDS)
    mlflow.log_param("max_len", MAX_LEN)
    mlflow.log_param("embedding_dim", EMBEDDING_DIM)
    mlflow.log_param("epochs", 5)
    
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), verbose=0)
    
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metric("final_loss", loss)
    mlflow.log_metric("final_accuracy", accuracy)
    mlflow.keras.log_model(model, name="lstm_sentiment_model")

    print("MLflow!")



MLflow!
