<a href="https://colab.research.google.com/github/ahmadmujtaba-dev/Hadoop_word_count/blob/main/Sentiment_Analysis_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

In [3]:
def preprocess_text(text):
    """
    Preprocesses the input text by cleaning and normalizing.
    - Convert to lowercase
    - Remove special characters and punctuation
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [4]:
def tokenize_pad_sequences(corpus, maxlen=100, vocab_size=10000):
    """
    Tokenizes the text corpus and applies padding.
    """
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return padded_sequences, tokenizer


In [5]:
def build_lstm_model(input_length, vocab_size, embedding_dim=64):
    """
    Builds and compiles an LSTM model for text classification.
    """
    model = models.Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [9]:
df = pd.read_csv("/content/IMDB Dataset.csv", encoding='utf-8', quotechar='"')

In [10]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [11]:
df.duplicated(subset = 'review').sum()

418

In [12]:
df.drop_duplicates(subset = 'review', keep = 'last' , inplace = True)

In [13]:
df['Cleaned_Review'] = df['review'].apply(preprocess_text)


In [14]:
# Step 5: Tokenize and pad sequences
X, tokenizer = tokenize_pad_sequences(df['Cleaned_Review'], maxlen=100, vocab_size=10000)

In [15]:
# Convert sentiment labels to numpy array
y = np.array(df['sentiment'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
#X_train, X_test, y_train,
y_test


array([0, 1, 0, ..., 1, 0, 0])

In [18]:
# Step 6: Build the LSTM model
model = build_lstm_model(input_length=200, vocab_size=10000)



In [23]:
df["sentiment"] = df["sentiment"].replace({"negative": 0, "positive": 1})

df.sample(10)

Unnamed: 0,review,sentiment,Cleaned_Review
19889,It is hard for a lover of the novel Northanger...,0,it is hard for a lover of the novel northanger...
27412,You probably heard this phrase when it come to...,0,you probably heard this phrase when it come to...
30246,"In a very short time, the movie showed a boy's...",1,in a very short time the movie showed a boys o...
35606,"""The Duke"" is a film based in the heart of the...",0,the duke is a film based in the heart of the b...
39440,This wonderful little film has all of the elem...,1,this wonderful little film has all of the elem...
18122,Aside from Frankie Muniz chattering too fast t...,0,aside from frankie muniz chattering too fast t...
15359,<br /><br />I've seen this movie during a fest...,0,br br ive seen this movie during a festival he...
11460,Firstly let me say that I didn't like the fact...,1,firstly let me say that i didnt like the fact ...
2239,"In all honesty, this series is as much a class...",1,in all honesty this series is as much a classi...
45246,Every movie critic and metal head hated this m...,1,every movie critic and metal head hated this m...


In [20]:
df

Unnamed: 0,review,sentiment,Cleaned_Review
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love in the time of money is a ...
...,...,...,...
49995,I thought this movie did a down right good job...,1,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,im going to have to disagree with the previous...


In [25]:
df["sentiment"] = df["sentiment"].replace({"negative": 0, "positive": 1})

In [26]:
print(df['sentiment'].unique())

[1 0]


In [29]:
y_test = np.where(y_test == 'negative', 0, 1)

In [36]:
 y_train = np.where(y_train == 'negative', 0, 1)

In [38]:
X_train, X_test, y_train, y_test

(array([[   1,    8,   30, ...,    8,    2,  151],
        [   8, 3097,    9, ...,    9,   36,    1],
        [   2,  251,  107, ...,  105,   79, 1252],
        ...,
        [1817,    2, 1751, ..., 4760,    1, 6988],
        [ 231,  388,    7, ...,   41,   22,    1],
        [ 361,    3,   56, ...,   14,   22, 1461]], dtype=int32),
 array([[   1,    9,   40, ...,  666,   46,    5],
        [ 327,    7, 5361, ..., 4629,  268,  305],
        [   6,   26,   56, ...,    4,  573, 2055],
        ...,
        [   2,  189,   12, ...,  524, 1121,    1],
        [   1, 1992,   66, ...,    1, 6394,  154],
        [ 199,   34,  336, ...,   22,    1, 1784]], dtype=int32),
 array([0, 1, 1, ..., 1, 1, 0]),
 array([0, 1, 0, ..., 1, 0, 0]))

In [39]:
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
1240/1240 - 81s - 65ms/step - accuracy: 0.7785 - loss: 0.4494 - val_accuracy: 0.8524 - val_loss: 0.3466
Epoch 2/5
1240/1240 - 82s - 66ms/step - accuracy: 0.8796 - loss: 0.2956 - val_accuracy: 0.8586 - val_loss: 0.3292
Epoch 3/5
1240/1240 - 84s - 67ms/step - accuracy: 0.9100 - loss: 0.2297 - val_accuracy: 0.8530 - val_loss: 0.3482
Epoch 4/5
1240/1240 - 79s - 64ms/step - accuracy: 0.9325 - loss: 0.1767 - val_accuracy: 0.8585 - val_loss: 0.3679
Epoch 5/5
1240/1240 - 82s - 66ms/step - accuracy: 0.9515 - loss: 0.1328 - val_accuracy: 0.8352 - val_loss: 0.4317


In [40]:
model.summary()

In [41]:
def preprocess_input(review, tokenizer, maxlen=100):

    # Preprocess the input text
    cleaned_review = preprocess_text(review)

    # Tokenize and pad the input text
    sequence = tokenizer.texts_to_sequences([cleaned_review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post')

    return padded_sequence

In [42]:
def predict_sentiment(model, review, tokenizer):

    # Preprocess the input review
    processed_review = preprocess_input(review, tokenizer)

    # Predict the sentiment score
    sentiment_score = model.predict(processed_review)[0][0]  # Predict and extract score

    return sentiment_score

In [43]:
def sentiment_to_rating(sentiment_score):

    # Convert sentiment score (0 to 1) to rating (1 to 10)
    rating = int(sentiment_score * 9) + 1  # Scale from 1 to 10

    return rating

In [44]:
def test_model(model, review, tokenizer):

    # Predict sentiment score
    sentiment_score = predict_sentiment(model, review, tokenizer)

    # Convert sentiment score to rating
    rating = sentiment_to_rating(sentiment_score)

    # Output the results
    print(f"Review: {review}")
    print(f"Sentiment Score: {sentiment_score:.2f} (Scale: 0 to 1)")
    print(f"Rating: {rating}/10 (Based on Sentiment Score)")

    return sentiment_score, rating


In [45]:
new_review = "This movie was absolutely amazing with stunning visuals and a gripping plot!"
sentiment_score, rating = test_model(model, new_review, tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
Review: This movie was absolutely amazing with stunning visuals and a gripping plot!
Sentiment Score: 0.98 (Scale: 0 to 1)
Rating: 9/10 (Based on Sentiment Score)


In [46]:
model.save('my_lstm_model.h5')



In [47]:
model.save('my_model.keras')