# Sentiment analysis with lstm

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [5]:
# Load dataset
df = pd.read_csv('archive/Reviews.csv') 
df=df[['Text','Score']]
df.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [7]:
# Convert Scores to binary sentiment (Positive: 4, 5 -> 1; Negative: 1, 2 -> 0; Neutral -> Drop)
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else (0 if x < 3 else None))
df = df.dropna(subset=['Sentiment'])

In [9]:
# Step 2: Tokenize and pad sequences
texts = df['Text'].values
sentiments = df['Sentiment'].values

In [11]:
# Tokenization
tokenizer = Tokenizer(num_words=20000)  # Keep the top 20,000 most frequent words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [13]:
# Padding
max_length = 100  # Adjust based on your data's average text length
X = pad_sequences(sequences, maxlen=max_length)
y = to_categorical(sentiments)  # One-hot encode the sentiments

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Step 4: Build the LSTM model
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=max_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # 2 classes: Positive and Negative
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [23]:
# Step 5: Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=64)

Epoch 1/5
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 69ms/step - accuracy: 0.9044 - loss: 0.2444 - val_accuracy: 0.9475 - val_loss: 0.1405
Epoch 2/5
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 85ms/step - accuracy: 0.9561 - loss: 0.1205 - val_accuracy: 0.9576 - val_loss: 0.1159
Epoch 3/5
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 89ms/step - accuracy: 0.9697 - loss: 0.0864 - val_accuracy: 0.9620 - val_loss: 0.1063
Epoch 4/5
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 87ms/step - accuracy: 0.9782 - loss: 0.0628 - val_accuracy: 0.9631 - val_loss: 0.1178
Epoch 5/5
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 86ms/step - accuracy: 0.9836 - loss: 0.0473 - val_accuracy: 0.9624 - val_loss: 0.1259


In [27]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 16ms/step - accuracy: 0.9635 - loss: 0.1225
Test Loss: 0.12408111989498138
Test Accuracy: 0.963238000869751


In [31]:
import joblib

# Save the tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')
print("Tokenizer saved successfully.")


Tokenizer saved successfully.


In [25]:
# Save the model
model.save('sentiment_lstm_model.h5')
print("Model saved successfully.")




Model saved successfully.
