<a href="https://colab.research.google.com/github/adits16/Sentiment-Analysis-using-Tensorflow/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import spacy

# Load the small English spaCy pipeline, downloading it only when it is missing.
# spacy.load raises OSError when the model package cannot be found, so a kernel
# that already has the model skips the network round-trip entirely.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.cli.download installs the model package from Python rather than
    # through a `!python` shell call that depends on PATH.
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# Define custom tokenizer
class CustomTokenizer:
    """Lemmatising text cleaner built on a spaCy pipeline.

    The cleaner lower-cases lemmas, drops stop words and punctuation, and
    returns the surviving lemmas as a single space-separated string.
    """

    def __init__(self, nlp=None):
        """Create the cleaner.

        Args:
            nlp: Optional, already-loaded spaCy pipeline to reuse. When omitted
                the 'en_core_web_sm' model is loaded, as before.
        """
        self.nlp = nlp if nlp is not None else spacy.load('en_core_web_sm')

    def text_data_cleaning(self, sentence):
        """Return ``sentence`` reduced to its informative, lower-cased lemmas.

        Args:
            sentence: Raw text to clean.

        Returns:
            A string of the kept lemmas joined by single spaces ('' when
            nothing survives the filters).
        """
        # Imported here so the method does not depend on a later notebook cell
        # having executed `import string` first.
        import string

        # Use this instance's pipeline rather than a module-level `nlp`
        # global, so the class works on its own.
        stop_words = self.nlp.Defaults.stop_words

        doc = self.nlp(sentence)
        # NOTE(review): '-PRON-' looks like the spaCy v2 pronoun placeholder
        # lemma; with a 3.x model this filter presumably never matches.
        # Confirm before removing it.
        lemmas = [token.lemma_.lower().strip() for token in doc if token.lemma_ != '-PRON-']
        # `in string.punctuation` is a substring test, so lemmas that became
        # empty after strip() are discarded here as well.
        kept = [
            lemma for lemma in lemmas
            if lemma not in stop_words and lemma not in string.punctuation
        ]
        return ' '.join(kept)

# Shared cleaner instance used by the later cells; constructing it loads the
# spaCy pipeline (see CustomTokenizer.__init__).
custom_tokenizer = CustomTokenizer()


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Read the tab-separated Alexa review file into a DataFrame
dataset = pd.read_csv(
    'alexa_reviews_dataset.tsv',
    delimiter='\t',
)

# Create a new column 'Sentiment' based on overall ratings
def compute_sentiments(labels):
    """Binarise ratings: 1 when a rating is strictly above 3.0, otherwise 0.

    Args:
        labels: Iterable of numeric ratings.

    Returns:
        A list of 0/1 ints, one per rating, in input order.
    """
    sentiments = []
    for rating in labels:
        sentiments.append(1 if rating > 3.0 else 0)
    return sentiments

# Derive the binary label, keep only the review text and the label, and give
# the text column a shorter name.
dataset = (
    dataset
    .assign(Sentiment=compute_sentiments(dataset['rating']))
    [['verified_reviews', 'Sentiment']]
    .rename(columns={'verified_reviews': 'Review'})
)

# Class balance of the derived label
print(dataset['Sentiment'].value_counts())

# Missing values per column
print(dataset.isna().sum())


Sentiment
1    2741
0     409
Name: count, dtype: int64
Review       1
Sentiment    0
dtype: int64


In [None]:
# `string.punctuation` is used by CustomTokenizer.text_data_cleaning
import string

# Fill missing reviews with '' BEFORE casting to str. Casting first turns NaN
# into the literal text 'nan', which fillna can no longer detect, so the
# missing review would be cleaned and trained on as the word "nan".
dataset['Review'] = dataset['Review'].fillna('').astype(str)

# Lemmatise each review and strip stop words / punctuation
dataset['Cleaned_Review'] = dataset['Review'].apply(custom_tokenizer.text_data_cleaning)



In [None]:
import joblib

# Upper bound on the vocabulary size handed to the Keras tokenizer
max_features = 5000

# Learn the word index from the cleaned reviews
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(dataset['Cleaned_Review'].values)
# NOTE(review): the vocabulary is fitted on every review, including those that
# end up in the test split below. Confirm this is acceptable.

# Encode reviews as integer id sequences and pad them into one 2-D array
# (no maxlen is passed, so the width comes from the data)
X = pad_sequences(tokenizer.texts_to_sequences(dataset['Cleaned_Review'].values))

# Persist the fitted tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')

# Target labels
Y = dataset['Sentiment'].values

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Requires a Google Colab runtime (google.colab import).
# Passes the pickled tokenizer file to Colab's download helper.
from google.colab import files
files.download('tokenizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Width of the learned word vectors
embedding_dim = 128

# Embedding -> spatial dropout -> single LSTM -> sigmoid output
model = Sequential([
    Embedding(max_features, embedding_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training schedule
epochs = 5
batch_size = 64

# The held-out split is passed as validation data for per-epoch monitoring
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)


Epoch 1/5
40/40 - 42s - loss: 0.4334 - accuracy: 0.8639 - val_loss: 0.3609 - val_accuracy: 0.8698 - 42s/epoch - 1s/step
Epoch 2/5
40/40 - 36s - loss: 0.3244 - accuracy: 0.8718 - val_loss: 0.2943 - val_accuracy: 0.8952 - 36s/epoch - 901ms/step
Epoch 3/5
40/40 - 35s - loss: 0.2168 - accuracy: 0.9139 - val_loss: 0.2237 - val_accuracy: 0.9048 - 35s/epoch - 882ms/step
Epoch 4/5
40/40 - 38s - loss: 0.1403 - accuracy: 0.9448 - val_loss: 0.2119 - val_accuracy: 0.9175 - 38s/epoch - 946ms/step
Epoch 5/5
40/40 - 35s - loss: 0.1015 - accuracy: 0.9631 - val_loss: 0.2179 - val_accuracy: 0.9175 - 35s/epoch - 869ms/step


In [None]:
# Score the held-out reviews and threshold the outputs at 0.5 to get 0/1 labels
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Confusion matrix, then per-class precision / recall / F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Overall accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


[[ 47  35]
 [ 17 531]]
              precision    recall  f1-score   support

           0       0.73      0.57      0.64        82
           1       0.94      0.97      0.95       548

    accuracy                           0.92       630
   macro avg       0.84      0.77      0.80       630
weighted avg       0.91      0.92      0.91       630

Accuracy: 0.9174603174603174


In [None]:
# Held-out accuracy as a percentage, rounded to two decimals
round(100 * accuracy_score(y_test, y_pred), 2)


91.75

In [None]:
from tensorflow.keras.models import load_model

# Write the trained network to disk
model.save('sentiment_analysis_model.h5')

# Read the saved file back into a separate model object
loaded_model = load_model('sentiment_analysis_model.h5')


In [None]:
# Requires a Google Colab runtime (google.colab import).
# Passes the saved model file to Colab's download helper.
from google.colab import files
files.download('sentiment_analysis_model.h5')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Print the TensorFlow version used in this runtime
import tensorflow as tf
print(tf.__version__)


2.15.0


In [None]:
# Print the standalone Keras version used in this runtime
import keras

print(keras.__version__)

2.15.0
