# **Sentiment Analysis App**


**# Step 1: Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
!pip install nltk
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**# Step 2: Load and Explore the Dataset**

In [3]:
df = pd.read_csv("/content/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df.shape

(50000, 2)

**# Step 3: Data Preprocessing (Cleaning)**


In [6]:
def preprocess_text(text):
    # Remove special characters and convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    return text

In [7]:
def remove_html_tags(text):
  """Removes HTML tags from a string."""
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

df['review'] = df['review'].apply(remove_html_tags)


In [8]:
df['review'] = df['review'].apply(preprocess_text)

In [9]:
 df.head(12)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
5,probably alltime favorite movie story selfless...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea first aired...,negative
8,encouraged positive comments film looking forw...,negative
9,like original gut wrenching laughter like movi...,positive


**# Step 4: Convert Text to Numeric Features (TF-IDF Vectorization)**

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)  # Experimenting with bigrams
X = tfidf.fit_transform(df['review']).toarray()
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Encode sentiment as 1/0

**# Step 5: Split Data into Training and Testing Sets**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

**# Step 6: Hyperparameter Tuning for Naive Bayes using Grid Search**

In [None]:
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
# Print the best parameters and best accuracy
print(f"Best Parameters for Naive Bayes: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

Best Parameters for Naive Bayes: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.8594857142857142


In [None]:
# Use the best model from GridSearchCV
best_nb_model = grid_search.best_estimator_

**# Step 7: Evaluate the Best Naive Bayes Model**


In [None]:
y_pred_nb = best_nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.8615333333333334
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86      7411
           1       0.86      0.87      0.86      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



**# Step 8: Train and Evaluate Logistic Regression Model**


In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
# Predict and evaluate
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.8968
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.89      7411
           1       0.89      0.91      0.90      7589

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



**# Step 9: Cross-Validation for Robustness (for Logistic Regression)**


In [None]:
cv_scores = cross_val_score(logreg, X, y, cv=5)
print(f"Logistic Regression Cross-Validation Accuracy: {np.mean(cv_scores)}")
print(f"Standard Deviation: {np.std(cv_scores)}")

Logistic Regression Cross-Validation Accuracy: 0.8947999999999998
Standard Deviation: 0.0038662643468857848


**# Step 10: Save the Best Model in Directory**


In [None]:
pickle.dump(best_nb_model, open("Updated_sentiment_model_nb.pkl", "wb"))

In [None]:
pickle.dump(logreg, open("Updated_sentiment_model_logreg.pkl", "wb"))

In [None]:
with open("tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(tfidf, file)
