In [2]:
import re
import warnings
import itertools
import numpy as np
import pandas as pd
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer

# **IMDB Movie Review Sentiment Classification**

In [3]:
# Load the raw IMDB reviews; the path is relative to the notebook's working directory.
imdb_csv_path = 'IMDB-Dataset.csv'
df = pd.read_csv(imdb_csv_path)

In [4]:
# Number of rows (reviews) loaded.
df.shape[0]

50000

In [5]:
# Preview the first 20 rows of the raw frame.
df.iloc[:20]

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
# Encode the string sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_to_label = {'positive': 1, 'negative': 0}

# Only map while the column still holds the raw strings. Re-running this cell on
# an already-encoded column would otherwise turn every label into NaN, because
# 0/1 are not keys of the mapping.
already_encoded = df['sentiment'].isin(list(sentiment_to_label.values())).all()
if not already_encoded:
    encoded_sentiment = df['sentiment'].map(sentiment_to_label)
    # Series.map yields NaN for any value missing from the mapping; fail loudly
    # here instead of passing NaN labels on to the train/test split and model.
    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        unexpected_values = df.loc[unmapped, 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment values: {unexpected_values}")
    df['sentiment'] = encoded_sentiment

In [7]:
# Check the first five rows after the sentiment column was encoded.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
# !pip install nltk # Install nltk if not already installed
import nltk
nltk.download('wordnet') # Download the required 'wordnet' resource
from nltk.stem import WordNetLemmatizer

# Precompiled patterns used by clean_text.
# A tag is '<', any run of non-'>' characters (line breaks included), then '>'.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# Anything that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r'[^\w\s]')


def clean_text(text):
  """Normalize one review for vectorization.

  Steps: replace HTML tags with a space, delete punctuation/special
  characters, lowercase, split on whitespace, lemmatize each token with
  WordNetLemmatizer, and re-join the tokens with single spaces.

  Args:
    text: The raw review. Anything that is not a ``str`` (e.g. ``None`` or a
      NaN from a missing CSV cell) is treated as an empty review.

  Returns:
    The cleaned, lowercased, space-separated text; ``''`` when nothing is left.
  """
  # Missing values would otherwise raise TypeError inside re.sub.
  if not isinstance(text, str):
    return ''

  # Tags are replaced by a space (not removed outright) so that the words on
  # either side of e.g. "<br />" are not glued together into one token.
  text = _HTML_TAG_RE.sub(' ', text)
  # Remove punctuation and special characters
  text = _NON_WORD_RE.sub('', text)

  # Lowercase BEFORE lemmatizing: the WordNet lookup is case-sensitive, so a
  # capitalised token such as "Movies" would be returned unchanged.
  tokens = text.lower().split()
  if not tokens:
    return ''

  # Lemmatize to root form (default part of speech, as before).
  lemmatizer = WordNetLemmatizer()
  return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [9]:
# Run every raw review through clean_text and keep the result in a new column,
# leaving the original 'review' column untouched.
df['cleaned_review'] = df['review'].map(clean_text)

In [10]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
all_texts = df['cleaned_review']
all_labels = df['sentiment']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Compare the raw and cleaned text side by side for the first five rows.
df.head(n=5)

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one of the other reviewer ha mentioned that af...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,1,i thought this wa a wonderful way to spend tim...
3,Basically there's a family where a little boy ...,0,basically there a family where a little boy ja...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love in the time of money is a ...


In [12]:
# Progress marker: cleaning and the train/test split are done at this point.
preprocessing_status = "Data preprocessing completed."
print(preprocessing_status)

Data preprocessing completed.


In [13]:
# Initialize TF-IDF vectorizer, keeping at most the 10,000 highest-frequency terms
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Learn the vocabulary and IDF weights from the training texts only and
# vectorize them in the same pass. fit_transform produces the same matrix as
# fit() followed by transform() without tokenizing the training corpus twice.
train_features = tfidf_vectorizer.fit_transform(train_texts)

# The test texts are only transformed, so nothing is learned from the held-out split.
test_features = tfidf_vectorizer.transform(test_texts)

print("TF-IDF vectorization completed.")

TF-IDF vectorization completed.


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a logistic-regression classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    train_features, train_labels
)

# Evaluate on the held-out split.
predictions = lr_model.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)


Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [15]:
# Score one hand-written sentence with the fitted vectorizer and model.
text_to_test = "the new software update is amazing"

# The sentence goes through the same cleaning as the training reviews before
# being vectorized; transform() expects an iterable of documents, hence the list.
cleaned_text = clean_text(text_to_test)
text_features = tfidf_vectorizer.transform([cleaned_text])

# predict() returns one label per input row; take the single result.
predicted_label = lr_model.predict(text_features)[0]

# Label 1 is the positive class; anything else is reported as negative.
predicted_sentiment = "Positive" if predicted_label == 1 else "Negative"

print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Positive


In [16]:
import joblib

# Persist the fitted classifier and the fitted vectorizer as separate files;
# scoring new text requires both.
joblib.dump(value=lr_model, filename='logistic_senti.pkl')
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']