In [125]:
import pandas as pd
import numpy as np 

In [126]:
# Load the IMDB reviews dataset (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv("IMDB Dataset.csv")
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [127]:
# (rows, columns) of the raw dataset
df.shape

(50000, 2)

In [128]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes. As the preview below shows,
# "negative" is encoded as 0 and "positive" as 1.
le = LabelEncoder()
encoded_sentiment = le.fit_transform(df["sentiment"].values)
df["sentiment"] = encoded_sentiment

df.head(9)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0


In [129]:
# Text preprocessing
import string
import nltk
from nltk.corpus import stopwords

In [130]:
# Download the NLTK resources used by the preprocessing step below:
# "stopwords" for stop-word removal, "wordnet" for lemmatization,
# and "punkt" for tokenization.

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")



[nltk_data] Downloading package stopwords to C:\Users\Zeeshan
[nltk_data]     Afridi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Zeeshan
[nltk_data]     Afridi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Zeeshan
[nltk_data]     Afridi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [131]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_cleaning_resources():
    """Build the punctuation table, stop-word set and lemmatizer once, then reuse them."""
    return (
        str.maketrans('', '', string.punctuation),
        frozenset(stopwords.words('english')),
        nltk.stem.WordNetLemmatizer(),
    )


def clean_text(text):
    """Normalize a raw review into the string form of its lemmatized token list.

    Steps: remove every character in ``string.punctuation``, lowercase,
    tokenize with ``nltk.word_tokenize``, drop English stop words, and
    lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Only punctuation characters are removed, so HTML markup such as
    ``<br />`` survives as a plain ``br`` token.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        ``str(tokens)`` -- the textual representation of the token list, e.g.
        ``"['wonderful', 'little', 'production']"``. This format is kept
        unchanged because the TF-IDF vectorizer in this notebook is fitted on
        it.
    """
    # The stop-word list and lemmatizer are loaded lazily on the first call
    # and cached, instead of being rebuilt for every one of the reviews.
    punctuation_table, stop_words, lemmatizer = _text_cleaning_resources()

    # Remove punctuation and lowercase
    text = text.translate(punctuation_table).lower()

    # Tokenize and remove stop words
    tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]

    # Lemmatize tokens
    return str([lemmatizer.lemmatize(token) for token in tokens])


In [132]:
# Clean the "review" column and store the result in a new "cleaned_reviews" column.
# Series.apply already returns a Series aligned on df's index, so no extra wrapping is needed.
df["cleaned_reviews"] = df["review"].apply(clean_text)


In [133]:
# Preview the frame, now including the cleaned_reviews column
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,1,"['one', 'reviewer', 'mentioned', 'watching', '..."
1,A wonderful little production. <br /><br />The...,1,"['wonderful', 'little', 'production', 'br', 'b..."
2,I thought this was a wonderful way to spend ti...,1,"['thought', 'wonderful', 'way', 'spend', 'time..."
3,Basically there's a family where a little boy ...,0,"['basically', 'there', 'family', 'little', 'bo..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"['petter', 'matteis', 'love', 'time', 'money',..."


In [134]:
# Keep only what the model needs: the cleaned text and its encoded label
model_columns = ["cleaned_reviews", "sentiment"]
cleaned_df = df[model_columns]
cleaned_df.head()

Unnamed: 0,cleaned_reviews,sentiment
0,"['one', 'reviewer', 'mentioned', 'watching', '...",1
1,"['wonderful', 'little', 'production', 'br', 'b...",1
2,"['thought', 'wonderful', 'way', 'spend', 'time...",1
3,"['basically', 'there', 'family', 'little', 'bo...",0
4,"['petter', 'matteis', 'love', 'time', 'money',...",1


In [135]:
# Check column dtypes and non-null counts before vectorizing
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cleaned_reviews  50000 non-null  object
 1   sentiment        50000 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 586.1+ KB


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# models
from sklearn.naive_bayes import MultinomialNB   
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [137]:
# Build TF-IDF features, limiting the vocabulary to 10,000 terms.
tfidf = TfidfVectorizer(max_features=10000)

# Keep the SciPy sparse matrix that fit_transform returns. Densifying it with
# .toarray() would allocate 50,000 x 10,000 float64 values (~4 GB) although
# almost all entries are zero. train_test_split and LogisticRegression both
# accept sparse input directly, and .shape works the same way.
transformed_reviews = tfidf.fit_transform(cleaned_df["cleaned_reviews"])



In [162]:
import pickle

# Persist the fitted TF-IDF vectorizer so new text can later be transformed
# with the same vocabulary and weights that the classifier was trained on.
# The context manager guarantees the file is flushed and closed, even on error.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [138]:
# Feature matrix (TF-IDF vectors) and target vector (encoded sentiment)
x, y = transformed_reviews, cleaned_df["sentiment"]

In [139]:
# Hold out 20% of the reviews for evaluation. Stratifying on the label keeps
# the class balance equal in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    stratify=y,
    random_state=145,
)

In [140]:
# Sanity-check the split sizes: (rows, features) for train, then for test
for split_features in (x_train, x_test):
    print(split_features.shape)

(40000, 10000)
(10000, 10000)


In [141]:
# Create the model and fit it on the training split
# (fit returns the estimator itself, so it can be chained onto the constructor).
lor = LogisticRegression().fit(x_train, y_train)

# Predict labels for the held-out reviews
y_pred = lor.predict(x_test)

# Classification report: per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [154]:
user_input = "You are tereribl"

# Run the input through the same pipeline used for training:
# clean the text, vectorize it with the fitted TF-IDF, then classify.
cleaned_text = clean_text(user_input)
vectorized_text = tfidf.transform([cleaned_text]).toarray()
predictions = lor.predict(vectorized_text)

# Only one sample was passed in, so show its label (0 = negative, 1 = positive)
first_prediction = predictions[0]
print("Predicted class:", first_prediction)

Predicted class: 0


In [156]:
def predict_sentiment(text):
  """Classify a review as positive or negative with the trained model.

  The text is cleaned with ``clean_text``, vectorized with the fitted
  ``tfidf`` vectorizer and classified by the ``lor`` model.

  Args:
      text: User-provided review text.

  Returns:
      "Positive" when the model predicts class 1, otherwise "Negative".
      Exceptions are not propagated: if any step fails, the string
      "Error: <message>" is returned instead.
  """
  try:
    # Apply the same preprocessing the model was trained on
    cleaned = clean_text(text)
    features = tfidf.transform([cleaned]).toarray()

    # A single sample goes in, so take the first (only) predicted label
    label = lor.predict(features)[0]

    # Map the encoded label back to a readable name (0 is negative, 1 is positive)
    if label == 1:
      return "Positive"
    return "Negative"

  except Exception as e:
    return f"Error: {str(e)}"

  
# Try the helper on a negative review that contains spelling mistakes
text = "This movie is terrifble, it was a bad experiecne watching this movie!"

predict_sentiment(text)


'Negative'

In [161]:
import pickle

# Persist the trained logistic regression model.
# NOTE: the file is a pickle despite its ".h5" extension (it is not HDF5). The
# name is left unchanged so anything that already opens this path keeps working;
# read it back with pickle.load, and only from a trusted source.
# The context manager guarantees the file is flushed and closed, even on error.
with open("IMDB_Classification_model.h5", "wb") as model_file:
    pickle.dump(lor, model_file)