In [99]:
# reading the dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score
import pandas as pd



In [101]:
df = pd.read_csv(r"C:\Users\Umakant\Downloads\archive (2)\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [103]:
df.shape

(50000, 2)

In [105]:
# checking for missing values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [41]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [43]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [49]:
def process_txt(text):
    # tokeninze the text
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

    

In [None]:
df['processed_review'] = df['review'].apply(process_txt)

In [107]:
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [59]:
X = df['processed_review']
y = df['sentiment']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [67]:
# Create TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [73]:
# X_train_tfidf[1]

In [77]:
clf = MultinomialNB()
# clf.fit(X_train, y_train)
clf.fit(X_train_tfidf, y_train)


In [81]:
y_pred = clf.predict(X_test_tfidf)
# y_pred

In [83]:
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.853


In [85]:
# function to predict sentiment for new review
def predict_sentiment(review):
    processed_review = process_txt(review)
    tfidf_vector = vectorizer.transform([processed_review])
    prediction = clf.predict(tfidf_vector)
    return prediction

In [87]:
# Test the model with some example reviews
example_reviews = [
    "This movie was absolutely amazing! The plot was engaging and the acting was superb.",
    "I was really disappointed with this film. The storyline was confusing and the characters were poorly developed.",
    "An average movie with some good moments, but overall it didn't live up to my expectations."
]

In [95]:
for review in example_reviews:
    sentiment = predict_sentiment(review)
    print(f"\nReview: {review}")
    print(f"Predicted sentiment: {sentiment}")


Review: This movie was absolutely amazing! The plot was engaging and the acting was superb.
Predicted sentiment: ['positive']

Review: I was really disappointed with this film. The storyline was confusing and the characters were poorly developed.
Predicted sentiment: ['negative']

Review: An average movie with some good moments, but overall it didn't live up to my expectations.
Predicted sentiment: ['positive']
