In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

In [None]:
# Load the IMDB reviews CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is a hard-coded Google Drive location under /content/drive,
# so this cell only works where that Drive is mounted.
df= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [None]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Class balance: how many reviews carry each sentiment label.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [None]:
# Number of missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
# Peek at one raw review (index label 10) before any cleaning is applied.
df['review'][10]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

In [None]:
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data used by the preprocessing below: tokenizer models,
# the English stopword list and WordNet (for the lemmatizer).
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the tokenizer tables from 'punkt_tab'; without it word_tokenize raises
# LookupError on a fresh environment. NOTE(review): confirm against the NLTK
# version pinned for this notebook.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Lazily-built NLTK resources shared by every preprocess_review call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (English stopword set, lemmatizer), creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        # stopwords.words() returns a list; a frozenset gives a constant-time
        # membership test instead of a linear scan for every token.
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stopwords'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_review(review):
    """Clean one raw review and return it as a space-separated string of lemmas.

    Steps, in order: strip HTML tags and URLs, drop punctuation, tokenize,
    keep purely alphabetic tokens (lower-cased), remove English stopwords,
    lemmatize, and join the result with single spaces.

    Args:
        review: The raw review text (str).

    Returns:
        The cleaned review as a single string; empty if no token survives.
    """
    # Replace tags with a space rather than nothing: reviews use "<br /><br />"
    # as a paragraph break with no surrounding whitespace, so deleting the tag
    # outright glues the neighbouring words together
    # (e.g. "punchlines.<br /><br />At" became "punchlinesat").
    review = re.sub(r'<[^>]+>', ' ', review)

    # Remove URLs. 'http\S+' already matches 'https...' links, so no separate
    # https pattern is needed. A space is substituted for the same
    # word-gluing reason as above.
    review = re.sub(r'http\S+', ' ', review)
    review = re.sub(r'www\S+', ' ', review)

    # Remove special characters and punctuation
    review = re.sub(r'[^\w\s]', '', review)

    # Tokenize the text
    tokens = word_tokenize(review)

    # Stopword set and lemmatizer are built once and reused across calls
    # instead of being recreated for every review / every token.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Keep alphabetic tokens, lower-case them, drop stopwords, then lemmatize.
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in (token.lower() for token in tokens if token.isalpha())
        if word not in stop_words
    ]

    # Join the tokens back into a string
    return ' '.join(lemmas)

# Run preprocess_review on every row and overwrite the 'review' column with the
# cleaned text. The raw text is no longer available in df after this cell, and
# later cells read the cleaned column under the same name.
df['review'] = df['review'].apply(preprocess_review)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# The review at index label 10 again, now showing the text produced by preprocess_review.
df['review'][10]

'phil alien one quirky film humour based around oddness everything rather actual punchlinesat first odd pretty funny movie progressed didnt find joke oddness funny anymoreits low budget film thats never problem pretty interesting character eventually lost interesti imagine film would appeal stoner currently partakingfor something similar better try brother another planet'

In [None]:
# Vectorize text using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the cleaned reviews and encode them
# as a sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on every row of df, including the rows
# a later cell holds out as the test set.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['review'])

# Binary target as a NumPy array: 1 for 'positive', 0 for any other label.
y = (df['sentiment'] == 'positive').astype('int64').values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 20% of the rows for testing. The targets passed here are the raw
# 'positive'/'negative' strings from df['sentiment'], so the fitted model
# predicts those strings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Baseline multinomial Naive Bayes with default hyperparameters.
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# Held-out accuracy followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.8675
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4961
    positive       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_val_predict

# Candidate MultinomialNB settings: smoothing strength and whether class
# priors are learned from the training data.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0],
    fit_prior=[True, False],
)

# One 5-fold stratified splitter, shuffled with a fixed integer seed; it is
# reused for the grid search and for the follow-up cross-validation.
five_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by accuracy on the training split.
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    cv=five_fold,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Score the refitted best estimator on the held-out test rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Per-fold accuracy of the best configuration on the training data, using the
# same splitter settings as the grid search.
cv_results = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=five_fold,
    scoring='accuracy',
)
print("Cross-validated scores:", cv_results)
print("Mean cross-validated accuracy:", cv_results.mean())
print("Standard deviation of cross-validated accuracy:", cv_results.std())

Best Parameters: {'alpha': 1.0, 'fit_prior': False}
Best Cross-Validation Score: 0.86165
Test Accuracy: 0.8674
Classification Report:
               precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4961
    positive       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Cross-validated scores: [0.864    0.861375 0.862625 0.859625 0.860625]
Mean cross-validated accuracy: 0.86165
Standard deviation of cross-validated accuracy: 0.00152970585407784


In [None]:
def predict_sentiment(review):
    """Predict the sentiment label of a single raw review.

    The text is cleaned with preprocess_review, encoded with the already
    fitted `tfidf` vectorizer and classified with `best_model`.

    Args:
        review: Raw review text (str).

    Returns:
        'positive' or 'negative'.
    """
    # Preprocess the review
    preprocessed_review = preprocess_review(review)

    # Vectorize the review using the trained TF-IDF vectorizer
    vectorized_review = tfidf.transform([preprocessed_review])

    # Predict the sentiment using the trained model
    prediction = best_model.predict(vectorized_review)[0]

    # The model in this notebook is trained on the string labels from
    # df['sentiment'], so the prediction is already 'positive'/'negative'.
    # Comparing it with 1 was never true and made every review come back as
    # 'negative'. Return string predictions as-is.
    if isinstance(prediction, str):
        return prediction

    # Fallback for a model trained on the 0/1 encoding instead.
    return 'positive' if prediction == 1 else 'negative'

# Smoke test: classify one hand-written review and print the predicted label.
new_review = "The hated every second of this movie!"
predicted_sentiment = predict_sentiment(new_review)
print(f"The sentiment for the review '{new_review}' is: {predicted_sentiment}")

The sentiment for the review 'The hated every second of this movie!' is: negative
