In [None]:
# Install NLTK, which supplies the tokenizer, stop-word list, lemmatizer and
# stemmer imported further down in this notebook.
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
# Fetch the NLTK data packages needed by the preprocessing step.
# 'punkt': tokenizer models (unpacked under tokenizers/).
nltk.download('punkt')
# 'stopwords': stop-word corpus read via stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': WordNet corpus; presumably required by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache of the NLTK helpers, so they are built once instead of on
# every call (preprocess_text is applied to each review individually).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer, stemmer) triple, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first and publish in one step, so a failure part-way
        # (e.g. a corpus that has not been downloaded) leaves the cache empty
        # and the next call simply retries.
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
            'stemmer': PorterStemmer(),
        }
        _PREPROCESS_RESOURCES.update(built)
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
        _PREPROCESS_RESOURCES['stemmer'],
    )


def preprocess_text(text):
    """Normalize one review into a space-separated string of stemmed tokens.

    Steps, in order: delete ASCII punctuation, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize with
    ``WordNetLemmatizer`` and finally stem with ``PorterStemmer``.

    Args:
        text: The raw review. ``None`` and float NaN (typical for empty
            spreadsheet cells) are treated as missing; any other non-string
            value is converted with ``str()`` before processing.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when the
        input is missing or no tokens survive filtering.
    """
    # `text != text` is only true for NaN; guards against empty cells that
    # would otherwise fail on `.translate`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove punctuation, then tokenize the lowercased text
    tokens = word_tokenize(text.translate(_PUNCTUATION_TABLE).lower())

    stop_words, lemmatizer, stemmer = _get_preprocess_resources()

    # Drop stop words, then lemmatize and stem each remaining token
    stemmed_tokens = [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in stop_words
    ]

    # Join the stemmed tokens back into a single string
    return ' '.join(stemmed_tokens)


In [None]:
import pandas as pd

# Path of the IMDB review spreadsheet.
IMDB_DATASET_PATH = '/content/sample_data/IMDB_dataset.xlsx'

# Read the spreadsheet into a DataFrame.
df = pd.read_excel(IMDB_DATASET_PATH)

In [None]:
# Preview the first five rows of the loaded reviews.
df.head()

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
3,"This show was an amazing, fresh & innovative i...",negative
4,Encouraged by the positive comments about this...,negative


In [None]:
# (rows, columns) of the full dataset.
df.shape

(25000, 2)

Using a total of 6,000 samples due to memory constraints. To keep the classes balanced, I have taken 3,000 positive reviews and 3,000 negative reviews.

In [None]:
# Draw a class-balanced subset: 3,000 reviews per sentiment label, seeded so
# the same rows are chosen on every run.
def _sample_sentiment(frame, label, n=3000, seed=42):
    """Return `n` randomly sampled rows of `frame` whose 'sentiment' equals `label`."""
    matches = frame[frame['sentiment'] == label]
    return matches.sample(n=n, random_state=seed)


positive_reviews = _sample_sentiment(df, 'positive')
negative_reviews = _sample_sentiment(df, 'negative')

# Stack the two samples (positives first) under a fresh 0..N-1 index
selected_reviews = pd.concat(
    [positive_reviews, negative_reviews],
    ignore_index=True,
)

# Show the size and a short preview of the combined frame
print("Shape of selected_reviews dataframe: ", selected_reviews.shape)
print("First few rows of selected_reviews dataframe: \n", selected_reviews.head())


Shape of selected_reviews dataframe:  (6000, 2)
First few rows of selected_reviews dataframe: 
                                               review sentiment
0  Of course the average "Sci-Fi" Battle Star Gal...  positive
1  Sorry to say I have no idea what Hollywood is ...  positive
2  "The Lady from Shanghai" is well known as one ...  positive
3  Ed Harris and Cuba Gooding Jr. where cast perf...  positive
4  Kate Miller (Angie Dickinson) is having proble...  positive


In [None]:
# Run preprocess_text() over every review and store the cleaned text back in
# the 'review' column.
# NOTE(review): the column is overwritten in place, so re-running this cell
# preprocesses text that has already been preprocessed.
cleaned_reviews = selected_reviews['review'].apply(preprocess_text)
selected_reviews['review'] = cleaned_reviews

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: sublinear term-frequency scaling, UTF-8 decoding that skips
# undecodable bytes, and scikit-learn's built-in English stop-word list.
tfidf_settings = {
    'sublinear_tf': True,
    'encoding': 'utf-8',
    'decode_error': 'ignore',
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Targets are the sentiment strings; features are the TF-IDF matrix built from
# the preprocessed reviews.
# NOTE(review): the vectorizer is fitted on all selected reviews, i.e. before
# the train/test split, so its vocabulary and IDF weights also reflect the
# rows that later end up in the test set.
y = selected_reviews['sentiment']
X = vectorizer.fit_transform(selected_reviews['review'])


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert string labels to binary labels
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then reuse that
# same mapping for the test labels.
label_encoder = LabelEncoder()
encoded_train = label_encoder.fit_transform(y_train)
encoded_test = label_encoder.transform(y_test)
y_train, y_test = encoded_train, encoded_test

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate hyper-parameters for the forest: number of trees crossed with the
# per-tree depth limit.
param_grid_rf = dict(
    n_estimators=[10, 25, 50],
    max_depth=[None, 10, 20],
)

# Base estimator; the fixed seed keeps results repeatable
rf_classifier = RandomForestClassifier(random_state=42)

# Try every grid combination, scoring each by accuracy under 5-fold
# cross-validation on the training split
grid_search_rf = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
)
grid_search_rf.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters for Random Forest: ", grid_search_rf.best_params_)
print("Best accuracy for Random Forest: ", grid_search_rf.best_score_)


Best parameters for Random Forest:  {'max_depth': None, 'n_estimators': 50}
Best accuracy for Random Forest:  0.819375


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Candidate hyper-parameters for XGBoost: number of boosting rounds, learning
# rate and maximum tree depth.
param_grid_xgb = dict(
    n_estimators=[5, 10, 25],
    learning_rate=[0.1, 0.05, 0.01],
    max_depth=[3, 5, 7],
)

# Base estimator; the fixed seed keeps results repeatable
xgb_classifier = XGBClassifier(random_state=42)

# Try every grid combination, scoring each by accuracy under 5-fold
# cross-validation on the training split
grid_search_xgb = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
)
grid_search_xgb.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters for XGBoost: ", grid_search_xgb.best_params_)
print("Best accuracy for XGBoost: ", grid_search_xgb.best_score_)

Best parameters for XGBoost:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 25}
Best accuracy for XGBoost:  0.7822916666666666


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Rebuild the Random Forest from the hyper-parameters chosen by the grid search
# (grid_search_rf.best_params_) rather than a hand-copied set of values, so the
# evaluation cannot drift out of sync if the grid or the data changes.
rf_classifier = RandomForestClassifier(random_state=42, **grid_search_rf.best_params_)

# Fit the Random Forest classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred_rf = rf_classifier.predict(X_test)

# Calculate evaluation metrics. precision/recall/F1 use scikit-learn's default
# binary averaging, i.e. they are reported for the class encoded as 1.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_score_rf = f1_score(y_test, y_pred_rf)

# Print the evaluation metrics for Random Forest
print("Accuracy for Random Forest: ", accuracy_rf)
print("Precision for Random Forest: ", precision_rf)
print("Recall for Random Forest: ", recall_rf)
print("F1 Score for Random Forest: ", f1_score_rf)


Accuracy for Random Forest:  0.8375
Precision for Random Forest:  0.8554421768707483
Recall for Random Forest:  0.8205546492659054
F1 Score for Random Forest:  0.8376353039134055


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Rebuild the XGBoost model from the hyper-parameters chosen by the grid search
# (grid_search_xgb.best_params_) rather than a hand-copied set of values, so
# the evaluation cannot drift out of sync if the grid or the data changes.
xgb_classifier = XGBClassifier(random_state=42, **grid_search_xgb.best_params_)

# Fit the XGBoost classifier to the training data
xgb_classifier.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred_xgb = xgb_classifier.predict(X_test)

# Calculate evaluation metrics. precision/recall/F1 use scikit-learn's default
# binary averaging, i.e. they are reported for the class encoded as 1.
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_score_xgb = f1_score(y_test, y_pred_xgb)

# Print the evaluation metrics for XGBoost
print("Accuracy for XGBoost: ", accuracy_xgb)
print("Precision for XGBoost: ", precision_xgb)
print("Recall for XGBoost: ", recall_xgb)
print("F1 Score for XGBoost: ", f1_score_xgb)


Accuracy for XGBoost:  0.79
Precision for XGBoost:  0.7604617604617605
Recall for XGBoost:  0.8597063621533442
F1 Score for XGBoost:  0.8070444104134762


Based on the evaluation metrics, it can be observed that Random Forest outperformed XGBoost in terms of accuracy, precision, and F1 score, while XGBoost performed better in terms of recall.

Accuracy: Random Forest achieved an accuracy of 0.8375, while XGBoost achieved an accuracy of 0.79. This means that Random Forest correctly predicted the sentiment of movie reviews with higher accuracy compared to XGBoost.

Precision: Random Forest achieved a precision of 0.8554, while XGBoost achieved a precision of 0.7605. Precision is the fraction of reviews predicted as positive that are actually positive. The higher precision indicates that a smaller share of Random Forest's positive predictions were false positives compared to XGBoost.

Recall: Random Forest achieved a recall of 0.8206, while XGBoost achieved a recall of 0.8597. Recall is the fraction of actual positive reviews that the model correctly identifies as positive. The higher recall indicates that XGBoost missed fewer positive reviews (made fewer false negative predictions) than Random Forest.

F1 Score: Random Forest achieved an F1 score of 0.8376, while XGBoost achieved an F1 score of 0.8070. Random Forest has a slightly higher F1 score compared to XGBoost.