## <center> Sentiment Analysis Of Movie Reviews </center>

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
import os

In [11]:
# Read the labelled reviews and keep only the review text and its sentiment label
rating_df = pd.read_csv('data/rating_auto_label_sentiment_two_classes.csv').loc[:, ['review_text', 'sentiment']]

# Cell output: (rows, columns) after the column selection
rating_df.shape

(10468, 2)

In [12]:
# Discard every row that has a missing value in at least one column
rating_df = rating_df.dropna(axis=0, how='any')
rating_df.shape

(10462, 2)

In [13]:
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Text pre-processing: a single WordNet lemmatizer instance used by lemmatize_sentence()
lemmatizer = WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag yields ``None``.
    """
    # (tag prefix, name of the wordnet attribute); looked up lazily so that
    # wordnet is only touched when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),   # adjective
        ('V', 'VERB'),  # verb
        ('N', 'NOUN'),  # noun
        ('R', 'ADV'),   # adverb
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize each token of ``sentence`` and return them joined by single spaces.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS are lemmatized with that POS; all other tokens are kept
    exactly as tokenized.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    def _lemma(token, pos_tag):
        wn_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # No WordNet equivalent for this tag -> leave the token untouched
        return token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)

    return " ".join(_lemma(token, pos_tag) for token, pos_tag in tagged_tokens)

def preprocess_text(df):
    """Normalise the ``review_text`` column of ``df`` in place and return ``df``.

    Steps, in order: replace missing values with '', cast to str, lower-case,
    strip URLs, strip punctuation, strip non-ASCII characters, strip digits,
    then trim and collapse whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_text`` column; that column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``review_text`` cleaned.
    """
    # fillna has to run BEFORE astype(str): casting first turns missing values
    # into literal strings (e.g. 'nan'), which fillna can no longer detect.
    text = df['review_text'].fillna('').astype(str)

    # update to lower case
    text = text.str.lower()

    # remove URLs -- must happen before punctuation removal, otherwise the
    # '://' that the pattern anchors on has already been stripped and the
    # pattern can never match.
    text = text.str.replace(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

    # remove punctuation
    text = text.str.replace(r'[{}]'.format(re.escape(string.punctuation)), '', regex=True)

    # remove non-ASCII / special characters (a single pass is sufficient)
    text = text.str.replace(r'[^\x00-\x7F]+', '', regex=True)

    # remove digits
    text = text.str.replace(r'\d+', '', regex=True)

    # trim and collapse whitespace last, so the gaps left behind by the
    # removals above are normalised as well
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    df['review_text'] = text
    return df

def remove_stopwords(df):
    """Remove NLTK English stop words, plus the extra token 'br', from ``review_text``.

    Each review is split on whitespace, filtered, and re-joined with single
    spaces. The column is overwritten in place and ``df`` is returned.
    """
    blocked_words = set(stopwords.words('english') + ['br'])

    def _keep_content_words(text):
        return ' '.join(token for token in text.split() if token not in blocked_words)

    df['review_text'] = df['review_text'].apply(_keep_content_words)
    return df

def lemmatize(df):
    """Replace every ``review_text`` entry with its lemmatized form.

    Applies ``lemmatize_sentence`` row by row, overwrites the column in place
    and returns ``df``.
    """
    lemmatized_text = df['review_text'].apply(lemmatize_sentence)
    df['review_text'] = lemmatized_text
    return df

# Empty results table; the training cells append one row per trained model
df_result = pd.DataFrame(
    columns=[
        'model', 'task_no', 'vectorizer', 'ngram', 'max_iter',
        'C', 'gamma', 'tree_param', 'n_estimator', 'lrate',
        'batch_size', 'num_epoch', 'weight_decay',
        'test_accuracy', 'wall_time', 'run_time',
    ]
)

# Running counter used to number the trained models
model_no = 1

# CSV file the result rows are appended to
filename = "output/result_EL.csv"

In [14]:
# Full cleaning pipeline: normalise text -> drop stop words -> lemmatize
rating_df = (
    rating_df
    .pipe(preprocess_text)
    .pipe(remove_stopwords)
    .pipe(lemmatize)
)

rating_df.head()

Unnamed: 0,review_text,sentiment
0,let leave door love beetlejuice edward scissor...,NEGATIVE
1,fast paced action thriller delivers begin end ...,POSITIVE
2,excellent movie great cast see movie saw one r...,POSITIVE
3,write three highpraise review try think bad mo...,NEGATIVE
4,well make movie quality write act cinematograp...,POSITIVE


In [15]:
from sklearn.model_selection import train_test_split

# First split: keep 80% of the reviews for training, hold out the other 20%
X_train, X_temp, y_train, y_temp = train_test_split(
    rating_df.review_text,
    rating_df.sentiment,
    test_size=0.2,
    random_state=42,
)

# Second split: halve the held-out part into validation and test (10% each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the size of the train / validation / test splits, one per line
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

(8369,)
(1046,)
(1047,)


In [16]:
# Ensemble Model

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
import time 
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

n_grams = [1, 2, 3]
n_iter = 100000
n_vect = 'tfidf'
C = 1
gamma = 'scale'

best_model = ''
best_accuracy = 0
best_y_test_pred = None

for n_gram in n_grams:

    vect = TfidfVectorizer(max_features=None, ngram_range=(1,n_gram), stop_words='english', lowercase=True, strip_accents='ascii')

    # Fit on training data and transform the training data to vector (document-term matrix)
    X_train_dtm = vect.fit_transform(X_train)
    # display(X_train_dtm)

    X_val_dtm = vect.transform(X_val)
    # display(X_val_dtm)

    X_test_dtm = vect.transform(X_test)
    # display(X_test_dtm)

    # Initialize, scale, fit
    logreg_pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', LogisticRegression(max_iter=n_iter, class_weight='balanced', random_state=42))
    ])

    svm_linear_pipeline = Pipeline([
        ('scaler', StandardScaler( with_mean=False)),
        ('classifier', LinearSVC(dual="auto", max_iter=n_iter, class_weight='balanced', random_state=42))
    ])

    svm_rbf_pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', SVC(max_iter=n_iter, kernel='rbf', C=C, gamma=gamma, class_weight='balanced', random_state=42))
    ])

    decision_tree_pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', DecisionTreeClassifier(class_weight='balanced', random_state=42))
    ])

    # Create the ensemble model using VotingClassifier
    ensemble_model = VotingClassifier(estimators=[
        ('logreg', logreg_pipeline),
        # ('svm', svm_linear_pipeline),
        ('svm_rbf', svm_rbf_pipeline),
        ('dt', decision_tree_pipeline)
    ], voting='hard')

    # Train the ensemble model
    start_time = time.time()
    %time ensemble_model.fit(X_train_dtm, y_train)
    end_time = time.time()
    wall_time = end_time - start_time

    # Predict and evaluate the classifier
    y_val_pred = ensemble_model.predict(X_val_dtm)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Final evaluation on test set
    y_test_pred = ensemble_model.predict(X_test_dtm)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Print result
    task_no = str(model_no)
    model = 'Ensemble'
    print(f"{model} - {task_no}, text_preprocess: {True}, vectorizer: {n_vect}, ngram: {n_gram}, max_iter: {n_iter}")
    print(f"Test Accuracy: {test_accuracy}\n")
    model_no +=1

    # Record result to dataframe, to be exported to csv
    # columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'tree_param', 'n_estimator', 'lrate', 'batch_size', 'num_epoch', 'weight_decay', 'test_accuracy', 'wall_time', 'run_time']
    new_row = [model, task_no, n_vect, n_gram, n_iter, '', '', 'full_tree', 0, 0, 0, 0, 0, test_accuracy, wall_time, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
    df_result.loc[len(df_result)] = new_row

    new_row_df = pd.DataFrame([new_row], columns=df_result.columns)
    new_row_df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

    # Check for the best model
    if test_accuracy > best_accuracy:
        best_model = f"{model} - {task_no}, text_preprocess: {True}, vectorizer: {n_vect}, ngram: {n_gram}, max_iter: {n_iter}"
        best_accuracy = test_accuracy
        best_y_test_pred = y_test_pred

CPU times: total: 2min 1s
Wall time: 2min 10s
Ensemble - 1, text_preprocess: True, vectorizer: tfidf, ngram: 1, max_iter: 100000
Test Accuracy: 0.7363896848137536

CPU times: total: 4min 12s
Wall time: 5min 20s
Ensemble - 2, text_preprocess: True, vectorizer: tfidf, ngram: 2, max_iter: 100000
Test Accuracy: 0.6723973256924546

CPU times: total: 6min 35s
Wall time: 8min 35s
Ensemble - 3, text_preprocess: True, vectorizer: tfidf, ngram: 3, max_iter: 100000
Test Accuracy: 0.664756446991404



In [17]:
# Summarise the selected configuration and its per-class metrics on the test split
from sklearn.metrics import accuracy_score, classification_report

# best_y_test_pred stays None when no run was ever recorded as best
if best_y_test_pred is not None:
    print("The best model:", best_model)
    print("The best accuracy:", best_accuracy)

    Report = classification_report(y_test, best_y_test_pred)
    print("Classification Report of the Best Model:\n", Report, sep="\n")

The best model: Ensemble - 1, text_preprocess: True, vectorizer: tfidf, ngram: 1, max_iter: 100000
The best accuracy: 0.7363896848137536
Classification Report of the Best Model:

              precision    recall  f1-score   support

    NEGATIVE       0.74      0.45      0.56       388
    POSITIVE       0.74      0.91      0.81       659

    accuracy                           0.74      1047
   macro avg       0.74      0.68      0.68      1047
weighted avg       0.74      0.74      0.72      1047



In [18]:
# Inference on new data
new_reviews = [
    "I absolutely love this movie! It was amazing.",
    "This movie was terrible, I hated every second of it.", 
    "while this movie is not intended for everyone, it is good for someone has no brain", 
    "let's watch it only when it is free to watch, i will not pay for it",
    'A worthy contender for the Animated film of 2024', 
    'No plot at all. But if you are looking for a good laugh. You will not find that either.'
]

# The vectorizer was fitted on text that had been cleaned, stop-word filtered
# and lemmatized, so new reviews must go through the same three steps before
# being vectorized; otherwise inflected words miss the learned vocabulary.
new_reviews_df = pd.DataFrame({'review_text': new_reviews})
new_reviews_df = lemmatize(remove_stopwords(preprocess_text(new_reviews_df)))

# NOTE(review): `vect` and `ensemble_model` are whatever the LAST iteration of
# the training loop left bound (n_gram=3 in the recorded run), which is not the
# best-scoring configuration (n_gram=1 in the recorded run) -- confirm this is
# the model you intend to use for inference.
new_reviews_dtm = vect.transform(new_reviews_df['review_text'])
new_predictions = ensemble_model.predict(new_reviews_dtm)

print("New Predictions:", new_predictions)

New Predictions: ['POSITIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE']
