## <center> Sentiment Analysis Of Movie Reviews </center>

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
import os

In [9]:
# Load data
rating_df = pd.read_csv('data/rating_auto_label_sentiment_two_classes.csv')

# drop unused columns
rating_df = rating_df [['review_text','sentiment']]
rating_df.head(2)
rating_df.shape

(10468, 2)

In [10]:
# Drop rows with NaN values in any column
rating_df = rating_df.dropna()
rating_df.shape

(10462, 2)

In [11]:
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Text Pre-processing
lemmatizer=WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'): #ADJECTIVE
        return wordnet.ADJ
    elif nltk_tag.startswith('V'): #VERN
        return wordnet.VERB
    elif nltk_tag.startswith('N'): #NOUN        
        return wordnet.NOUN
    elif nltk_tag.startswith('R'): #ADVERB
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    # Tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    # Tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged) 
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # If no tag was found, then use the word as is
            lemmatized_sentence.append(word)
        else:        
            # Else use the tag to lemmatize the word
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def preprocess_text(df):
    df['review_text'] = df['review_text'].astype(str).fillna('')

    # remove white space
    df['review_text'] = df['review_text'].str.strip().str.replace(r'\s+', ' ', regex=True)

    # update to lower case
    df['review_text'] = df['review_text'].str.lower()

    # remove punctuations
    df['review_text'] = df['review_text'].str.replace(r'[{}]'.format(re.escape(string.punctuation)), '', regex=True)

    # remove special characters
    df['review_text'] = df['review_text'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

    # remove digits
    df['review_text'] = df['review_text'].str.replace(r'\d+', '', regex=True)

    # remove non ascii
    df['review_text'] = df['review_text'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
    
    # remove URL
    df['review_text'] = df['review_text'].str.replace(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)
    
    return df

def remove_stopwords(df):
    stop_words = stopwords.words('english') + ['br']
    stopwords_dict = Counter(stop_words)
    df['review_text'] = df['review_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_dict]))

    return df

def lemmatize(df):
    df['review_text'] = df['review_text'].apply(lambda x: lemmatize_sentence(x))

    return df

df_result = pd.DataFrame(columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'n_estimator', 'lrate', 'test_accuracy', 'wall_time','run_time'])
model_no = 1
filename="output/result_DT.csv"

In [12]:
# Text Preprocessing
rating_df = preprocess_text(rating_df)
rating_df = remove_stopwords(rating_df)
rating_df = lemmatize(rating_df)

rating_df.head()

Unnamed: 0,review_text,sentiment
0,let leave door love beetlejuice edward scissor...,NEGATIVE
1,fast paced action thriller delivers begin end ...,POSITIVE
2,excellent movie great cast see movie saw one r...,POSITIVE
3,write three highpraise review try think bad mo...,NEGATIVE
4,well make movie quality write act cinematograp...,POSITIVE


In [13]:
from sklearn.model_selection import train_test_split

# X_train,X_test,y_train,y_test = train_test_split(rating_df.review_text,rating_df.sentiment,test_size = 0.2, random_state=42)

# 80% training, 20% temporary
X_train, X_temp, y_train, y_temp = train_test_split(rating_df.review_text, rating_df.sentiment, test_size=0.2, random_state=42)

# 10% validation, 10% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(8369,)
(1046,)
(1047,)


In [None]:
# Decision Tree Model

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time 
from datetime import datetime

from sklearn.tree import _tree

vect = TfidfVectorizer(max_features=None, ngram_range=(1,3), stop_words='english', lowercase=True, strip_accents='ascii')

# Fit on training data and transform the training data to vector (document-term matrix)
X_train_dtm = vect.fit_transform(X_train)
# display(X_train_dtm)

X_val_dtm = vect.transform(X_val)
# display(X_val_dtm)

X_test_dtm = vect.transform(X_test)
# display(X_test_dtm)

# Train the full decision tree without pruning constraints
full_tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
full_tree.fit(X_train_dtm, y_train)

# Function to prune the tree
def prune_tree(tree, alpha):
    # Get the tree structure
    tree_ = tree.tree_
    # Initialize a list to store nodes to prune
    nodes_to_prune = []

    # Traverse the tree and mark nodes for pruning
    def traverse(node):
        if tree_.children_left[node] != _tree.TREE_LEAF:
            traverse(tree_.children_left[node])
            traverse(tree_.children_right[node])
            # Prune if the node's impurity decrease is less than alpha
            if tree_.impurity[node] < alpha:
                nodes_to_prune.append(node)

    traverse(0)

    # Prune the marked nodes
    for node in nodes_to_prune:
        tree_.children_left[node] = _tree.TREE_LEAF
        tree_.children_right[node] = _tree.TREE_LEAF

# Apply post-pruning with a chosen alpha value
alpha = 0.01 
prune_tree(full_tree, alpha)

# Evaluate the pruned tree on the validation set
y_val_pred = full_tree.predict(X_val_dtm)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy after Pruning: {val_accuracy}")

# Final evaluation on the test set
y_test_pred = full_tree.predict(X_test_dtm)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy after Pruning: {test_accuracy}")

Validation Accuracy after Pruning: 0.624282982791587
Test Accuracy after Pruning: 0.6341929321872015


In [15]:
# Print the classification report of the best model
from sklearn.metrics import accuracy_score, classification_report

if best_y_test_pred is not None:
    print("The best model:", best_model)
    print("The best accuracy:", best_accuracy)

    Report=classification_report(y_test,best_y_test_pred)
    print("Classification Report of the Best Model:\n")
    print(Report)

NameError: name 'best_y_test_pred' is not defined

In [None]:
# Inference on new data
# new_reviews = ['A worthy contender for the Animated film of 2024', 'No plot at all. But if you are looking for a good laugh. You will not find that either.']
new_reviews = [
    "I absolutely love this movie! It was amazing.",
    "This movie was terrible, I hated every second of it.", 
    "while this movie is not intended for everyone, it is good for someone has no brain", 
    "let's watch it only when it is free to watch, i will not pay for it",
    'A worthy contender for the Animated film of 2024', 
    'No plot at all. But if you are looking for a good laugh. You will not find that either.'
]

new_reviews_dtm = vect.transform(new_reviews)
new_predictions = decision_tree.predict(new_reviews_dtm)

print("New Predictions:", new_predictions)

New Predictions: ['POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE']
