## <center> Sentiment Analysis Of Movie Reviews </center>
### <center> Logistic Regression </center>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
import os

In [2]:
# Load the auto-labelled reviews and keep only the two columns used below:
# the raw review text and its sentiment label.
rating_df = pd.read_csv('data/rating_auto_label_sentiment_two_classes.csv')
rating_df = rating_df[['review_text', 'sentiment']]

# The cell's last expression is what gets displayed: (rows, columns).
rating_df.shape

(10468, 2)

In [3]:
# Discard every row in which at least one column is missing
rating_df = rating_df.dropna(axis=0, how='any')
rating_df.shape

(10462, 2)

In [4]:
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Text pre-processing helpers.
# One shared WordNet lemmatizer instance, used by lemmatize_sentence().
lemmatizer = WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns None when the tag starts with none of these letters.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),   # adjective
        ('V', 'VERB'),  # verb
        ('N', 'NOUN'),  # noun
        ('R', 'ADV'),   # adverb
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Resolve the constant only for the matching prefix, so that
            # `wordnet` is touched exactly when a mapping is needed.
            return getattr(wordnet, attr)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize each token of ``sentence`` and return them joined by spaces.

    The sentence is tokenized and POS-tagged with NLTK. A token whose tag
    maps to a WordNet POS (via nltk_tag_to_wordnet_tag) is lemmatized with
    that POS; a token without a mapping is kept unchanged.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        if wn_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wn_pos))
        else:
            # No WordNet equivalent for this tag: keep the surface form
            lemmas.append(token)
    return " ".join(lemmas)

def preprocess_text(df):
    """Return a copy of ``df`` whose ``review_text`` column is normalized.

    Cleaning steps, applied in this order:
      1. missing values become empty strings, everything is cast to str
      2. lower-casing
      3. URL removal
      4. non-ASCII character removal
      5. punctuation removal
      6. digit removal
      7. whitespace runs collapsed to one space, leading/trailing stripped

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``review_text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    # Work on a copy so re-running the calling cell starts from the same input
    cleaned = df.copy()

    # fillna has to come before astype(str): once cast, a missing value is
    # already the literal text 'nan'/'None' and fillna has nothing to fill.
    text = cleaned['review_text'].fillna('').astype(str)

    # update to lower case
    text = text.str.lower()

    # remove URL -- must run before punctuation is stripped, because that
    # step deletes the '://' the pattern relies on.
    text = text.str.replace(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', regex=True)

    # remove non ascii / special characters
    text = text.str.replace(r'[^\x00-\x7F]+', '', regex=True)

    # remove punctuations
    text = text.str.replace(r'[{}]'.format(re.escape(string.punctuation)), '', regex=True)

    # remove digits
    text = text.str.replace(r'\d+', '', regex=True)

    # normalize white space last, so gaps left by the removals above are
    # collapsed as well
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    cleaned['review_text'] = text
    return cleaned

def remove_stopwords(df):
    """Strip stop words from ``df['review_text']`` in place and return ``df``.

    The stop list is NLTK's English list plus the token 'br'
    (presumably residue of HTML line-break tags -- confirm against the data).
    Each review is split on whitespace and the surviving tokens are
    re-joined with single spaces.
    """
    blocked = set(stopwords.words('english'))
    blocked.add('br')

    def _keep_content_words(text):
        return ' '.join(token for token in text.split() if token not in blocked)

    df['review_text'] = df['review_text'].apply(_keep_content_words)
    return df

def lemmatize(df):
    """Lemmatize every entry of ``df['review_text']`` in place and return ``df``.

    Each review is passed through lemmatize_sentence().
    """
    lemmatized = df['review_text'].apply(lemmatize_sentence)
    df['review_text'] = lemmatized
    return df

# Empty results table; the experiment loop appends one row per fitted model.
df_result = pd.DataFrame(
    columns=[
        'task_no',
        'model',
        'ngram',
        'vectorizer',
        'max_iter',
        'C',
        'gamma',
        'accuracy',
        'wall_time',
        'run_time',
    ]
)
# Running counter used to number each experiment
model_no = 1
# CSV file to which each result row is appended
filename = "result.csv"

In [5]:
# Text preprocessing pipeline: clean -> drop stop words -> lemmatize
rating_df = (
    rating_df
    .pipe(preprocess_text)
    .pipe(remove_stopwords)
    .pipe(lemmatize)
)

rating_df.head(20)

Unnamed: 0,review_text,sentiment
0,let leave door love beetlejuice edward scissor...,NEGATIVE
1,fast paced action thriller delivers begin end ...,POSITIVE
2,excellent movie great cast see movie saw one r...,POSITIVE
3,write three highpraise review try think bad mo...,NEGATIVE
4,well make movie quality write act cinematograp...,POSITIVE
5,true captain america modern comic book version...,POSITIVE
6,movie like wonder otherwise main character wor...,POSITIVE
7,thought safe go hike bush againalong come mick...,NEGATIVE
8,movie great,POSITIVE
9,sheriff freddy heflin stallone ordinary office...,NEGATIVE


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    rating_df['review_text'],
    rating_df['sentiment'],
    test_size=0.2,
    random_state=42,
)
X_test.shape

(2093,)

In [7]:
# Logistic Regression models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time 
from datetime import datetime

n_grams = [1,2,3]
n_vects = ['cbow', 'tfidf']
n_iters = [5000]
n_stop_lemmatize = [False, True]

for n_gram in n_grams:
    for n_vect in n_vects:
        if n_vect=='cbow':
            # Use all features, remove stopwords, apply unigram, bigram, trigram
            vect = CountVectorizer(max_features=None, ngram_range=(1,n_gram))
        else:
            vect = TfidfVectorizer(max_features=None, ngram_range=(1,n_gram))

        # Fit on training data and transform the training data to vector (document-term matrix)
        X_train_dtm = vect.fit_transform(X_train)
        # display(X_train_dtm)

        X_test_dtm = vect.transform(X_test)
        # display(X_test_dtm)

        for n_iter in n_iters:
            # 1. Initialize the LogisticRegression classifier
            logreg =  LogisticRegression(max_iter=n_iter, class_weight='balanced')

            # 2. Train the classifier on the training data & capture wall time
            start_time = time.time()
            %time logreg.fit(X_train_dtm, y_train)
            end_time = time.time()
            wall_time = end_time - start_time

            # 3. Predict and evaluate the classifier
            y_pred = logreg.predict(X_test_dtm)
            accuracy = accuracy_score(y_test, y_pred)

            # 4. Print result
            task_no = str(model_no)
            model = 'LogisticRegression'
            print(f"{task_no} - {model}, text_preprocess: {True}, vectorizer: {n_vect}, ngram: {n_gram}, max_iter: {n_iter}")
            print(f"Accuracy: {accuracy}\n")
            model_no +=1

            # 5. Record result to dataframe, to be exported to csv
            new_row = [task_no, model, n_gram, n_vect, n_iter, '', '', accuracy, wall_time, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
            df_result.loc[len(df_result)] = new_row

            new_row_df = pd.DataFrame([new_row], columns=df_result.columns)
            new_row_df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

CPU times: total: 1.05 s
Wall time: 2.53 s
1 - LogisticRegression, text_preprocess: True, vectorizer: cbow, ngram: 1, max_iter: 5000
Accuracy: 0.7290969899665551

CPU times: total: 46.9 ms
Wall time: 147 ms
2 - LogisticRegression, text_preprocess: True, vectorizer: tfidf, ngram: 1, max_iter: 5000
Accuracy: 0.7501194457716197

CPU times: total: 2.31 s
Wall time: 8.13 s
3 - LogisticRegression, text_preprocess: True, vectorizer: cbow, ngram: 2, max_iter: 5000
Accuracy: 0.7539417104634496

CPU times: total: 1.56 s
Wall time: 3.77 s
4 - LogisticRegression, text_preprocess: True, vectorizer: tfidf, ngram: 2, max_iter: 5000
Accuracy: 0.7649307214524605

CPU times: total: 2.77 s
Wall time: 11 s
5 - LogisticRegression, text_preprocess: True, vectorizer: cbow, ngram: 3, max_iter: 5000
Accuracy: 0.7601528905876732

CPU times: total: 969 ms
Wall time: 3.19 s
6 - LogisticRegression, text_preprocess: True, vectorizer: tfidf, ngram: 3, max_iter: 5000
Accuracy: 0.7606306736741519



In [8]:
from sklearn.metrics import accuracy_score, classification_report

# y_pred holds the predictions of the last model fitted by the loop above
Report = classification_report(y_true=y_test, y_pred=y_pred)
print(Report)

              precision    recall  f1-score   support

    NEGATIVE       0.67      0.70      0.69       777
    POSITIVE       0.82      0.79      0.81      1316

    accuracy                           0.76      2093
   macro avg       0.74      0.75      0.75      2093
weighted avg       0.76      0.76      0.76      2093



In [9]:
# Inference on new data
new_reviews = ['A worthy contender for the Animated film of 2024', 'No plot at all. But if you are looking for a good laugh. You will not find that either.']

# Run the new reviews through the same cleaning, stop-word removal and
# lemmatization that the training text went through, so the fitted
# vectorizer sees tokens in the form it learned its vocabulary from.
new_reviews_df = pd.DataFrame({'review_text': new_reviews})
new_reviews_df = preprocess_text(new_reviews_df)
new_reviews_df = remove_stopwords(new_reviews_df)
new_reviews_df = lemmatize(new_reviews_df)

# vect / logreg are the vectorizer and classifier left over from the last
# iteration of the training loop.
new_reviews_dtm = vect.transform(new_reviews_df['review_text'])
new_predictions = logreg.predict(new_reviews_dtm)

print("New Predictions:", new_predictions)

New Predictions: ['POSITIVE' 'NEGATIVE']
