## <center> Sentiment Analysis Of Movie Reviews </center>

In [None]:
import pandas as pd
import os

In [None]:
# Read the reviews CSV into a DataFrame
rating_df = pd.read_csv('data/rating_auto_label_sentiment_two_classes.csv')

# Keep only the two columns used downstream: the review text and its label
rating_df = rating_df.loc[:, ['review_text', 'sentiment']]

# Quick look at the first rows and the overall (rows, columns) size
rating_df.head(2)
rating_df.shape

In [None]:
# Discard every row that has a missing value in at least one column
rating_df = rating_df.dropna(axis=0, how='any')
rating_df.shape

In [None]:
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Text Pre-processing
# Single WordNetLemmatizer instance shared by lemmatize_sentence() so it is
# created once rather than once per review.
lemmatizer=WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet constant.

    Only the first letter of the tag is inspected: ``J`` maps to adjective,
    ``V`` to verb, ``N`` to noun and ``R`` to adverb.

    Args:
        nltk_tag: POS tag string, e.g. the second element of the tuples
            returned by ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` POS constant, or ``None`` when the tag
        belongs to none of the four categories above.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),   # adjective
        ('V', wordnet.VERB),  # verb
        ('N', wordnet.NOUN),  # noun
        ('R', wordnet.ADV),   # adverb
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def lemmatize_sentence(sentence):
    """Return *sentence* with each token replaced by its lemma.

    The sentence is tokenized and POS-tagged with NLTK; every token whose tag
    maps to a WordNet category (see ``nltk_tag_to_wordnet_tag``) is lemmatized
    with that category, and all other tokens are kept unchanged.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The processed tokens joined with single spaces.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            # A usable POS category is known: lemmatize with it
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No WordNet category for this tag: keep the token verbatim
            lemmas.append(token)
    return " ".join(lemmas)

def preprocess_text(df):
    """Normalize the ``review_text`` column of *df* in place.

    Steps, in order: missing values become empty strings, text is
    lower-cased, ASCII punctuation is deleted, non-ASCII characters are
    deleted, digits are deleted, and finally runs of whitespace are collapsed
    to one space and leading/trailing whitespace is stripped.

    Args:
        df: DataFrame with a ``review_text`` column. The column is
            overwritten; other columns are untouched.

    Returns:
        The same DataFrame object, to allow call chaining.
    """
    # Fill missing values BEFORE casting to str: casting first turns NaN/None
    # into literal text such as 'nan', which would survive as a bogus token.
    text = df['review_text'].fillna('').astype(str)

    # update to lower case
    text = text.str.lower()

    # remove punctuations
    text = text.str.replace(r'[{}]'.format(re.escape(string.punctuation)), '', regex=True)

    # remove special / non-ascii characters (a single pass is sufficient)
    text = text.str.replace(r'[^\x00-\x7F]+', '', regex=True)

    # remove digits
    text = text.str.replace(r'\d+', '', regex=True)

    # Normalize whitespace last: the deletions above can leave doubled or
    # leading/trailing spaces (e.g. "a - b" -> "a  b").
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    df['review_text'] = text
    return df

def remove_stopwords(df):
    """Strip stop words from the ``review_text`` column of *df* in place.

    The stop list is NLTK's English list plus the token ``'br'``
    (presumably the residue of ``<br>`` tags once punctuation is removed —
    confirm against the raw data). Text is split on whitespace and the
    surviving words are re-joined with single spaces.

    Args:
        df: DataFrame with a string ``review_text`` column.

    Returns:
        The same DataFrame object.
    """
    blocked_words = frozenset(stopwords.words('english') + ['br'])

    def _drop_blocked(text):
        return ' '.join(token for token in text.split() if token not in blocked_words)

    df['review_text'] = df['review_text'].apply(_drop_blocked)
    return df

def lemmatize(df):
    """Lemmatize every entry of the ``review_text`` column of *df* in place.

    Each review is passed through ``lemmatize_sentence``.

    Args:
        df: DataFrame with a string ``review_text`` column.

    Returns:
        The same DataFrame object.
    """
    lemmatized_reviews = df['review_text'].apply(lemmatize_sentence)
    df['review_text'] = lemmatized_reviews
    return df

# Empty results table; one row is appended for every trained configuration
df_result = pd.DataFrame(
    columns=[
        'model',
        'task_no',
        'vectorizer',
        'ngram',
        'max_iter',
        'C',
        'gamma',
        'n_estimator',
        'lrate',
        'test_accuracy',
        'wall_time',
        'run_time',
    ]
)

# Running counter used to number the trained configurations
model_no = 1

# CSV file that result rows are appended to
filename = "output/result.csv"

In [None]:
# Clean the reviews: normalize the text, drop stop words, then lemmatize
rating_df = (
    rating_df
    .pipe(preprocess_text)
    .pipe(remove_stopwords)
    .pipe(lemmatize)
)

rating_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# X_train,X_test,y_train,y_test = train_test_split(rating_df.review_text,rating_df.sentiment,test_size = 0.2, random_state=42)

# First split: 80% for training, the remaining 20% held back
X_train, X_temp, y_train, y_temp = train_test_split(
    rating_df['review_text'],
    rating_df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Second split: halve the held-back part into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the size of each partition, one per line
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')

In [None]:
# SVM models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time 
from datetime import datetime

# Hyper-parameter grid for the ensemble experiments
n_vects = ['tfidf']
n_grams = [2, 3]
n_iters = [100000]
n_estimators = [100, 1000]
lrates = [1.0, 0.01, 0.001]
ensemble_models = ['AdaBoost', 'RandomForest']


def _build_ensemble(name, n_estimate, lrate, n_iter):
    """Create an unfitted ensemble classifier for one grid point.

    Args:
        name: 'AdaBoost' (boosted LinearSVC) or 'RandomForest'.
        n_estimate: Number of estimators in the ensemble.
        lrate: AdaBoost learning rate; ignored for RandomForest.
        n_iter: max_iter of the LinearSVC base estimator; ignored for
            RandomForest.

    Raises:
        ValueError: If *name* is not a supported ensemble.
    """
    if name == 'AdaBoost':
        svm_linear = LinearSVC(dual="auto", max_iter=n_iter, class_weight='balanced', random_state=42)
        # LinearSVC has no predict_proba, so the discrete 'SAMME' variant is
        # requested explicitly (as in the original configuration).
        return AdaBoostClassifier(estimator=svm_linear, n_estimators=n_estimate, learning_rate=lrate, random_state=42, algorithm='SAMME')
    if name == 'RandomForest':
        # RandomForestClassifier accepts neither a base estimator nor a
        # learning rate / algorithm; passing them raises TypeError.
        return RandomForestClassifier(n_estimators=n_estimate, random_state=42)
    raise ValueError(f"Unsupported ensemble model: {name!r}")


# Make sure the CSV's directory exists before results are appended to it
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)

for n_gram in n_grams:
    for n_vect in n_vects:
        if n_vect == 'cbow':
            # Raw term counts; all features, unigrams up to n_gram-grams
            vect = CountVectorizer(max_features=None, ngram_range=(1, n_gram), stop_words='english', lowercase=True, strip_accents='ascii')
        else:
            vect = TfidfVectorizer(max_features=None, ngram_range=(1, n_gram), stop_words='english', lowercase=True, strip_accents='ascii')

        # Fit the vocabulary on the training split only, then reuse it to
        # transform the validation and test splits (document-term matrices)
        X_train_dtm = vect.fit_transform(X_train)
        X_val_dtm = vect.transform(X_val)
        X_test_dtm = vect.transform(X_test)

        for ensemble_name in ensemble_models:
            uses_svm = ensemble_name == 'AdaBoost'
            model = 'AdaBoost_LinearSVM' if uses_svm else ensemble_name

            # max_iter and learning rate only apply to AdaBoost(LinearSVC).
            # Other ensembles run once per n_estimate and record '' for them,
            # like the unused 'C' and 'gamma' columns.
            iter_grid = n_iters if uses_svm else ['']
            lrate_grid = lrates if uses_svm else ['']

            for n_iter in iter_grid:
                for n_estimate in n_estimators:
                    for lrate in lrate_grid:
                        ensemble_model = _build_ensemble(ensemble_name, n_estimate, lrate, n_iter)

                        # Train the classifier on the training data & capture wall time
                        start_time = time.perf_counter()
                        ensemble_model.fit(X_train_dtm, y_train)
                        wall_time = time.perf_counter() - start_time

                        # Validation accuracy: the figure to compare configurations on
                        y_val_pred = ensemble_model.predict(X_val_dtm)
                        val_accuracy = accuracy_score(y_val, y_val_pred)

                        # Accuracy on the held-out test split
                        y_test_pred = ensemble_model.predict(X_test_dtm)
                        test_accuracy = accuracy_score(y_test, y_test_pred)

                        # Print result
                        task_no = str(model_no)
                        print(f"{model} - {task_no}, text_preprocess: {True}, vectorizer: {n_vect}, ngram: {n_gram}, max_iter: {n_iter}, n_estimate:{n_estimate}, lrate:{lrate}")
                        print(f"Wall time: {wall_time:.2f}s")
                        print(f"Validation Accuracy: {val_accuracy}")
                        print(f"Test Accuracy: {test_accuracy}\n")
                        model_no += 1

                        # Record result to dataframe, to be exported to csv
                        new_row = [model, task_no, n_vect, n_gram, n_iter, '', '', n_estimate, lrate, test_accuracy, wall_time, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
                        df_result.loc[len(df_result)] = new_row

                        # Append immediately so finished runs survive an interrupted grid;
                        # the header is written only when the file is first created
                        new_row_df = pd.DataFrame([new_row], columns=df_result.columns)
                        new_row_df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class precision / recall / F1 for the most recent test-set predictions
Report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(Report)

In [None]:
# Inference on new data
new_reviews = [
    "I absolutely love this movie! It was amazing.",
    "This movie was terrible, I hated every second of it.",
    "while this movie is not intended for everyone, it is good for someone has no brain",
    "let's watch it only when it is free to watch, i will not pay for it",
    'A worthy contender for the Animated film of 2024',
    'No plot at all. But if you are looking for a good laugh. You will not find that either.'
]

# Apply the same cleaning steps the training text went through (normalize,
# drop stop words, lemmatize); otherwise the tokens seen at inference time
# would not match the vocabulary the vectorizer was fitted on.
new_reviews_df = pd.DataFrame({'review_text': new_reviews})
new_reviews_df = lemmatize(remove_stopwords(preprocess_text(new_reviews_df)))

# Vectorize with the already-fitted vectorizer and predict with the most
# recently trained ensemble
new_reviews_dtm = vect.transform(new_reviews_df['review_text'])
new_predictions = ensemble_model.predict(new_reviews_dtm)

print("New Predictions:", new_predictions)