### Classification of Movie Sentiment using NLP and Machine Learning

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Union, Tuple 
import re

In [2]:
# nltk.download('punkt') # tokenizer
# nltk.download('wordnet') # lexical database for the English language
# nltk.download('omw-1.4')  # Open Multilingual Wordnet (all languages)

# conda install -c conda-forge install spacy
# !python -m spacy download en_core_web_lg
# python -m spacy download en_core_web_md

nlp = spacy.load("en_core_web_lg")

In [4]:
def synthetic_reviews(num:int) -> tuple:
    example_reviews = [
        "A journey filled with unexpected challenges and awe-inspiring moments.",
        "A story about relationships and the intricate dynamics between characters in a quaint setting.",
        "An exploration of daily life with quirky behaviors and interactions.",
        "A tale that keeps the tension high with its eerie atmosphere and surprising twists.",
        "A gripping narrative filled with unexpected turns and edge-of-the-seat moments.",
        "An emotional story that delves into complex characters and their heart-wrenching experiences.",
        "An adventure set in a world full of wonders and enchanting sights.",
        "A heartfelt story of unexpected connections and profound emotions.",
        "A high-energy experience with intense sequences and spectacular displays of skill.",
        "A portrayal of everyday scenarios that bring warmth and laughter.",
        "A dark narrative that evokes a sense of fear and suspense with its ominous setting.",
        "An expedition through exotic locales, filled with daring feats and memorable encounters.",
        "A depiction of familial bonds and personal sacrifice, told with sensitivity and depth.",
        "An entertaining story with sharp dialogue and clever situations.",
        "A suspenseful narrative that keeps you on the edge with its clever plot and revelations.",
        "A journey through an imaginative world filled with extraordinary beings."
    ]
    
    example_genres = [
        ["Action", "Adventure", "Fantasy"],
        ["Romance", "Drama"],
        ["Comedy"],
        ["Horror", "Thriller"],
        ["Thriller"],
        ["Drama"],
        ["Fantasy"],
        ["Romance"],
        ["Action"],
        ["Comedy"],
        ["Horror"],
        ["Adventure"],
        ["Drama"],
        ["Comedy"],
        ["Thriller"],
        ["Fantasy"]
    ]
    
    reviews, genres = [], []
    
    for _ in range(num):
        idx = random.randint(0, len(example_reviews) - 1)
        reviews.append(example_reviews[idx])
        genres.append(example_genres[idx])
    return reviews, genres

def remove_punctuation(text: str) -> str:
    return re.sub(r'[^\w\s]', '', text)

def tokenize_input(input_list: list) -> list:
    tokenized = [word_tokenize(item) for item in input_list]
    return tokenized

# Function to find key words (nouns, verbs, adjectives, adverbs)
def find_keywords(text: str) -> List[str]:
    doc = nlp(text)
    keywords = [token.text for token in doc if token.pos_ in {'NOUN', 'VERB', 'ADJ', 'ADV'}]
    return keywords

# Function to get synonyms using WordNet and spaCy
def get_synonyms(word: str, context: str) -> List[str]:
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    # Filter synonyms based on context
    doc, token = nlp(context), nlp(word)
    filtered_synonyms = [syn for syn in synonyms if nlp(syn).similarity(token) > 0.5]
    return list(filtered_synonyms)

# Augmentation methods
def synonym_replacement(input_str: str, n: int = 2) -> str:
    input_str = remove_punctuation(input_str)
    words = word_tokenize(input_str)
    new_words = words.copy()
    keywords = find_keywords(input_str)
    random.shuffle(keywords)
    num_replaced = 0
    for random_word in keywords:
        synonyms = get_synonyms(random_word, input_str)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def random_insertion(input_str: str, n: int = 2) -> str:
    input_str = remove_punctuation(input_str)
    words = word_tokenize(input_str)
    for _ in range(n):
        new_synonyms = []
        while len(new_synonyms) == 0:
            keywords = find_keywords(input_str)
            if not keywords:
                break
            random_word = random.choice(keywords)
            new_synonyms = get_synonyms(random_word, input_str)
        if new_synonyms:
            random_synonym = random.choice(new_synonyms)
            random_index = random.randint(0, len(words) - 1)
            words.insert(random_index, random_synonym)
    return ' '.join(words)

def random_deletion(input_str: str, p: float = 0.2) -> str:
    input_str = remove_punctuation(input_str)
    words = word_tokenize(input_str)
    if len(words) == 1:
        return input_str
    keywords = find_keywords(input_str)
    new_words = [word for word in words if (word not in keywords or random.uniform(0, 1) > p)]
    if len(new_words) == 0:
        return random.choice(words)
    return ' '.join(new_words)


def random_swap(input_str: str, n: int = 2) -> str:
    input_str = remove_punctuation(input_str)
    words = word_tokenize(input_str)
    keywords = find_keywords(input_str)
    for _ in range(n):
        if len(keywords) < 2:
            break
        idx1, idx2 = random.sample(range(len(words)), 2)
        if words[idx1] in keywords and words[idx2] in keywords:
            words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# Main augmentation function
def augment_data(input_list: List[str], augment_factor: int = 4) -> List[str]:
    augmented_data = []
    for input_str in input_list:
        augmented_data.append(input_str)
        for _ in range(augment_factor):
            # augmented_data.append(synonym_replacement(input_str=input_str))
            # augmented_data.append(random_swap(input_str=input_str))
            # augmented_data.append(random_deletion(input_str=input_str))
            augmented_data.append(random_insertion(input_str=input_str))
    return augmented_data

# small demo
reviews = [
    "A journey filled with unexpected challenges and awe-inspiring moments.",
    "A story about relationships and the intricate dynamics between characters in a quaint setting."
]

test_reviews = augment_data(input_list=reviews, augment_factor=2)

print(f'\n{test_reviews[0]}\n{test_reviews[1]}')
print(f'\n{test_reviews[2]}\n{test_reviews[3]}')

  filtered_synonyms = [syn for syn in synonyms if nlp(syn).similarity(token) > 0.5]



A journey filled with unexpected challenges and awe-inspiring moments.
A journey filled with moment unexpected challenges and journeying aweinspiring moments

fill A journey filled with fill unexpected challenges and aweinspiring moments
A story about relationships and the intricate dynamics between characters in a quaint setting.


In [4]:
# generate synthetic reviews
num_reviews = 50
reviews, genres = synthetic_reviews(num=num_reviews)
reviews = augment_data(input_list=reviews, augment_factor=2)

# unique genres
unique_genres = sorted(set(genre for sublist in genres for genre in sublist))
print(f"Unique genres: {unique_genres}")

review_keywords = [find_keywords(review) for review in reviews]

# tokenize reviews
tokenized_reviews = tokenize_input(input_list=reviews)
# print(tokenized_reviews)

genre_descriptions = {
    "Action": "A genre of film that emphasizes physical feats, including fights, chases, explosions, and stunts.",
    "Adventure": "A genre that involves exploration, travel, and often includes a journey or quest.",
    "Fantasy": "A genre that features magical elements, mythical creatures, and fantastical worlds.",
    "Romance": "A genre focused on love stories and romantic relationships.",
    "Drama": "A genre that explores realistic characters, emotional themes, and intense character development.",
    "Comedy": "A genre designed to entertain and amuse, often through humor and satire.",
    "Horror": "A genre intended to scare, shock, or disgust, often featuring supernatural elements or monsters.",
    "Thriller": "A genre characterized by suspense, tension, and excitement, often involving crime or espionage."
}

# tokenize genres desc
tokenized_genre_descriptions = {genre: word_tokenize(description.lower()) for genre, description in genre_descriptions.items()}
# print(tokenized_genre_descriptions)

genre_keywords = {genre: find_keywords(description) for genre, description in genre_descriptions.items()}
binary_vectors = []

for true_genres in genres:
    binary_vector = [1 if genre in true_genres else 0 for genre in unique_genres]
    binary_vectors.append(binary_vector)

# Create DataFrame
data = {
    'Review': reviews,
    'Review Keywords': review_keywords,
    'Tokenized Reviews': tokenized_reviews,
}

data_review = data.copy()

# put most common keywords in the genre description
for genre in unique_genres:
    data[f'{genre} Keywords'] = [[keyword for keyword in genre_keywords[genre] if keyword in review] for review in reviews]

binary_df = pd.DataFrame(binary_vectors, columns=list(unique_genres))
# display(binary_df)

df_review = pd.DataFrame(data_review)
df = pd.DataFrame(data)
# display(df)

combined_df = pd.concat([df_review, binary_df], axis=1)
display(combined_df)

  filtered_synonyms = [syn for syn in synonyms if nlp(syn).similarity(token) > 0.5]


Unique genres: ['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Horror', 'Romance', 'Thriller']


Unnamed: 0,Review,Review Keywords,Tokenized Reviews,Action,Adventure,Comedy,Drama,Fantasy,Horror,Romance,Thriller
0,"An expedition through exotic locales, filled w...","[expedition, exotic, locales, filled, daring, ...","[An, expedition, through, exotic, locales, ,, ...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,An expedition through exotic locales filled wi...,"[expedition, exotic, locales, filled, feat, da...","[An, expedition, through, exotic, locales, fil...",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,An expedition through exotic locales filled wi...,"[expedition, exotic, locales, filled, daring, ...","[An, expedition, through, exotic, locales, fil...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,An emotional story that delves into complex ch...,"[emotional, story, delves, complex, characters...","[An, emotional, story, that, delves, into, com...",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,An emotional story that delves into complex ch...,"[emotional, story, delves, complex, character,...","[An, emotional, story, that, delves, into, com...",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
145,A portrayal laugh portraying of everyday scena...,"[portrayal, laugh, portraying, everyday, scena...","[A, portrayal, laugh, portraying, of, everyday...",,,,,,,,
146,A portrayal of unremarkable everyday scenarios...,"[portrayal, unremarkable, everyday, scenarios,...","[A, portrayal, of, unremarkable, everyday, sce...",,,,,,,,
147,An exploration of daily life with quirky behav...,"[exploration, daily, life, quirky, behaviors, ...","[An, exploration, of, daily, life, with, quirk...",,,,,,,,
148,An behavior exploration of daily life with fun...,"[behavior, exploration, daily, life, fundament...","[An, behavior, exploration, of, daily, life, w...",,,,,,,,


In [5]:
# seperate dataframes
data_genre = {
    'Genre': unique_genres,
    'Genre Keywords': [genre_keywords[genre] for genre in unique_genres],
    'Tokenized Genre Descriptions': [tokenized_genre_descriptions[genre] for genre in unique_genres],
}
genre_df = pd.DataFrame(data_genre)
display(genre_df)

genre_df = pd.DataFrame(genre_descriptions.items(), columns=['Genre', 'Genre Description'])

genre_df['Tokenized Description'] = genre_df['Genre Description'].apply(lambda x: word_tokenize(x.lower()))
# display(genre_df)
review_df = pd.DataFrame(reviews, columns=['Review'])
review_df['Tokenized Review'] = tokenized_reviews
pd.set_option('display.max_colwidth', None) # 
# display(review_df)

Unnamed: 0,Genre,Genre Keywords,Tokenized Genre Descriptions
0,Action,"[genre, film, emphasizes, physical, feats, inc...","[a, genre, of, film, that, emphasizes, physica..."
1,Adventure,"[genre, involves, exploration, travel, often, ...","[a, genre, that, involves, exploration, ,, tra..."
2,Comedy,"[genre, designed, entertain, often, humor, sat...","[a, genre, designed, to, entertain, and, amuse..."
3,Drama,"[genre, explores, realistic, characters, emoti...","[a, genre, that, explores, realistic, characte..."
4,Fantasy,"[genre, features, magical, elements, mythical,...","[a, genre, that, features, magical, elements, ..."
5,Horror,"[genre, intended, scare, shock, disgust, often...","[a, genre, intended, to, scare, ,, shock, ,, o..."
6,Romance,"[genre, focused, love, stories, romantic, rela...","[a, genre, focused, on, love, stories, and, ro..."
7,Thriller,"[genre, characterized, suspense, tension, exci...","[a, genre, characterized, by, suspense, ,, ten..."


---


### Visualizations

In [6]:
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx

1. Baic Bar Chart

In [7]:
def extract_column(df: pd.DataFrame, colname:str) -> List[str]:
    if colname not in df.columns:
        raise ValueError(f"Column '{colname}' not found in DataFrame")
    return df[colname].tolist()

def calculate_class_prob(df:pd.DataFrame, keyword_col:pd.DataFrame, class_cols:List[str]) -> pd.DataFrame:
    # Check if the keyword column and class columns exist in the DataFrame
    keyword_class_counts = {cls: {} for cls in class_cols}

    for _, row in df.iterrows():
        keywords = row[keyword_col]
        for key in keywords:
            for cls in class_cols:
                if row[cls] == 1:
                    if key not in keyword_class_counts[cls]:
                        keyword_class_counts[cls][key] = 0
                    keyword_class_counts[cls][key] += 1

    # Calculate probabilities
    prob = {cls: {} for cls in class_cols}
    for cls in class_cols:
        total_class_items = df[cls].sum()
        for key, count in keyword_class_counts[cls].items():
            prob[cls][key] = count / total_class_items

    # Convert to DataFrame
    prob_df = pd.DataFrame(prob).fillna(0).reset_index()
    prob_df = prob_df.melt(id_vars=['index'], var_name='Class', value_name='Probability')
    prob_df.columns = ['Keyword', 'Class', 'Probability']
    return prob_df


def freq_dist(df: pd.DataFrame, colname: str) -> pd.DataFrame:
    data = extract_column(df=df, colname=colname)
    all_data = [item for sublist in data for item in sublist]
    freq_df = pd.Series(all_data).value_counts().reset_index()
    freq_df.columns = ['Word', 'Frequency']
    return freq_df

def plot_keyword_freq(df: pd.DataFrame, colname: str, title: str, top_n: int = 35) -> None:
    df = df.head(top_n)  # Select top N keywords by frequency
    plt.figure(figsize=(12, 8))  # Increase plot size
    sns.barplot(x='Frequency', y='Word', data=df, palette='viridis')
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('Keyword')
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.show()


def plot_class_prob(df: pd.DataFrame, title: str, top_n: int = 20) -> None:
    df = df.sort_values(by='Probability', ascending=False).head(top_n)
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Probability', y='Keyword', hue='Class', data=df, palette='viridis')
    plt.title(title)
    plt.xlabel('Probability')
    plt.ylabel('Keyword')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(title='Class')
    plt.show()

# working dfs
freq_df = freq_dist(df=df_review, colname='Review Keywords')

keyword_col = 'Review Keywords'

prob_df = calculate_class_prob(df=df_review, keyword_col, class_cols=unique_genres)
# plot_class_prob(df=prob_df, title='Class Probabilities Based on Keywords', top_n=20)
display(df_review[keyword_col].to_list())

# working plots 
# plot_keyword_freq(df=freq_df, colname='Review Keywords', title='Review Keywords Frequency Distribution')

SyntaxError: positional argument follows keyword argument (641676538.py, line 69)

### WORKING: SNS Barplot, Freq Dist,

In [None]:
# seperate dataframes
data_genre = {
    'Genre': unique_genres,
    'Genre Keywords': [genre_keywords[genre] for genre in unique_genres],
    'Tokenized Genre Descriptions': [tokenized_genre_descriptions[genre] for genre in unique_genres],
}
genre_df = pd.DataFrame(data_genre)
display(genre_df)

genre_df = pd.DataFrame(genre_descriptions.items(), columns=['Genre', 'Genre Description'])

genre_df['Tokenized Description'] = genre_df['Genre Description'].apply(lambda x: word_tokenize(x.lower()))
# display(genre_df)
review_df = pd.DataFrame(reviews, columns=['Review'])
review_df['Tokenized Review'] = tokenized_reviews
pd.set_option('display.max_colwidth', None) # 
# display(review_df)

Unnamed: 0,Genre,Genre Keywords,Tokenized Genre Descriptions
0,Action,"[genre, film, emphasizes, physical, feats, including, fights, chases, explosions, stunts]","[a, genre, of, film, that, emphasizes, physical, feats, ,, including, fights, ,, chases, ,, explosions, ,, and, stunts, .]"
1,Adventure,"[genre, involves, exploration, travel, often, includes, journey, quest]","[a, genre, that, involves, exploration, ,, travel, ,, and, often, includes, a, journey, or, quest, .]"
2,Comedy,"[genre, designed, entertain, often, humor, satire]","[a, genre, designed, to, entertain, and, amuse, ,, often, through, humor, and, satire, .]"
3,Drama,"[genre, explores, realistic, characters, emotional, themes, intense, character, development]","[a, genre, that, explores, realistic, characters, ,, emotional, themes, ,, and, intense, character, development, .]"
4,Fantasy,"[genre, features, magical, elements, mythical, creatures, fantastical, worlds]","[a, genre, that, features, magical, elements, ,, mythical, creatures, ,, and, fantastical, worlds, .]"
5,Horror,"[genre, intended, scare, shock, disgust, often, featuring, supernatural, elements, monsters]","[a, genre, intended, to, scare, ,, shock, ,, or, disgust, ,, often, featuring, supernatural, elements, or, monsters, .]"
6,Romance,"[genre, focused, love, stories, romantic, relationships]","[a, genre, focused, on, love, stories, and, romantic, relationships, .]"
7,Thriller,"[genre, characterized, suspense, tension, excitement, often, involving, crime, espionage]","[a, genre, characterized, by, suspense, ,, tension, ,, and, excitement, ,, often, involving, crime, or, espionage, .]"
