# __Movie Sentiment Analysis__

In [1]:
import pandas as pd
import numpy as np
import re
import contractions
import nltk
import contractions
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import text_preprocessing as tp

from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from flashtext import KeywordProcessor


warnings.filterwarnings("ignore")

The movie details and reviews come from the IMDB dataset published on Kaggle: https://www.kaggle.com/datasets/raynardj/imdb-vision-and-nlp/data

In [2]:
# Load the movie metadata and preview the first rows.
movies_df = pd.read_csv("./dataset/movies.csv")
movies_df.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,local_image_path
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation;Adventure;Comedy,images/114709_.jpg
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action;Adventure;Family,images/113497_.jpg
2,113277,http://www.imdb.com/title/tt113277,Heat (1995),8.2,Action;Crime;Drama,images/113277_.jpg
3,114319,http://www.imdb.com/title/tt114319,Sabrina (1995),6.3,Comedy;Drama,images/114319_.jpg
4,114576,http://www.imdb.com/title/tt114576,Sudden Death (1995),5.7,Action;Crime;Thriller,images/114576_.jpg


In [3]:
# Load the user reviews and preview the first rows.
reviews_df = pd.read_csv("./dataset/reviews.csv")
reviews_df.head()

Unnamed: 0,review_id,reviewer,movie,rating,review_summary,review_date,spoiler_tag,review_detail,helpful
0,rw1133942,OriginalMovieBuff21,Kill Bill: Vol. 2 (2004),8.0,Good a$$ follow up :) that <3 answers all the ...,24 July 2005,0,"After seeing Tarantino's Kill Bill Vol: 1, I g...","['0', '1']"
1,rw1133959,lost-in-limbo,Feardotcom (2002),3.0,"""I couldn't make much sense of it myself"". Too...",24 July 2005,0,There's a Website called FearDotCom and anyone...,"['1', '4']"
2,rw1133985,NateManD,Persona (1966),10.0,Persona gives me all the reasons to love art-h...,24 July 2005,0,"Long before ""Muholland Drive"" there was anothe...","['9', '23']"


In [4]:
# Normalise the movie-info column names (no spaces, lower case). "Title" becomes
# "movie" so that it matches the column name used in the reviews dataframe.
# NOTE(review): "imbd_link" looks like a typo of "imdb_link"; it is kept as-is
# because the column-dropping cell refers to the column by this exact name.
movies_df = movies_df.rename(
    columns={
        "Imdb Link": "imbd_link",
        "Title": "movie",
        "IMDB Score": "imdb_score",
        "Genre": "genre",
    }
)
print(movies_df.columns)

Index(['imdbId', 'imbd_link', 'movie', 'imdb_score', 'genre',
       'local_image_path'],
      dtype='object')


In [5]:
# Combine movie info and reviews into one dataframe, joining on the shared
# "movie" column (inner join: only titles present in both frames are kept).
movie_reviews_raw_df = movies_df.merge(reviews_df, on="movie")
movie_reviews_raw_df.head()

Unnamed: 0,imdbId,imbd_link,movie,imdb_score,genre,local_image_path,review_id,reviewer,rating,review_summary,review_date,spoiler_tag,review_detail,helpful
0,295254,http://www.imdb.com/title/tt295254,Feardotcom (2002),3.3,Crime;Horror;Thriller,images/295254_.jpg,rw1133959,lost-in-limbo,3.0,"""I couldn't make much sense of it myself"". Too...",24 July 2005,0,There's a Website called FearDotCom and anyone...,"['1', '4']"
1,60827,http://www.imdb.com/title/tt60827,Persona (1966),8.1,Drama;Thriller,images/60827_.jpg,rw1133985,NateManD,10.0,Persona gives me all the reasons to love art-h...,24 July 2005,0,"Long before ""Muholland Drive"" there was anothe...","['9', '23']"
2,378194,http://www.imdb.com/title/tt378194,Kill Bill: Vol. 2 (2004),8.0,Action;Crime;Drama,images/378194_.jpg,rw1133942,OriginalMovieBuff21,8.0,Good a$$ follow up :) that <3 answers all the ...,24 July 2005,0,"After seeing Tarantino's Kill Bill Vol: 1, I g...","['0', '1']"


In [6]:
# Keep only the columns needed for the analysis; everything listed here is discarded.
del_columns = [
    'imdbId',
    'imbd_link',
    'imdb_score',
    'local_image_path',
    'review_id',
    'reviewer',
    'spoiler_tag',
    'helpful',
]

movie_reviews_filtered_df = movie_reviews_raw_df.drop(columns=del_columns)
movie_reviews_filtered_df.head()

Unnamed: 0,movie,genre,rating,review_summary,review_date,review_detail
0,Feardotcom (2002),Crime;Horror;Thriller,3.0,"""I couldn't make much sense of it myself"". Too...",24 July 2005,There's a Website called FearDotCom and anyone...
1,Persona (1966),Drama;Thriller,10.0,Persona gives me all the reasons to love art-h...,24 July 2005,"Long before ""Muholland Drive"" there was anothe..."
2,Kill Bill: Vol. 2 (2004),Action;Crime;Drama,8.0,Good a$$ follow up :) that <3 answers all the ...,24 July 2005,"After seeing Tarantino's Kill Bill Vol: 1, I g..."


In [7]:
# Turn the ";"-separated genre string into a list of genre names.
movie_reviews_filtered_df['genre'] = movie_reviews_filtered_df['genre'].map(
    lambda genre_text: genre_text.split(";")
)
movie_reviews_filtered_df['genre'].head()

0    [Crime, Horror, Thriller]
1            [Drama, Thriller]
2       [Action, Crime, Drama]
Name: genre, dtype: object

In [8]:
# Titles end with the release year in parentheses, e.g. "Heat (1995)".
# Move that suffix into its own integer column and strip it from the title.
pattern_year = r'\((\d{4})\)$'

movie_reviews_filtered_df['release_year'] = (
    movie_reviews_filtered_df['movie']
    .str.extract(pattern_year, expand=False)
    .astype(int)
)
movie_reviews_filtered_df['movie'] = (
    movie_reviews_filtered_df['movie']
    .str.replace(pattern_year, '', regex=True)
    .str.strip()
)
movie_reviews_filtered_df[['movie', 'release_year']].head()

Unnamed: 0,movie,release_year
0,Feardotcom,2002
1,Persona,1966
2,Kill Bill: Vol. 2,2004


In [9]:
# Print column dtypes and non-null counts as a sanity check after the transforms above.
movie_reviews_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   movie           3 non-null      object 
 1   genre           3 non-null      object 
 2   rating          3 non-null      float64
 3   review_summary  3 non-null      object 
 4   review_date     3 non-null      object 
 5   review_detail   3 non-null      object 
 6   release_year    3 non-null      int32  
dtypes: float64(1), int32(1), object(5)
memory usage: 180.0+ bytes


## Exploratory Data Analysis

In [10]:
# Count missing values per column, smallest count first.
movie_reviews_filtered_df.isna().sum().sort_values()

movie             0
genre             0
rating            0
review_summary    0
review_date       0
review_detail     0
release_year      0
dtype: int64

In [11]:
# Replace missing review summaries/details with empty strings, then confirm
# that no missing values remain in those two columns.
movie_reviews_filtered_df[['review_summary', 'review_detail']] = (
    movie_reviews_filtered_df[['review_summary', 'review_detail']].fillna(value='')
)
movie_reviews_filtered_df[['review_summary', 'review_detail']].isna().sum()

review_summary    0
review_detail     0
dtype: int64

In [12]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numerical columns of the dataframe.
movie_reviews_filtered_df.describe()

Unnamed: 0,rating,release_year
count,3.0,3.0
mean,7.0,1990.666667
std,3.605551,21.385353
min,3.0,1966.0
25%,5.5,1984.0
50%,8.0,2002.0
75%,9.0,2003.0
max,10.0,2004.0


__For Text Pre-processing:__ 
- Remove non-grammatical text like emails and URLs
- Replace emojis with English word/s
- Handle contractions
- Replace slang words

In [13]:
# Lookup objects built once here and reused by initial_text_preprocessing for
# every piece of text; both are used through their .replace_keywords() method.
# NOTE(review): presumably flashtext KeywordProcessor instances — confirm in text_preprocessing.
emoji_dict = tp.get_emojis()
# NOTE(review): the helper name suggests this scrapes a web page on each run — confirm,
# and consider caching the result if so.
slang_word_dict = tp.webscrape_slang_words()

def initial_text_preprocessing(text):
    """Run the first-pass clean-up on a single piece of text.

    The steps are applied in this order:

    1. remove e-mail addresses and hyperlinks,
    2. replace emojis using ``emoji_dict``,
    3. expand contractions,
    4. replace slang words using ``slang_word_dict``.

    Parameters
    ----------
    text : str
        The text to clean. Non-string values (e.g. ``None`` or ``NaN``) are
        returned unchanged.

    Returns
    -------
    str
        The cleaned text. This is best-effort: if any step raises, the error
        and the original input are printed and the text as processed up to the
        failing step is returned.
    """
    # Nothing to clean for missing / non-text values; return them untouched
    # instead of letting a helper fail on them and printing a spurious error.
    if not isinstance(text, str):
        return text

    # Keep the untouched input so the error report shows what was actually
    # passed in, not a partially processed intermediate.
    original_text = text

    try:
        # Remove non-grammatical text
        text = tp.remove_email_address(text)
        text = tp.remove_hyperlink(text)

        # Replace emojis with English word/s
        text = emoji_dict.replace_keywords(text)

        # Handle contractions
        text = text.replace('İ', 'I')   # to handle errors thrown when fixing the contractions
        text = contractions.fix(text)

        # Replace slang words
        text = slang_word_dict.replace_keywords(text)

    except Exception as err:
        # Deliberately broad: one bad review must not abort the whole column.
        print(f"ERROR: {err}")
        print(f"Input Text: {original_text}")

    return text


# Columns holding free text that should go through the initial clean-up.
text_prep_columns = ['movie', 'review_summary', 'review_detail']

# Apply the clean-up to every value of the selected columns. Column-wise
# Series.map is the element-wise equivalent of DataFrame.applymap, which is
# deprecated in recent pandas releases; this form works on old and new versions.
movie_reviews_filtered_df[text_prep_columns] = movie_reviews_filtered_df[text_prep_columns].apply(
    lambda column: column.map(initial_text_preprocessing)
)
# Preview only the first rows rather than displaying the whole dataframe.
movie_reviews_filtered_df[text_prep_columns].head(5)

Unnamed: 0,movie,review_summary,review_detail
0,Feardotcom,"""I could not make much sense of it myself"". To...",There is a Website called FearDotCom and anyon...
1,Persona,Persona gives me all the reasons to love art-h...,"Long before ""Muholland Drive"" there was anothe..."
2,Kill Bill: Vol. 2,Good ass follow up happy face that heart answe...,"After seeing Tarantino's Kill Bill Vol: 1, I g..."


## Model Creation

## Model Evaluation

## References
- https://medium.com/analytics-vidhya/text-preprocessing-nlp-basics-430d54016048
- https://towardsdatascience.com/how-to-preprocess-social-media-data-and-text-messages-b011efacf74
- https://towardsdatascience.com/5-lesser-known-python-libraries-for-your-next-nlp-project-ff13fc652553
- https://medium.com/coinmonks/remaking-of-shortened-sms-tweet-post-slangs-and-word-contraction-into-sentences-nlp-7bd1bbc6fcff
- https://sunscrapers.com/blog/9-best-python-natural-language-processing-nlp/