In [1]:
import re
from datetime import datetime
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources used below: the English stop-word list and the
# WordNet corpus backing WordNetLemmatizer. Packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Dataset
Load the raw metacritic reviews dataset.

In [2]:
# Read the raw Metacritic export. The file is decoded as latin1, and rows the
# parser cannot handle are skipped instead of aborting the load.
df = pd.read_csv(
    "../data/raw/metacritic-reviews.csv",
    on_bad_lines="skip",
    encoding="latin1",
)

# Leave the raw frame untouched; every cleaning step operates on this copy.
clean_df = df.copy()

# Preview the first ten rows.
clean_df.head(10)

Unnamed: 0,Movie name,Release Date,Rating,summary,User rating,Website rating
0,Touch of Evil,1-Feb-58,| PG-13,This film noir portrait of corruption and mora...,8.4,99
1,Seven Samurai,19-Nov-56,| Not Rated,Seven Samurai (Shichinin no samurai) tells the...,8.8,98
2,The Wild Bunch,18-Jun-69,| R,An aging group of outlaws look for one last bi...,7.6,98
3,Au hasard Balthazar,16-Sep-66,| Not Rated,Robert Bresson's 1966 film focuses on the stor...,7.1,98
4,The Lady Vanishes,1-Nov-38,| Approved,"While travelling in continental Europe, a rich...",8.1,98
5,The Treasure of the Sierra Madre,24-Jan-48,| TV-PG,"Fred Dobbs and Bob Curtin, two Americans searc...",8,98
6,Pan's Labyrinth,29-Dec-06,| R,"Following a bloody civil war, young Ofelia ent...",8.6,98
7,Some Like It Hot,29-Mar-59,| Approved,"When two male musicians witness a mob hit, the...",8.3,98
8,North by Northwest,6-Aug-59,| TV-G,A hapless New York advertising executive is mi...,8,98
9,The Rules of the Game,8-Apr-50,| Not Rated,A bourgeois life in France at the onset of Wor...,tbd,98


### Clean Column Names
Replace spaces and remove capitalization to make accessing columns easier.

In [3]:
# Normalize headers to snake_case: trim surrounding whitespace, lowercase,
# then turn inner spaces into underscores.
normalized_columns = clean_df.columns.str.strip()
normalized_columns = normalized_columns.str.lower()
clean_df.columns = normalized_columns.str.replace(" ", "_")

### Handle Unwanted Columns
Drop unnecessary columns (e.g., "release_date", "rating", "user_rating", "website_rating").

In [4]:
# Only the title and summary are used by the remaining cells; discard the rest.
unused_columns = ["release_date", "rating", "user_rating", "website_rating"]
clean_df = clean_df.drop(columns=unused_columns)

### Handle Missing Values
Drop rows that are missing "movie_name" or "summary".

In [5]:
# Report the number of missing values per column, then discard every row that
# lacks either "movie_name" or "summary".
missing_per_column = clean_df.isna().sum()
print(missing_per_column)

required_columns = ["movie_name", "summary"]
clean_df = clean_df.dropna(subset=required_columns)

movie_name    0
summary       1
dtype: int64


### Clean Movie Name

In [6]:
def clean_movie_title(title):
    """Normalize a movie title.

    Removes every parenthesised four-digit year such as "(1999)", trims
    surrounding whitespace and lowercases the result.

    Parameters
    ----------
    title : Any
        Raw title. Non-missing values are converted with ``str`` first.

    Returns
    -------
    str or missing value
        The normalized title, or ``title`` itself when ``pd.isna(title)``
        is true.
    """
    if pd.isna(title):
        return title

    # Substitute a single space (not an empty string) so that a year sitting
    # in the middle of a title does not glue the neighbouring words together.
    # Any space left at either end is removed by strip() below.
    title = re.sub(r"\s*\(\d{4}\)\s*", " ", str(title))

    return title.strip().lower()

# Store the normalized title next to the original for side-by-side comparison.
raw_titles = clean_df["movie_name"]
clean_df["movie_name_clean"] = raw_titles.apply(clean_movie_title)

# Sanity check
clean_df.loc[:, ["movie_name", "movie_name_clean"]].head(10)

Unnamed: 0,movie_name,movie_name_clean
0,Touch of Evil,touch of evil
1,Seven Samurai,seven samurai
2,The Wild Bunch,the wild bunch
3,Au hasard Balthazar,au hasard balthazar
4,The Lady Vanishes,the lady vanishes
5,The Treasure of the Sierra Madre,the treasure of the sierra madre
6,Pan's Labyrinth,pan's labyrinth
7,Some Like It Hot,some like it hot
8,North by Northwest,north by northwest
9,The Rules of the Game,the rules of the game


### Clean Summary
Clean the "summary" column by stripping HTML tags and non-letter characters, lowercasing the text, removing stop words, and lemmatizing the remaining tokens.

In [7]:
# English stop words, held in a set so membership tests in clean_summary are cheap.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every call to clean_summary.
lemmatizer = WordNetLemmatizer()

def clean_summary(s):
    """Reduce a summary to a space-joined string of lemmatized tokens.

    Steps, in order: replace ``<...>`` tag-like spans with a space, replace
    every character that is not an ASCII letter or whitespace with a space,
    lowercase, split on whitespace, drop tokens found in ``stop_words`` and
    lemmatize the rest (no part-of-speech tag is passed, so the lemmatizer's
    default applies).

    Parameters
    ----------
    s : str or missing value
        Raw summary text.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``s`` is missing.
    """
    if pd.isna(s):
        return ""

    without_tags = re.sub(r"<.*?>", " ", s)
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", without_tags)

    kept_tokens = []
    for word in letters_only.lower().split():
        if word in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(word))

    return " ".join(kept_tokens)

# Store the cleaned summary next to the original for side-by-side comparison.
raw_summaries = clean_df["summary"]
clean_df["summary_clean"] = raw_summaries.apply(clean_summary)

# Sanity check
clean_df.loc[:, ["summary", "summary_clean"]].head(10)

Unnamed: 0,summary,summary_clean
0,This film noir portrait of corruption and mora...,film noir portrait corruption morally compromi...
1,Seven Samurai (Shichinin no samurai) tells the...,seven samurai shichinin samurai tell story six...
2,An aging group of outlaws look for one last bi...,aging group outlaw look one last big score tra...
3,Robert Bresson's 1966 film focuses on the stor...,robert bresson film focus story donkey balthaz...
4,"While travelling in continental Europe, a rich...",travelling continental europe rich young playg...
5,"Fred Dobbs and Bob Curtin, two Americans searc...",fred dobbs bob curtin two american searching w...
6,"Following a bloody civil war, young Ofelia ent...",following bloody civil war young ofelia enters...
7,"When two male musicians witness a mob hit, the...",two male musician witness mob hit flee state f...
8,A hapless New York advertising executive is mi...,hapless new york advertising executive mistake...
9,A bourgeois life in France at the onset of Wor...,bourgeois life france onset world war ii rich ...


### Drop and Replace Original Columns with Clean Ones
After performing sanity checks, replace original (messy) columns with clean ones.

In [8]:
# Discard the raw text columns, then give the cleaned columns the original names.
clean_df = (
    clean_df
    .drop(columns=["movie_name", "summary"])
    .rename(columns={"movie_name_clean": "movie_name", "summary_clean": "summary"})
)

### Data Type Conversions
Convert "movie_name" and "summary" to strings.

In [9]:
# Convert both text columns ("movie_name" and "summary") to the pandas string dtype.
for text_column in ("movie_name", "summary"):
    clean_df[text_column] = clean_df[text_column].astype("string")

### Save Cleaned Data
Export the clean metacritic reviews data.

In [10]:
# Write the cleaned frame without the index column. The destination folder is
# created first because to_csv does not create missing directories, which would
# make this cell fail on a fresh checkout where data/processed does not exist.
output_path = Path("../data/processed/metacritic_reviews_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(output_path, index=False)