In [75]:
import re
import nltk
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the corpora behind `stopwords` and `WordNetLemmatizer` are available locally.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Dataset
Load the raw Letterboxd reviews dataset.

In [76]:
# Load dataset
# NOTE(review): the file is decoded as latin1; the "â??" / "Â½" / "Ã¡" sequences in the
# preview below look like UTF-8 text decoded with the wrong codec — confirm the file's real
# encoding before changing this, since fix_rating matches on those exact sequences.
df = pd.read_csv("../data/raw/letterboxd-reviews.csv", encoding='latin1')

# Work on a copy
clean_df = df.copy()

clean_df.head(10)

Unnamed: 0,Movie name,Release Year,Rating,Reviewer name,Review date,Review,Comment count,Like count
0,Aftersun (2022),2022,â??â??â??â??Â½,Tuomas,12-Jan-20,This review may contain spoilers.,130,"22,44 6 likes"
1,Joker (2019),2019,â??â??â??â??â??,Joao,20-Dec-22,if youâ??ve never swam in the ocean then of co...,1.8K,"22,032 likes"
2,Puss in Boots: The Last Wish (2022),2022,â??Â½,NicoPico,15-Sep-22,Puss in Boots: Into the Pussy-Verse,6 2,"21, 6 6 6 likes"
3,The Banshees of Inisherin (2022),2022,â??â??â??â??â??,Ella Kemp,8-Apr-22,I will NOT leave my donkey outside when Iâ??m sad,,"21, 6 09 likes"
4,Everything Everywhere All at Once (2022),2022,â??â??Â½,CosmonautMarkie,14-Aug-19,Watch it and have fun before film Twitter tell...,355,"20, 6 88 likes"
5,Parasite (2019),2019,â??â??â??â??,Philbert Dy,6 /21/2022,This review may contain spoilers.,169,"20,346 likes"
6,Nope (2022),2022,â??â??â??,tyler,6 / 6 /2022,this movie does for cloud spotting what did fo...,,"20,3 6 8 likes"
7,Thor: Love and Thunder (2022),2022,,24framesofnick,13-Oct-22,Great first draft! Excited to see when itâ??s ...,98,"20,316 likes"
8,The Menu (2022),2022,â??â??â??â??â??,Jay,25-Oct-21,This review may contain spoilers.,6 5,"20,359 likes"
9,Dune (2021),2021,â??â??â??â??,kÃ¡rsten,9-Aug-19,got the 4D experience by forgetting to drink w...,4,"19, 6 24 likes"


### Clean Column Names
Strip surrounding whitespace, lowercase the names, and replace spaces with underscores to make accessing columns easier.

In [77]:
# Clean column names: trim surrounding whitespace, lowercase, and turn spaces into underscores
clean_df = clean_df.rename(
    columns=lambda name: name.strip().lower().replace(" ", "_")
)

### Handle Unwanted Columns
Drop unnecessary columns (e.g., "review_date", "comment_count", "reviewer_name", and "release_year").

In [78]:
# Columns listed here are removed from the working copy.
unwanted_columns = ["review_date", "comment_count", "reviewer_name", "release_year"]
clean_df = clean_df.drop(columns=unwanted_columns)

### Handle Missing Values
Drop rows that are missing "movie_name" or "review".

In [79]:
# Report the per-column missing counts, then drop every row that is missing
# "movie_name" or "review" (either one being absent is enough).
missing_per_column = clean_df.isna().sum()
print(missing_per_column)
clean_df = clean_df.dropna(subset=["movie_name", "review"], how="any")

movie_name       0
rating         309
review        1036
like_count    1278
dtype: int64


### Clean Movie Name

In [80]:
def clean_movie_title(title):
    """Return a lowercase movie title with any "(YYYY)" year marker removed.

    Args:
        title: Raw title value. Missing values (as judged by ``pd.isna``) are
            returned unchanged; anything else is converted with ``str()``.

    Returns:
        The title with every parenthesised four-digit group (and the whitespace
        around it) removed, then stripped and lowercased; or the original
        missing value.
    """
    if not pd.isna(title):
        # The pattern also swallows whitespace on both sides of the "(YYYY)" group.
        without_year = re.sub(r"\s*\(\d{4}\)\s*", "", str(title))
        title = without_year.strip().lower()
    return title

# Store the cleaned title in a new column so it can be compared with the original below.
clean_df["movie_name_clean"] = clean_df["movie_name"].apply(clean_movie_title)

# Sanity check
clean_df[["movie_name", "movie_name_clean"]].head(10)

Unnamed: 0,movie_name,movie_name_clean
0,Aftersun (2022),aftersun
1,Joker (2019),joker
2,Puss in Boots: The Last Wish (2022),puss in boots: the last wish
3,The Banshees of Inisherin (2022),the banshees of inisherin
4,Everything Everywhere All at Once (2022),everything everywhere all at once
5,Parasite (2019),parasite
6,Nope (2022),nope
7,Thor: Love and Thunder (2022),thor: love and thunder
8,The Menu (2022),the menu
9,Dune (2021),dune


### Clean Review
Clean the "review" column: strip HTML-like tags and non-letter characters, lowercase the text, remove stop words, and lemmatize the remaining tokens.

In [81]:
# Built once at module level so clean_review does not rebuild them for every row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_review(s):
    """Normalise one review into a space-joined string of lemmatized tokens.

    Steps, in order: remove anything that looks like an HTML tag, replace every
    character that is not an ASCII letter or whitespace with a space, lowercase,
    drop tokens found in the module-level ``stop_words`` set, and pass each
    remaining token through the module-level ``lemmatizer``.

    Args:
        s: Raw review value. Missing values (as judged by ``pd.isna``) yield
            ``""``. Any other value is converted with ``str()`` first, matching
            the other cleaning helpers in this notebook.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    if pd.isna(s):
        return ""
    # Coerce first: re.sub raises TypeError on non-string input (e.g. a review
    # that was parsed as a number).
    text = str(s)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower()
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

# Store the cleaned text in a new column so it can be compared with the original below.
clean_df["review_clean"] = clean_df["review"].apply(clean_review)

# Sanity check
clean_df[["review", "review_clean"]].head(10)

Unnamed: 0,review,review_clean
0,This review may contain spoilers.,review may contain spoiler
1,if youâ??ve never swam in the ocean then of co...,never swam ocean course pool seems deep
2,Puss in Boots: Into the Pussy-Verse,pus boot pussy verse
3,I will NOT leave my donkey outside when Iâ??m sad,leave donkey outside sad
4,Watch it and have fun before film Twitter tell...,watch fun film twitter tell overrated
5,This review may contain spoilers.,review may contain spoiler
6,this movie does for cloud spotting what did fo...,movie cloud spotting swimming
7,Great first draft! Excited to see when itâ??s ...,great first draft excited see finished
8,This review may contain spoilers.,review may contain spoiler
9,got the 4D experience by forgetting to drink w...,got experience forgetting drink water today wa...


### Fix Movie Rating
Add "numeric_rating" by counting the mis-encoded star symbols in "rating" and adding 0.5 when a half-star symbol is present.

In [82]:
def fix_rating(r):
    """Turn a garbled star-rating string into a number.

    Args:
        r: Raw rating value. Missing values (as judged by ``pd.isna``) give ``None``.

    Returns:
        The number of "â??" sequences found in ``str(r)``, plus 0.5 when the
        string contains "Â½"; ``None`` for missing input.
    """
    if pd.isna(r):
        return None

    text = str(r)
    # Splitting on the marker yields one more piece than there are markers.
    whole_stars = len(text.split("â??")) - 1
    half_star = 0.5 if text.find("Â½") != -1 else 0
    return whole_stars + half_star

# Store the parsed rating in a new column so it can be compared with the original below.
clean_df["numeric_rating"] = clean_df["rating"].apply(fix_rating)

# Sanity check
clean_df[["rating", "numeric_rating"]].head(10)

Unnamed: 0,rating,numeric_rating
0,â??â??â??â??Â½,4.5
1,â??â??â??â??â??,5.0
2,â??Â½,1.5
3,â??â??â??â??â??,5.0
4,â??â??Â½,2.5
5,â??â??â??â??,4.0
6,â??â??â??,3.0
7,,
8,â??â??â??â??â??,5.0
9,â??â??â??â??,4.0


### Clean "like_count"
Clean the messy "like_count" column by removing the "likes" label and stray separators, then parse the remaining digits as a number.

In [83]:
def clean_count(s):
    """Parse a scraped like-count string such as "22,44 6 likes" into an int.

    The "likes" label, commas and whitespace are discarded. Abbreviated counts
    with a "k" or "m" suffix (e.g. "1.8K") are scaled by 1,000 / 1,000,000
    instead of having their digits concatenated. For every other input the
    digit runs are joined and parsed as one integer.

    Args:
        s: Raw count value. Missing values (as judged by ``pd.isna``) give
            ``None``; anything else is converted with ``str()``.

    Returns:
        int | None: The parsed count, or ``None`` when the value is missing,
        empty, or contains no digits.
    """
    if pd.isna(s):
        return None

    # Normalise: drop the label, the thousands separators and stray spaces.
    text = str(s).lower().replace("likes", "")
    compact = re.sub(r"[,\s]", "", text)
    if not compact:
        return None

    # "1.8k" must become 1800, not 18 (which plain digit-joining would give).
    abbreviated = re.fullmatch(r"(\d+(?:\.\d+)?)([km])", compact)
    if abbreviated:
        magnitude = {"k": 1_000, "m": 1_000_000}[abbreviated.group(2)]
        # round() absorbs float error such as 2.3 * 1000 == 2299.9999999999995.
        return int(round(float(abbreviated.group(1)) * magnitude))

    digits = re.findall(r"\d+", compact)
    if not digits:
        return None
    try:
        return int("".join(digits))
    except ValueError:
        return None

# NOTE(review): clean_count returns None for missing/unparseable values, and the sanity-check
# output shows values like 22446.0, i.e. the column is stored as floats rather than integers.
clean_df["like_count_clean"] = clean_df["like_count"].apply(clean_count)

# Sanity check
clean_df[["like_count", "like_count_clean"]].head(10)

Unnamed: 0,like_count,like_count_clean
0,"22,44 6 likes",22446.0
1,"22,032 likes",22032.0
2,"21, 6 6 6 likes",21666.0
3,"21, 6 09 likes",21609.0
4,"20, 6 88 likes",20688.0
5,"20,346 likes",20346.0
6,"20,3 6 8 likes",20368.0
7,"20,316 likes",20316.0
8,"20,359 likes",20359.0
9,"19, 6 24 likes",19624.0


### Drop and Replace Original Columns with Clean Ones
After performing sanity checks, replace original (messy) columns with clean ones.

In [84]:
# Drop and replace original columns with the clean ones.
# Mapping is cleaned column name -> final column name; the final names are exactly
# the raw columns that get dropped.
cleaned_to_final = {
    "numeric_rating": "rating", "review_clean": "review",
    "movie_name_clean": "movie_name", "like_count_clean": "like_count"
}

# Guard against re-running this cell: once the rename has happened, "review",
# "rating", ... ARE the cleaned columns, and dropping them again would silently
# discard the cleaned data. Skip when no cleaned column is left to rename.
if not set(cleaned_to_final).isdisjoint(clean_df.columns):
    clean_df = (
        clean_df
        .drop(columns=list(cleaned_to_final.values()))
        .rename(columns=cleaned_to_final)
    )

### Data Type Conversions
Convert the "movie_name" and "review" columns to the pandas string dtype.

In [85]:
# Cast both text columns ("movie_name" and "review") to the "string" dtype in one call
clean_df = clean_df.astype({"movie_name": "string", "review": "string"})


### Save Cleaned Data
Export the cleaned Letterboxd reviews data.

In [86]:
# Write the cleaned frame to disk without the index column.
processed_csv_path = "../data/processed/letterboxd_reviews_clean.csv"
clean_df.to_csv(processed_csv_path, index=False)