Clean and handle letterboxd data
- countries.csv
- genres.csv
- languages.csv
- movies.csv
- themes.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../utils')
import data_cleaning
import data_inspection
import helpers

In [20]:
genres = pd.read_csv('../data/local/letterboxd/genres.csv')
countries = pd.read_csv('../data/local/letterboxd/countries.csv')
languages = pd.read_csv('../data/local/letterboxd/languages.csv')
movies = pd.read_csv('../data/local/letterboxd/movies.csv')
themes = pd.read_csv('../data/local/letterboxd/themes.csv')
crew = pd.read_csv('../data/local/letterboxd/crew.csv')

In [4]:
display(movies)

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [5]:
data_inspection.show_basic_info(movies)


DataFrame Shape: (941597, 7)
Number of Rows: 941597
Number of Columns: 7

Data Types of Columns:
id               int64
name            object
date           float64
tagline         object
description     object
minute         float64
rating         float64
dtype: object

Missing Values per Column:
id                  0
name               10
date            91913
tagline        802210
description    160812
minute         181570
rating         850598
dtype: int64

First 5 Rows of Data:
        id                               name    date  \
0  1000001                             Barbie  2023.0   
1  1000002                           Parasite  2019.0   
2  1000003  Everything Everywhere All at Once  2022.0   
3  1000004                         Fight Club  1999.0   
4  1000005                         La La Land  2016.0   

                                            tagline  \
0                  She's everything. He's just Ken.   
1                       Act like you own the place.   
2

In [6]:
data_inspection.check_for_duplicates(movies)


No duplicate rows found in the DataFrame.


In [7]:
# convert rows to int
columns_to_convert = ['date', 'minute']
data_cleaning.convert_columns_to_int(movies, columns_to_convert)

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.30
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [8]:
# get sample df with movies with no empty rows
movies_sample = movies.dropna()

display(movies_sample)

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.30
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09
...,...,...,...,...,...,...,...
152402,1152403,Karmayodha,2012,Bad ones beware.,"An investigative thriller from Major Ravi, wit...",123,2.78
152849,1152850,Staging,2021,Staging,"Assad, along with his group, creates fake acci...",96,2.66
158484,1158485,Kaalai,2008,There is no substitute,"A man comes to the city, seeking revenge from ...",133,2.75
161263,1161264,Fading Petals,2022,The tumultuous relationship between a Young Wo...,The paths of two solitary figures cross when a...,93,3.11


In [9]:
# drop rows with runtime under 40'
helpers.drop_rows_by_runtime(movies_sample, 'minute', 40)

Number of rows dropped (runtime < 40): 1347


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=[column_name], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df[column_name] < min_runtime].index, inplace=True)


Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.30
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09
...,...,...,...,...,...,...,...
152402,1152403,Karmayodha,2012,Bad ones beware.,"An investigative thriller from Major Ravi, wit...",123,2.78
152849,1152850,Staging,2021,Staging,"Assad, along with his group, creates fake acci...",96,2.66
158484,1158485,Kaalai,2008,There is no substitute,"A man comes to the city, seeking revenge from ...",133,2.75
161263,1161264,Fading Petals,2022,The tumultuous relationship between a Young Wo...,The paths of two solitary figures cross when a...,93,3.11


In [10]:
data_inspection.show_missing_values(themes)


Missing Values in Columns:
id       0
theme    0
dtype: int64


In [11]:
# Group genres by 'id' and join the genre values into a comma-separated string
genres_grouped = genres.groupby('id')['genre'].apply(lambda x: ', '.join(x)).reset_index()

# Rename 'genre' column to 'genres' for clarity
genres_grouped.rename(columns={'genre': 'genres'}, inplace=True)

# Merge the grouped genres into movies_sample based on 'id'
movies_sample = movies_sample.merge(genres_grouped, on='id', how='left')

# Replace NaN values in 'genres' column with an empty string if needed
movies_sample['genres'] = movies_sample['genres'].fillna('')

In [12]:
# Group genres by 'id' and join the genre values into a comma-separated string
language_grouped = languages.groupby('id')['language'].apply(lambda x: ', '.join(x)).reset_index()

# Rename 'genre' column to 'language' for clarity
language_grouped.rename(columns={'language': 'language'}, inplace=True)

# Merge the grouped language into movies_sample based on 'id'
movies_sample = movies_sample.merge(language_grouped, on='id', how='left')

# Replace NaN values in 'language' column with an empty string if needed
movies_sample['language'] = movies_sample['language'].fillna('')

In [13]:
# Group genres by 'id' and join the genre values into a comma-separated string
countries_grouped = countries.groupby('id')['country'].apply(lambda x: ', '.join(x)).reset_index()

# Rename 'genre' column to 'countries' for clarity
countries_grouped.rename(columns={'country': 'country'}, inplace=True)

# Merge the grouped countries into movies_sample based on 'id'
movies_sample = movies_sample.merge(countries_grouped, on='id', how='left')

# Replace NaN values in 'countries' column with an empty string if needed
movies_sample['country'] = movies_sample['country'].fillna('')

In [14]:
# Group themes by 'id' and combine the 'theme' column values into a list
themes_grouped = themes.groupby('id')['theme'].apply(lambda x: list(x)).reset_index()

# Merge the grouped themes into movies_sample based on 'id'
movies_sample = movies_sample.merge(themes_grouped, on='id', how='left')

# Replace NaN values in 'theme' column with a list containing a single placeholder or a default message
movies_sample['theme'] = movies_sample['theme'].apply(lambda x: x if isinstance(x, list) else ['No themes'])

In [22]:
# Filter for rows where the role is 'Director'
directors_df = crew[crew['role'] == 'Director']

# Group by 'id' and join director names into a comma-separated string
directors_grouped = directors_df.groupby('id')['name'].apply(lambda x: ', '.join(x)).reset_index()

# Rename 'name' column to 'director' for clarity
directors_grouped.rename(columns={'name': 'director'}, inplace=True)

# Merge the grouped directors into movies_sample based on 'id'
movies_sample = movies_sample.merge(directors_grouped, on='id', how='left')

# Replace NaN values in 'director' column with an empty string
movies_sample['director'] = movies_sample['director'].fillna('')

# Display the updated movies_sample DataFrame
print(movies_sample)

            id                               name  date  \
0      1000001                             Barbie  2023   
1      1000002                           Parasite  2019   
2      1000003  Everything Everywhere All at Once  2022   
3      1000004                         Fight Club  1999   
4      1000005                         La La Land  2016   
...        ...                                ...   ...   
36956  1152403                         Karmayodha  2012   
36957  1152850                            Staging  2021   
36958  1158485                             Kaalai  2008   
36959  1161264                      Fading Petals  2022   
36960  1164849                   To Steal a Thief  1996   

                                                 tagline  \
0                       She's everything. He's just Ken.   
1                            Act like you own the place.   
2       The universe is so much bigger than you realize.   
3                                Mischief. Mayhem. 

In [23]:
data_inspection.show_missing_values(movies_sample)


Missing Values in Columns:
id             0
name           0
date           0
tagline        0
description    0
minute         0
rating         0
genres         0
language       0
country        0
theme          0
director       0
dtype: int64


In [16]:
display(movies_sample)

Unnamed: 0,id,name,date,tagline,description,minute,rating,genres,language,country,theme
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86,"Comedy, Adventure",English,"UK, USA","[Humanity and the world around us, Crude humor..."
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56,"Comedy, Thriller, Drama","Korean, English, German, Korean",South Korea,"[Humanity and the world around us, Intense vio..."
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.30,"Science Fiction, Adventure, Comedy, Action","English, Cantonese, Chinese, English",USA,"[Humanity and the world around us, Moving rela..."
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Drama,English,"Germany, USA","[Intense violence and sexual transgression, Hu..."
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","[Song and dance, Humanity and the world around..."
...,...,...,...,...,...,...,...,...,...,...,...
36956,1152403,Karmayodha,2012,Bad ones beware.,"An investigative thriller from Major Ravi, wit...",123,2.78,"Crime, Action, Thriller","Malayalam, Hindi, English, Malayalam",India,[No themes]
36957,1152850,Staging,2021,Staging,"Assad, along with his group, creates fake acci...",96,2.66,"Drama, Crime",Persian (Farsi),Iran,[No themes]
36958,1158485,Kaalai,2008,There is no substitute,"A man comes to the city, seeking revenge from ...",133,2.75,Action,"Tamil, Hindi, Tamil",India,[No themes]
36959,1161264,Fading Petals,2022,The tumultuous relationship between a Young Wo...,The paths of two solitary figures cross when a...,93,3.11,Drama,English,UK,[No themes]


In [19]:
data_inspection.show_column_value_counts(movies_sample, 'genres')


Value counts for column genres:
genres
Drama                                           2799
Documentary                                     1818
Comedy                                          1749
Horror                                          1457
Comedy, Drama                                    750
                                                ... 
Adventure, TV Movie, Drama, Science Fiction        1
Animation, Thriller, Science Fiction               1
War, Drama, Romance, Comedy                        1
Family, Fantasy, Romance, Adventure                1
Animation, Thriller, Science Fiction, Horror       1
Name: count, Length: 5037, dtype: int64


In [25]:
data_inspection.show_column_values(movies_sample, 'director')


Unique values in column director:
['Greta Gerwig' 'Bong Joon-ho' 'Daniel Scheinert, Daniel Kwan' ...
 'Tarun Gopi' 'Bradley Charlton' 'Clas Lindberg']


In [26]:
data_inspection.show_column_values(movies_sample, 'genre')


Column genre does not exist in the DataFrame.


In [29]:
# Count rows where 'theme' column contains exactly ['No themes']
no_themes_count = movies_sample['theme'].apply(lambda x: x == ['No themes']).sum()

# Display the count
print(f"Number of rows with ['No themes']: {no_themes_count}")


Number of rows with ['No themes']: 18472


In [33]:
# Filter out rows where 'theme' column contains ['No themes']
clean_letterboxd_movies = movies_sample[movies_sample['theme'].apply(lambda x: x != ['No themes'])].copy()

# Display the shape and a preview of the cleaned DataFrame
print(f"Number of rows after cleaning: {clean_letterboxd_movies.shape[0]}")
print(clean_letterboxd_movies.head())


Number of rows after cleaning: 18489
        id                               name  date  \
0  1000001                             Barbie  2023   
1  1000002                           Parasite  2019   
2  1000003  Everything Everywhere All at Once  2022   
3  1000004                         Fight Club  1999   
4  1000005                         La La Land  2016   

                                            tagline  \
0                  She's everything. He's just Ken.   
1                       Act like you own the place.   
2  The universe is so much bigger than you realize.   
3                           Mischief. Mayhem. Soap.   
4                    Here's to the fools who dream.   

                                         description  minute  rating  \
0  Barbie and Ken are having the time of their li...     114    3.86   
1  All unemployed, Ki-taek's family takes peculia...     133    4.56   
2  An aging Chinese immigrant is swept up in an i...     140    4.30   
3  A ticking-

In [None]:
def rename_columns(df, rename_dict):
    """Rename columns based on a dictionary."""
    return df.rename(columns=rename_dict)

In [45]:
rename_dict = {
    'id' : 'letterboxd_id',
    'name' : 'title',
    'date' : 'release_year',
    'description' : 'summary',
    'rating' : 'letterboxd_rating',
    'minute' : 'runtime',
    'theme' : 'themes'
}

clean_letterboxd_movies = data_cleaning.rename_columns(clean_letterboxd_movies, rename_dict)

In [46]:

display(clean_letterboxd_movies)

Unnamed: 0,letterboxd_id,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,country,themes,director
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86,"Comedy, Adventure",English,"UK, USA","[Humanity and the world around us, Crude humor...",Greta Gerwig
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56,"Comedy, Thriller, Drama","Korean, English, German, Korean",South Korea,"[Humanity and the world around us, Intense vio...",Bong Joon-ho
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.30,"Science Fiction, Adventure, Comedy, Action","English, Cantonese, Chinese, English",USA,"[Humanity and the world around us, Moving rela...","Daniel Scheinert, Daniel Kwan"
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Drama,English,"Germany, USA","[Intense violence and sexual transgression, Hu...",David Fincher
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","[Song and dance, Humanity and the world around...",Damien Chazelle
...,...,...,...,...,...,...,...,...,...,...,...,...
18484,1119169,Mumbai Diaries,2021,"Minutes to respond, seconds to react, an insta...",A medical thriller set in the Emergency Room o...,738,3.52,"Mystery, Drama","Hindi, Spanish, Hindi",India,"[High speed and special ops, Bollywood emotion...",
18485,1119929,Skybound,2017,What if suddenly the ground was gone?,Five friends on a small airplane mysteriously ...,82,2.71,"Thriller, Action",English,Germany,"[High speed and special ops, Air pilot heroism...",Alex Tavakoli
18486,1129624,Panchayat,2020,total length of the series is 1640 minutes til...,"An engineering graduate, Abhishek, is unable t...",999,3.73,"Drama, Comedy",Hindi,India,"[Moving relationship stories, Bollywood emotio...",Deepak Kumar Mishra
18487,1138639,Punjab Nahi Jaungi,2017,I won't go to Pakistan,A man vows to win the heart of the woman he lo...,159,3.03,"Comedy, Romance","Urdu, Eastern Punjabi, Eastern Panjabi, Urdu",Pakistan,"[Crude humor and satire, Quirky and endearing ...",Nadeem Baig


In [None]:
# Reset the index of the cleaned DataFrame
clean_letterboxd_movies.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame to a CSV file
# clean_letterboxd_movies.to_csv('../data/clean/clean_letterboxd_movies.csv', index=False)

# Confirm the process
print("CSV file 'clean_letterboxd_movies.csv' has been created successfully.")


CSV file 'clean_letterboxd_movies.csv' has been created successfully.


In [48]:
clean_films_id = pd.read_csv('../data/clean/clean_films_id.csv')

In [50]:
# Find common titles in both DataFrames
common_titles = clean_letterboxd_movies[clean_letterboxd_movies['title'].isin(clean_films_id['title'])]

# Count how many titles match
matching_titles_count = common_titles.shape[0]

# Display the result
print(f"Number of matching titles: {matching_titles_count}")


Number of matching titles: 7752


In [51]:
# Find titles in clean_letterboxd_movies that are NOT in clean_films_id
non_matching_titles = clean_letterboxd_movies[~clean_letterboxd_movies['title'].isin(clean_films_id['title'])].copy()

# Display the shape and a preview of the non-matching titles DataFrame
print(f"Number of non-matching titles: {non_matching_titles.shape[0]}")
print(non_matching_titles.head())


Number of non-matching titles: 10737
    letterboxd_id                           title  release_year  \
4         1000005                      La La Land          2016   
11        1000012                        Whiplash          2014   
32        1000033  Once Upon a Time… in Hollywood          2019   
39        1000041                     Glass Onion          2022   
68        1000071                            Coco          2017   

                                            tagline  \
4                    Here's to the fools who dream.   
11  The road to greatness can take you to the edge.   
32       In this town, it can all change… like that   
39          When the game ends, the mystery begins.   
68                    The celebration of a lifetime   

                                              summary  runtime  \
4   Mia, an aspiring actress, serves lattes to mov...      129   
11  Under the direction of a ruthless instructor, ...      107   
32  Los Angeles, 1969. TV star 

In [52]:
non_matching_titles.to_csv('../data/local/clean/non_matching_titles_letterboxd.csv', index=False)
