Clean and handle Letterboxd data
- countries.csv
- crew.csv
- genres.csv
- languages.csv
- movies.csv
- themes.csv

In [62]:
import pandas as pd
import numpy as np

In [63]:
import sys
sys.path.append('../utils')
import data_cleaning
import data_inspection
import helpers

In [None]:
# Load every raw Letterboxd table used in this notebook.
# The names on the left are bound in the same order as the paths below,
# so the two sequences must stay aligned.
genres, countries, languages, movies, themes, crew = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/local/letterboxd/genres.csv',
        '../data/local/letterboxd/countries.csv',
        '../data/local/letterboxd/languages.csv',
        '../data/local/letterboxd/movies.csv',
        '../data/local/letterboxd/themes.csv',
        '../data/local/letterboxd/crew.csv',
    )
)

In [None]:
# Render the raw movies table as loaded from movies.csv.
display(movies)

In [None]:
# Print an overview of the raw movies table via the project inspection helper.
data_inspection.show_basic_info(movies)

In [None]:
# Check the raw movies table for duplicates via the project inspection helper.
# NOTE(review): presumably report-only; confirm the helper does not drop rows.
data_inspection.check_for_duplicates(movies)

In [None]:
# Convert the 'date' and 'minute' columns (not rows) to integers.
# NOTE(review): the return value is discarded, so this only has an effect if
# data_cleaning.convert_columns_to_int modifies `movies` in place — confirm.
columns_to_convert = ['date', 'minute']
data_cleaning.convert_columns_to_int(movies, columns_to_convert)

In [None]:
# Working sample: keep only fully populated films. A row is discarded as soon
# as any one of its columns is missing.
movies_sample = movies.dropna(axis=0, how='any')

display(movies_sample)

In [None]:
# Drop films whose runtime ('minute' column) is under 40 minutes.
# NOTE(review): the result is not assigned, so the filter only takes effect if
# helpers.drop_rows_by_runtime mutates movies_sample in place — confirm.
# If it returns a new frame, the short films are still present downstream.
helpers.drop_rows_by_runtime(movies_sample, 'minute', 40)

In [None]:
# Inspect the themes table for missing values before it is grouped and merged.
data_inspection.show_missing_values(themes)

In [69]:
# Collapse the one-row-per-genre table into one comma-separated string per
# film id. Missing genre values are dropped first because str.join raises
# TypeError when it meets a NaN (float) among the strings.
genres_grouped = (
    genres.dropna(subset=['genre'])
    .groupby('id')['genre']
    .agg(', '.join)
    .rename('genres')  # plural: the column now holds several genres per film
    .reset_index()
)

# Left merge keeps every film in movies_sample, including those with no genre.
movies_sample = movies_sample.merge(genres_grouped, on='id', how='left')

# Films without any genre row come out of the merge as NaN; store '' instead.
movies_sample['genres'] = movies_sample['genres'].fillna('')

In [70]:
# Collapse the one-row-per-language table into one comma-separated string per
# film id. Missing language values are dropped first because str.join raises
# TypeError when it meets a NaN (float) among the strings.
# The column keeps its original name 'language', so no rename is needed.
# NOTE(review): if languages.csv can list the same language more than once for
# a film, the joined string will contain repeats — confirm against the data.
language_grouped = (
    languages.dropna(subset=['language'])
    .groupby('id')['language']
    .agg(', '.join)
    .reset_index()
)

# Left merge keeps every film in movies_sample, including those with no language.
movies_sample = movies_sample.merge(language_grouped, on='id', how='left')

# Films without any language row come out of the merge as NaN; store '' instead.
movies_sample['language'] = movies_sample['language'].fillna('')

In [71]:
# Collapse the one-row-per-country table into one comma-separated string per
# film id. Missing country values are dropped first because str.join raises
# TypeError when it meets a NaN (float) among the strings.
# The column keeps its original name 'country', so no rename is needed here.
countries_grouped = (
    countries.dropna(subset=['country'])
    .groupby('id')['country']
    .agg(', '.join)
    .reset_index()
)

# Left merge keeps every film in movies_sample, including those with no country.
movies_sample = movies_sample.merge(countries_grouped, on='id', how='left')

# Films without any country row come out of the merge as NaN; store '' instead.
movies_sample['country'] = movies_sample['country'].fillna('')

In [72]:
# One row per film id, with all of that film's themes gathered into a list.
themes_grouped = themes.groupby('id')['theme'].apply(list).reset_index()

# Attach the theme lists; the left join keeps films that have no themes at all.
movies_sample = movies_sample.merge(themes_grouped, how='left', on='id')


def _themes_or_placeholder(entry):
    """Return the theme list unchanged, or ['No themes'] for any non-list value."""
    if isinstance(entry, list):
        return entry
    return ['No themes']


# Films absent from themes_grouped carry NaN after the merge; give them the
# placeholder list so every cell in the column is a list.
movies_sample['theme'] = movies_sample['theme'].apply(_themes_or_placeholder)

In [None]:
# Keep only the crew rows whose role is exactly 'Director'.
directors_df = crew[crew['role'] == 'Director']

# One comma-separated string of director names per film id. Rows with a
# missing name are dropped first because str.join raises TypeError on NaN.
# The result is renamed to 'director' before merging so it cannot collide with
# the 'name' column of movies_sample.
directors_grouped = (
    directors_df.dropna(subset=['name'])
    .groupby('id')['name']
    .agg(', '.join)
    .rename('director')
    .reset_index()
)

# Left merge keeps every film in movies_sample, including those with no director.
movies_sample = movies_sample.merge(directors_grouped, on='id', how='left')

# Films without any director row come out of the merge as NaN; store '' instead.
movies_sample['director'] = movies_sample['director'].fillna('')

# Show the enriched frame.
print(movies_sample)

In [None]:
# Inspect movies_sample for missing values after all the merges above.
data_inspection.show_missing_values(movies_sample)

In [None]:
# Render the enriched sample (genres, language, country, theme, director merged in).
display(movies_sample)

In [None]:
# Inspect the value counts of the comma-joined 'genres' column. Each distinct
# combination string counts as its own value, not each individual genre.
data_inspection.show_column_value_counts(movies_sample, 'genres')

In [None]:
# Inspect the values of the comma-joined 'director' column.
data_inspection.show_column_values(movies_sample, 'director')

In [None]:
# Inspect the values of the merged 'genres' column. The grouped column was
# renamed from 'genre' to 'genres' before it was merged into movies_sample,
# so looking up 'genre' here would target a column that was never added.
data_inspection.show_column_values(movies_sample, 'genres')

In [None]:
# Flag the films whose theme list is exactly the ['No themes'] placeholder.
is_placeholder = movies_sample['theme'].apply(
    lambda theme_list: theme_list == ['No themes']
)

# Summing a boolean Series counts its True entries.
no_themes_count = is_placeholder.sum()

print(f"Number of rows with ['No themes']: {no_themes_count}")

In [None]:
# Keep only films with real themes, i.e. anything but the ['No themes'] placeholder.
has_real_themes = movies_sample['theme'].apply(
    lambda theme_list: theme_list != ['No themes']
)

# .copy() detaches the result so later edits never touch movies_sample.
clean_letterboxd_movies = movies_sample[has_real_themes].copy()

# Report how many films survived and preview the first rows.
print(f'Number of rows after cleaning: {len(clean_letterboxd_movies)}')
print(clean_letterboxd_movies.head())


In [81]:
# Mapping from the raw Letterboxd column names to the names used in the
# cleaned dataset. Columns not listed here (e.g. 'tagline', 'genres',
# 'language', 'director') keep their current names.
rename_dict = {
    'id' : 'letterboxd_id',
    'name' : 'title',
    'date' : 'release_year',
    'description' : 'summary',
    'rating' : 'letterboxd_rating',
    'minute' : 'runtime',
    'theme' : 'themes',
    'country' : 'countries'
}

# The helper's return value is assigned back, so the renamed frame replaces the old one.
clean_letterboxd_movies = data_cleaning.rename_columns(clean_letterboxd_movies, rename_dict)

In [None]:
# Render the cleaned frame after the column renames.
display(clean_letterboxd_movies)

In [None]:
# Print the column index to check the renamed columns before they are reordered.
print(clean_letterboxd_movies.columns)


In [None]:
# Final column layout of the cleaned Letterboxd table.
# Selecting by this list also drops any column that is not named in it.
new_column_order_letterboxd = [
    'letterboxd_id',
    'title',
    'release_year',
    'tagline',
    'summary',
    'runtime',
    'letterboxd_rating',
    'genres',
    'language',
    'countries',
    'themes',
    'director',
]

# Label-based selection: raises KeyError if any listed column is missing.
clean_letterboxd_movies = clean_letterboxd_movies.loc[:, new_column_order_letterboxd]

# Confirm the resulting layout.
print('Columns in clean_letterboxd_movies after reordering:', clean_letterboxd_movies.columns)


In [None]:
# Renumber the rows 0..n-1 so the gaps left by the earlier filtering disappear.
clean_letterboxd_movies = clean_letterboxd_movies.reset_index(drop=True)

# The export is opt-in: the write was previously commented out while the
# success message was still printed, which wrongly claimed a file had been
# created. Set the flag to True to actually write the CSV.
SAVE_CLEAN_CSV = False
clean_csv_path = '../data/clean/letterboxd_clean_films.csv'

if SAVE_CLEAN_CSV:
    # index=False: the freshly reset index carries no information worth saving.
    clean_letterboxd_movies.to_csv(clean_csv_path, index=False)
    print('CSV file has been created successfully.')
else:
    print(f'Export skipped (SAVE_CLEAN_CSV is False); {clean_csv_path} was not written.')


In [48]:
# Load the previously cleaned film table; its 'title' column is compared
# against the Letterboxd titles in the cells below.
clean_films_id = pd.read_csv('../data/clean/clean_films_id.csv')

In [None]:
# Render the reference film table.
display(clean_films_id)

In [None]:
# Letterboxd films whose title also appears in clean_films_id.
# The comparison is an exact string match on 'title' alone, so different films
# that share a title are all treated as matches.
known_titles = clean_films_id['title']
title_is_known = clean_letterboxd_movies['title'].isin(known_titles)
common_titles = clean_letterboxd_movies[title_is_known]

# Number of Letterboxd rows with a matching title.
matching_titles_count = len(common_titles)

print(f'Number of matching titles: {matching_titles_count}')


In [None]:
# Letterboxd films whose title does NOT appear in clean_films_id
# (exact string match on 'title' only).
title_is_known = clean_letterboxd_movies['title'].isin(clean_films_id['title'])

# .copy() detaches the subset from clean_letterboxd_movies.
non_matching_titles = clean_letterboxd_movies[~title_is_known].copy()

# Report the size of the subset and preview its first rows.
print(f'Number of non-matching titles: {len(non_matching_titles)}')
print(non_matching_titles.head())


In [52]:
# Write the non-matching Letterboxd films to CSV, without the index column.
# NOTE(review): this targets '../data/local/clean/', while the other cleaned
# files in this notebook live under '../data/clean/' — confirm the directory
# is intended and already exists (to_csv does not create directories).
non_matching_titles.to_csv('../data/local/clean/non_matching_titles_letterboxd.csv', index=False)


In [None]:
# Render the Letterboxd films whose titles were not found in clean_films_id.
display(non_matching_titles)