Clean and process Letterboxd data
- countries.csv
- crew.csv
- genres.csv
- languages.csv
- movies.csv
- themes.csv

In [78]:
import pandas as pd
import re
import numpy as np

In [79]:
import sys
sys.path.append('../utils')
import data_cleaning
import data_inspection
import helpers

In [80]:
# Load the raw Letterboxd exports. Each lookup table (genres, countries,
# languages, themes, crew) is later grouped by 'id' and merged onto movies.
genres = pd.read_csv('../data/local/letterboxd/genres.csv')
countries = pd.read_csv('../data/local/letterboxd/countries.csv')
languages = pd.read_csv('../data/local/letterboxd/languages.csv')
movies = pd.read_csv('../data/local/letterboxd/movies.csv')
themes = pd.read_csv('../data/local/letterboxd/themes.csv')
crew = pd.read_csv('../data/local/letterboxd/crew.csv')

In [None]:
# Render the raw movies table in the notebook output for a first visual check.
display(movies)

In [None]:
# Summarise `movies` with the project helper from ../utils/data_inspection.
# NOTE(review): what exactly gets reported depends on the helper — not visible here.
data_inspection.show_basic_info(movies)

In [None]:
# Check `movies` for duplicates via the project helper.
# NOTE(review): presumably report-only (the result is not used) — confirm.
data_inspection.check_for_duplicates(movies)

In [None]:
# Convert the 'date' and 'minute' columns of `movies` to integers.
# NOTE(review): the return value is discarded, so this only takes effect if
# the helper mutates `movies` in place — confirm in ../utils/data_cleaning.
columns_to_convert = ['date', 'minute']
data_cleaning.convert_columns_to_int(movies, columns_to_convert)

In [None]:
# Build the working frame: keep only films that have a value in every column
# (any row containing at least one NaN is discarded).
movies_sample = movies.dropna(axis=0, how='any')

display(movies_sample)

In [None]:
# Intended to drop films whose 'minute' (runtime) value is under 40.
# NOTE(review): the result is not assigned, so this relies on the helper
# modifying `movies_sample` in place; if it returns a new frame instead, the
# filter is silently lost — confirm in ../utils/helpers.
helpers.drop_rows_by_runtime(movies_sample, 'minute', 40)

In [None]:
# Report missing values in the themes table (project helper).
data_inspection.show_missing_values(themes)

In [88]:
# Collapse the genres table to one row per film:
# 'id' -> "Genre A, Genre B, ..." stored in a column named 'genres'.
genres_grouped = (
    genres.groupby('id')['genre']
    .apply(', '.join)
    .rename('genres')
    .reset_index()
)

# Left-merge so every film in movies_sample is kept; films with no genre
# entry get an empty string instead of NaN.
movies_sample = (
    movies_sample
    .merge(genres_grouped, on='id', how='left')
    .assign(genres=lambda df: df['genres'].fillna(''))
)

In [89]:
# Collapse the languages table to one row per film:
# 'id' -> "Language A, Language B, ..." (column keeps the name 'language').
language_grouped = languages.groupby('id')['language'].apply(', '.join).reset_index()

# Left-merge the grouped languages into movies_sample on 'id' so that films
# without any language entry are kept.
movies_sample = movies_sample.merge(language_grouped, on='id', how='left')

# Films without a language entry get an empty string instead of NaN.
movies_sample['language'] = movies_sample['language'].fillna('')

In [90]:
# Collapse the countries table to one row per film:
# 'id' -> "Country A, Country B, ..." (column keeps the name 'country';
# it is renamed to 'countries' later, together with the other columns).
countries_grouped = countries.groupby('id')['country'].apply(', '.join).reset_index()

# Left-merge the grouped countries into movies_sample on 'id' so that films
# without any country entry are kept.
movies_sample = movies_sample.merge(countries_grouped, on='id', how='left')

# Films without a country entry get an empty string instead of NaN.
movies_sample['country'] = movies_sample['country'].fillna('')

In [91]:
# One row per film with all of its themes collected into a Python list.
themes_grouped = themes.groupby('id')['theme'].apply(list).reset_index()

# Left-merge on 'id'; films without themes come back with NaN in 'theme'.
movies_sample = movies_sample.merge(themes_grouped, on='id', how='left')

# Anything that is not a list (i.e. the NaN left by the merge) becomes the
# placeholder ['No themes'], which a later cell uses to filter these films out.
movies_sample['theme'] = movies_sample['theme'].map(
    lambda value: ['No themes'] if not isinstance(value, list) else value
)

In [None]:
# Keep only the crew rows whose role is exactly 'Director'.
directors_df = crew.loc[crew['role'].eq('Director')]

# One row per film: 'id' -> "Director A, Director B", in a column 'director'.
directors_grouped = (
    directors_df.groupby('id')['name']
    .apply(', '.join)
    .rename('director')
    .reset_index()
)

# Left-merge on 'id' so films without a credited director are kept, then
# replace the resulting NaN with an empty string.
movies_sample = (
    movies_sample
    .merge(directors_grouped, on='id', how='left')
    .assign(director=lambda df: df['director'].fillna(''))
)

# Print the updated movies_sample DataFrame
print(movies_sample)

In [None]:
# Re-check missing values on the working frame (project helper).
data_inspection.show_missing_values(movies_sample)

In [None]:
# Render the working frame in the notebook output.
display(movies_sample)

In [None]:
# Inspect how often each comma-joined 'genres' string occurs (project helper).
data_inspection.show_column_value_counts(movies_sample, 'genres')

In [None]:
# Inspect the values of the 'director' column (project helper).
data_inspection.show_column_values(movies_sample, 'director')

In [None]:
# Inspect the values of the merged genre column. The column on movies_sample
# is named 'genres' (plural); 'genre' only exists on the raw genres table.
data_inspection.show_column_values(movies_sample, 'genres')

In [None]:
# Keep only films that have real themes, i.e. whose 'theme' value is not the
# ['No themes'] placeholder. `.copy()` detaches the result from movies_sample.
clean_letterboxd_movies = movies_sample.loc[
    movies_sample['theme'].map(lambda film_themes: film_themes != ['No themes'])
].copy()

print(f'Number of rows after cleaning: {len(clean_letterboxd_movies)}')
print(clean_letterboxd_movies.head())

In [99]:
# Raw Letterboxd column name -> name used in the cleaned dataset.
# Columns not listed here ('tagline', 'genres', 'language', 'director')
# keep their current names.
rename_dict = {
    'id': 'letterboxd_id',
    'name': 'title',
    'date': 'release_year',
    'description': 'summary',
    'rating': 'letterboxd_rating',
    'minute': 'runtime',
    'theme': 'themes',
    'country': 'countries',
}

# The helper's return value is reassigned, so it is expected to hand back the
# renamed frame.
clean_letterboxd_movies = data_cleaning.rename_columns(
    clean_letterboxd_movies, rename_dict
)

In [100]:
# Final column layout of the cleaned Letterboxd frame. Selecting by this list
# both reorders the columns and drops any column that is not named here.
new_column_order_letterboxd = [
    'letterboxd_id',
    'title',
    'release_year',
    'tagline',
    'summary',
    'runtime',
    'letterboxd_rating',
    'genres',
    'language',
    'countries',
    'themes',
    'director',
]

clean_letterboxd_movies = clean_letterboxd_movies.loc[:, new_column_order_letterboxd]

In [None]:
# Renumber the rows 0..n-1 after the filtering done so far.
clean_letterboxd_movies.reset_index(drop=True, inplace=True)

## df to csv
# The export is currently disabled. The success message is kept next to the
# write so it can never be printed when no file was produced; un-comment BOTH
# lines together to export.
# clean_letterboxd_movies.to_csv('../data/clean/letterboxd_clean_films.csv', index=False)
# print('CSV file has been created successfully.')

#### Add data from the TMDb DataFrame and from the backup DataFrames saved from GET requests

In [102]:
# letterboxd_backup: earlier request results; its 'letterboxd_id', 'topics'
# and 'doesthedog_id' columns are merged in below.
# tmdb_df: cleaned TMDb films, used below as a title-based fallback lookup.
letterboxd_backup = pd.read_csv('../data/local/letterboxd/letterboxd_request_secondbatch.csv')
tmdb_df = pd.read_csv('../data/clean/tmdb_clean_films.csv')

In [None]:
# Preview the first two rows of the request backup.
letterboxd_backup.head(2)

In [None]:
# Preview the first two rows of the TMDb frame.
tmdb_df.head(2)

In [None]:
# Check the TMDb frame for duplicates (project helper); duplicate titles are
# dropped explicitly before the title lookup further down.
data_inspection.check_for_duplicates(tmdb_df)

In [None]:
# Preview the first two rows of the cleaned Letterboxd frame.
clean_letterboxd_movies.head(2)

In [None]:
# Bring 'topics' and 'doesthedog_id' over from the request backup, matching on
# 'letterboxd_id'. A left join keeps every cleaned film; films absent from the
# backup get NaN in both new columns.
# NOTE(review): duplicate 'letterboxd_id' rows in the backup would duplicate
# films here — confirm the backup is unique per id.
clean_letterboxd_movies = pd.merge(
    clean_letterboxd_movies,
    letterboxd_backup.loc[:, ['letterboxd_id', 'topics', 'doesthedog_id']],
    how='left',
    on='letterboxd_id',
)

print(clean_letterboxd_movies.head())

In [None]:
def clean_title(title):
    """Normalise a film title to ASCII letters, digits and single spaces.

    Every character that is not ``a-z``, ``A-Z``, ``0-9`` or whitespace is
    removed (so hyphens and punctuation disappear), then runs of whitespace
    are collapsed to one space and the ends are trimmed. The function is
    idempotent: cleaning an already-cleaned title returns it unchanged.

    Non-string input (e.g. ``None`` or a NaN from a missing title) is returned
    unchanged instead of raising ``AttributeError``, so the function can be
    applied to a column that contains missing values.

    NOTE(review): accented and non-Latin letters are stripped as well
    ("Amélie" -> "Amlie"). This is kept as-is because the cleaned title is
    used as the lookup key against ``tmdb_df['title']``; changing the rule
    here alone would change which titles match.
    """
    if not isinstance(title, str):
        return title
    # ' - ' separators need no special handling: the hyphen is removed by the
    # pattern and the surrounding spaces are collapsed by split/join below.
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    return ' '.join(title.split())

# Normalise every title with clean_title so it can be matched against the
# TMDb titles below, then show a few results.
clean_letterboxd_movies['title'] = clean_letterboxd_movies['title'].map(clean_title)
print(clean_letterboxd_movies['title'].head(3))

In [None]:
# Series.map needs a unique lookup index, so keep only the first TMDb row for
# each 'title' before indexing by it.
# NOTE(review): matching on title alone cannot tell apart different films
# that share a title — confirm this is acceptable.
tmdb_dedup = tmdb_df.drop_duplicates(subset='title', keep='first')
tmdb_by_title = tmdb_dedup.set_index('title')

# Fill 'doesthedog_id' only where it is still NaN, using the TMDb value found
# under the same title (combine_first keeps existing values).
clean_letterboxd_movies['doesthedog_id'] = clean_letterboxd_movies['doesthedog_id'].combine_first(
    clean_letterboxd_movies['title'].map(tmdb_by_title['doesthedog_id'])
)

# Same fallback for 'topics', provided the TMDb frame carries that column.
if 'topics' not in tmdb_dedup.columns:
    print("Warning: 'topics' column is missing in tmdb_df. Skipping 'topics' mapping.")
else:
    clean_letterboxd_movies['topics'] = clean_letterboxd_movies['topics'].combine_first(
        clean_letterboxd_movies['title'].map(tmdb_by_title['topics'])
    )

print(clean_letterboxd_movies.head())

In [None]:
# Check how many values (notably 'doesthedog_id' / 'topics') are still missing
# after the TMDb fallback (project helper).
data_inspection.show_missing_values(clean_letterboxd_movies)

In [None]:
# Load the second backup of request results; it is used below as a
# 'letterboxd_id'-keyed fallback for 'doesthedog_id' and 'topics'.
request_backup = pd.read_csv('../data/local/letterboxd/letterboxd_content_request_unfinished.csv')

In [None]:
# Index the request backup by 'letterboxd_id' for lookups. Series.map raises
# when the lookup index contains duplicates, so keep only the first row per
# id (same approach as the TMDb title lookup).
request_by_id = (
    request_backup
    .drop_duplicates(subset='letterboxd_id', keep='first')
    .set_index('letterboxd_id')
)

# Fill 'doesthedog_id' only where it is still NaN (combine_first keeps
# existing values).
clean_letterboxd_movies['doesthedog_id'] = clean_letterboxd_movies['doesthedog_id'].combine_first(
    clean_letterboxd_movies['letterboxd_id'].map(request_by_id['doesthedog_id'])
)

# Same fallback for 'topics', provided the backup carries that column.
if 'topics' in request_by_id.columns:
    clean_letterboxd_movies['topics'] = clean_letterboxd_movies['topics'].combine_first(
        clean_letterboxd_movies['letterboxd_id'].map(request_by_id['topics'])
    )
else:
    print("Warning: 'topics' column is missing in request_backup. Skipping 'topics' mapping.")

print(clean_letterboxd_movies.head())

In [None]:
# Films that still have no 'doesthedog_id' after both fallbacks; these are the
# ones that remain to be requested.
missing_doesthedog_df = clean_letterboxd_movies.loc[
    clean_letterboxd_movies['doesthedog_id'].isna()
]

print(missing_doesthedog_df.head())
print(f"Number of rows with empty 'doesthedog_id': {missing_doesthedog_df.shape[0]}")

## convert to .csv
# missing_doesthedog_df.to_csv('../data/local/raw/pending_letterboxd_films_for_request.csv')

In [115]:
# Re-apply the title normalisation. clean_title leaves an already-cleaned
# title unchanged, so this is a no-op if the earlier cleaning cell has run.
clean_letterboxd_movies['title'] = clean_letterboxd_movies['title'].map(clean_title)

In [116]:
# clean_letterboxd_movies.to_csv('../data/clean/letterboxd_clean_films.csv', index=False)