Clean and handle letterboxd data
- countries.csv
- genres.csv
- languages.csv
- movies.csv
- themes.csv
- crew.csv

In [1]:
import pandas as pd
import re

In [2]:
import sys
sys.path.append('../utils')
import data_cleaning
import data_inspection
import helpers

In [3]:
# Load the raw Letterboxd exports; files are read in the order listed and
# unpacked into one DataFrame per file.
genres, countries, languages, movies, themes, crew = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/local/letterboxd/genres.csv',
        '../data/local/letterboxd/countries.csv',
        '../data/local/letterboxd/languages.csv',
        '../data/local/letterboxd/movies.csv',
        '../data/local/letterboxd/themes.csv',
        '../data/local/letterboxd/crew.csv',
    )
)

In [None]:
# Overview of the raw movies table before any cleaning
# (output is produced by the project helper in ../utils/data_inspection).
data_inspection.show_basic_info(movies)

In [None]:
# Duplicate check on the raw movies table
# (presumably report-only — the result is not used here; confirm in the helper).
data_inspection.check_for_duplicates(movies)

In [None]:
# convert the 'date' and 'minute' columns to int
# NOTE(review): the return value is discarded, so this relies on
# convert_columns_to_int modifying `movies` in place — confirm in
# ../utils/data_cleaning.
columns_to_convert = ['date', 'minute']
data_cleaning.convert_columns_to_int(movies, columns_to_convert)

In [None]:
# Work on a copy of `movies` restricted to fully populated rows: any row with
# at least one missing value is dropped, `movies` itself is left untouched.
movies_sample = movies.copy().dropna()

display(movies_sample)

In [None]:
# drop rows with runtime under 40' (threshold applied to the 'minute' column)
# NOTE(review): the return value is discarded, so this only takes effect if
# drop_rows_by_runtime mutates movies_sample in place — confirm in
# ../utils/helpers.
helpers.drop_rows_by_runtime(movies_sample, 'minute', 40)

In [9]:
# Attach a 'genres' column to movies_sample, built from the 'genre' values of
# genres.csv grouped by movie 'id' and joined with ', '.
# (Join/merge semantics live in data_cleaning.group_and_join_columns — not
# visible here.)
movies_sample = data_cleaning.group_and_join_columns(
    df_main=movies_sample,
    df_to_group=genres,
    group_by_col='id',
    join_col='genre',
    new_col_name='genres',
    separator=', ',
)

In [10]:
# Same aggregation for languages.csv: 'language' values grouped by movie 'id'
# and joined with ', '. The resulting column keeps the singular name 'language'.
movies_sample = data_cleaning.group_and_join_columns(
    df_main=movies_sample,
    df_to_group=languages,
    group_by_col='id',
    join_col='language',
    new_col_name='language',
    separator=', ',
)

In [11]:
# Same aggregation for countries.csv: 'country' values grouped by movie 'id'
# and joined with ', '. The column is renamed to 'countries' later on.
movies_sample = data_cleaning.group_and_join_columns(
    df_main=movies_sample,
    df_to_group=countries,
    group_by_col='id',
    join_col='country',
    new_col_name='country',
    separator=', ',
)

In [12]:
# Collect all themes of a movie into a single Python list, one row per 'id'
themes_grouped = themes.groupby('id')['theme'].apply(list).reset_index()

# Left-join the grouped themes so movies without any theme are kept
movies_sample = pd.merge(movies_sample, themes_grouped, how='left', on='id')

# Movies with no match come back as NaN; give them a placeholder list instead
movies_sample['theme'] = [
    value if isinstance(value, list) else ['No themes']
    for value in movies_sample['theme']
]

In [13]:
# Keep only the crew rows whose role is exactly 'Director'
directors = crew[crew['role'].eq('Director')]

In [14]:
# Attach a 'director' column: director 'name' values grouped by movie 'id' and
# joined with ', '. fillna_value='Unknown' is passed for movies without a
# director row (exact fill behaviour is defined in the helper).
movies_sample = data_cleaning.group_and_join_columns(
    df_main=movies_sample,
    df_to_group=directors,
    group_by_col='id',
    join_col='name',
    new_col_name='director',
    separator=', ',
    fillna_value='Unknown'
)

In [None]:
# Check which columns still contain missing values after the joins above
data_inspection.show_missing_values(movies_sample)

In [None]:
# Inspect how often each joined 'genres' string occurs
data_inspection.show_column_value_counts(movies_sample, 'genres')

In [None]:
# Inspect the values of the joined 'director' column
data_inspection.show_column_values(movies_sample, 'director')

In [None]:
# Inspect the values of the joined genre column. The column added to
# movies_sample by the genres join is named 'genres' (its new_col_name), so
# the singular 'genre' would not be found on this frame.
data_inspection.show_column_values(movies_sample, 'genres')

In [None]:
# Keep only films with at least one real theme, i.e. drop the rows that still
# carry the ['No themes'] placeholder. The result is copied so later edits do
# not touch movies_sample.
has_real_themes = movies_sample['theme'].apply(lambda entry: entry != ['No themes'])
clean_letterboxd_movies = movies_sample[has_real_themes].copy()

print(f'Number of rows after cleaning: {clean_letterboxd_movies.shape[0]}')

In [20]:
# Mapping from raw Letterboxd column names to the names used in the clean
# dataset (old name -> new name).
rename_dict = {
    'id': 'letterboxd_id',
    'name': 'title',
    'date': 'release_year',
    'description': 'summary',
    'rating': 'letterboxd_rating',
    'minute': 'runtime',
    'theme': 'themes',
    'country': 'countries',
}

clean_letterboxd_movies = data_cleaning.rename_columns(
    clean_letterboxd_movies,
    rename_dict,
)

In [None]:
# Quick look at the first rows to verify the renamed columns
clean_letterboxd_movies.head()

In [22]:
# Final column layout of the clean dataset; selecting with this list both
# reorders the columns and drops any column that is not listed.
new_column_order_letterboxd = [
    'letterboxd_id',
    'title',
    'release_year',
    'tagline',
    'summary',
    'runtime',
    'letterboxd_rating',
    'genres',
    'language',
    'countries',
    'themes',
    'director',
]

clean_letterboxd_movies = clean_letterboxd_movies.loc[:, new_column_order_letterboxd]

In [None]:
# Renumber the rows 0..n-1 after the filtering above (old index is discarded)
clean_letterboxd_movies.reset_index(drop=True, inplace=True)

# The CSV export is commented out, so the success message is kept with it:
# printing it unconditionally claimed a file was written when none was.
# # create csv
# clean_letterboxd_movies.to_csv('../data/clean/letterboxd_clean_films.csv', index=False)
# print('CSV file has been created successfully.')

#### **Add data from the TMDb DataFrame and the backup DataFrame from the GET request**

In [24]:
# Backup of a previous request batch; its 'letterboxd_id', 'topics' and
# 'doesthedog_id' columns are merged into the clean frame below.
letterboxd_backup = pd.read_csv('../data/local/letterboxd/letterboxd_request_secondbatch.csv')
# Already-cleaned TMDb films, used further down as a title-based lookup table
tmdb_df = pd.read_csv('../data/clean/tmdb_clean_films.csv')

In [None]:
# Peek at the backup file's structure
letterboxd_backup.head(2)

In [None]:
# Peek at the TMDb frame's structure
tmdb_df.head(2)

In [None]:
# Duplicate check on the TMDb frame before it is used as a lookup table
data_inspection.check_for_duplicates(tmdb_df)

In [None]:
# Peek at the clean Letterboxd frame before merging in the extra columns
clean_letterboxd_movies.head(2)

In [None]:
# Enrich the clean frame with 'topics' and 'doesthedog_id' from the backup
# request file, joined on 'letterboxd_id'.
# The backup is de-duplicated on the join key first (keeping the first
# occurrence): a left merge against a right-hand table with repeated keys
# emits one output row per match and would silently duplicate films.
backup_columns = letterboxd_backup[['letterboxd_id', 'topics', 'doesthedog_id']]
backup_columns = backup_columns.drop_duplicates(subset='letterboxd_id', keep='first')

clean_letterboxd_movies = clean_letterboxd_movies.merge(
    backup_columns,
    on='letterboxd_id',
    how='left',
)

print(clean_letterboxd_movies.head())

In [None]:
# clean titles and remove hyphens
def clean_title(title):
    """Normalise a film title so it can be used as an exact-match key.

    Replaces ' - ' separators with a single space, removes every character
    that is not an ASCII letter, a digit or whitespace, and collapses runs of
    whitespace (also trimming both ends).

    Args:
        title: Raw title. Non-string values (e.g. NaN or None for a missing
            title) are returned unchanged instead of raising AttributeError.

    Returns:
        The cleaned title, or the original value when it is not a ``str``.
    """
    if not isinstance(title, str):
        return title
    title = title.replace(' - ', ' ')
    # NOTE(review): this also strips accented / non-Latin letters
    # ('Amélie' -> 'Amlie'). Left unchanged because the cleaned title is used
    # for title-based matching — confirm the other dataset is cleaned the same
    # way before widening the character class.
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    return ' '.join(title.split())

# Normalise every title in place; 'title' is used below as the lookup key
# against the TMDb frame. The first three results are printed as a sanity check.
clean_letterboxd_movies['title'] = clean_letterboxd_movies['title'].apply(clean_title)
print(clean_letterboxd_movies['title'].head(3))

In [None]:
# handle duplicates, keeping the first occurrence of each 'title'
tmdb_dedup = tmdb_df.drop_duplicates(subset='title', keep='first')


def _fill_from_tmdb(df, column):
    """Run update_empty_column for `column`, matching df to tmdb_dedup on 'title'."""
    return data_cleaning.update_empty_column(
        df_main=df,
        df_mapping=tmdb_dedup,
        main_column='title',
        mapping_column=column,
        new_column=column,
    )


# update columns: doesthedog_id is mapped unconditionally
clean_letterboxd_movies = _fill_from_tmdb(clean_letterboxd_movies, 'doesthedog_id')

# 'topics' and 'events' are only mapped when the TMDb frame provides them
if 'topics' not in tmdb_dedup.columns:
    print('Warning: topics column is missing in tmdb_dedup. Skipping mapping.')
else:
    clean_letterboxd_movies = _fill_from_tmdb(clean_letterboxd_movies, 'topics')

if 'events' not in tmdb_dedup.columns:
    print('Warning: events column is missing in tmdb_dedup. Skipping mapping.')
else:
    clean_letterboxd_movies = _fill_from_tmdb(clean_letterboxd_movies, 'events')

print(clean_letterboxd_movies.head())

In [None]:
# Check how many values are still missing after the TMDb mapping
data_inspection.show_missing_values(clean_letterboxd_movies)

In [33]:
# Second backup file (an unfinished content request per its file name); used
# below as a lookup table keyed on 'letterboxd_id'.
request_backup = pd.read_csv('../data/local/letterboxd/letterboxd_content_request_unfinished.csv')

In [None]:
def _fill_from_request_backup(df, column):
    """Run update_empty_column for `column`, matching df to request_backup on 'letterboxd_id'."""
    return data_cleaning.update_empty_column(
        df_main=df,
        df_mapping=request_backup,
        main_column='letterboxd_id',
        mapping_column=column,
        new_column=column,
    )


# update columns: doesthedog_id is mapped unconditionally
clean_letterboxd_movies = _fill_from_request_backup(clean_letterboxd_movies, 'doesthedog_id')

# 'topics' is only mapped when the backup file provides it
if 'topics' not in request_backup.columns:
    print('Warning: topics column is missing in request_backup. Skipping mapping.')
else:
    clean_letterboxd_movies = _fill_from_request_backup(clean_letterboxd_movies, 'topics')

print(clean_letterboxd_movies.head())

In [None]:
# Films that still have no doesthedog_id after all mapping steps; these are
# the candidates for a further request.
id_is_missing = clean_letterboxd_movies['doesthedog_id'].isna()
missing_doesthedog_df = clean_letterboxd_movies[id_is_missing]

print(missing_doesthedog_df.head())
print(f'Rows with empty ids: {len(missing_doesthedog_df)}')

# # convert to .csv
# missing_doesthedog_df.to_csv('../data/local/raw/pending_letterboxd_films_for_request.csv')

In [36]:
# Re-apply title normalisation before the (commented-out) export.
# NOTE(review): 'title' was already passed through clean_title earlier in this
# notebook and no visible step modifies it afterwards, so this looks redundant
# (harmless, since cleaning an already-clean title returns it unchanged) — confirm.
clean_letterboxd_movies['title'] = clean_letterboxd_movies['title'].apply(clean_title)

In [37]:
# clean_letterboxd_movies.to_csv('../data/clean/letterboxd_clean_films.csv', index=False)