# Merge Data Sources

In [None]:
import pandas as pd
import re
from scripts.utilites import column_stats

# Notebook display settings: never truncate columns, cap printed rows at 50.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the three source tables. Each file is read with low_memory=False,
# exactly as before, and bound to its own top-level name.
rt_movies, movies_metadata, wiki_plots = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "merged_data/rotten_tomatoes_movies.csv",
        "merged_data/movies_metadata.csv",
        "merged_data/wiki_movie_plots.csv",
    )
)

In [None]:
def standardize_column_names(df):
    """Rename every column of *df* to snake_case, in place.

    Each label has runs of whitespace, ``/`` and ``-`` turned into a single
    underscore, gets an underscore inserted at every lower-to-upper case
    boundary, has repeated underscores collapsed, and is finally lower-cased
    with leading/trailing underscores removed.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with ``df.columns`` replaced.
    """
    # Applied in order; each entry is (regex pattern, replacement).
    substitutions = (
        (r'[\s/-]+', '_'),                # separators -> underscore
        (r'([a-z])([A-Z])', r'\1_\2'),    # camelCase boundary -> underscore
        (r'__+', '_'),                    # collapse repeated underscores
    )

    def snake(label):
        for pattern, replacement in substitutions:
            label = re.sub(pattern, replacement, label)
        return label.lower().strip('_')

    df.columns = list(map(snake, df.columns))
    return df

In [None]:
# Convert the column labels of all three tables to snake_case.
rt_movies, movies_metadata, wiki_plots = (
    standardize_column_names(frame)
    for frame in (rt_movies, movies_metadata, wiki_plots)
)

# Spot check: show the rows whose title contains "Star Wars" (case-insensitive,
# missing titles treated as non-matching) in two of the tables.
for titled_frame in (wiki_plots, movies_metadata):
    star_wars_rows = titled_frame["title"].str.contains("Star Wars", case=False, na=False)
    display(titled_frame[star_wars_rows])

In [None]:
def standardize_titles(df, column="title"):
    """Normalise the text in ``df[column]`` so titles can be used as join keys.

    Every string value is lower-cased, stripped, has accents removed
    (``"Amélie"`` -> ``"amelie"``), loses every character that is not
    ``a-z``, ``0-9`` or whitespace, and has whitespace runs collapsed to a
    single space. Missing values are left missing.

    Previously accented letters were deleted outright (``"Amélie"`` ->
    ``"amlie"``), which mangled the key and prevented matches against an
    unaccented spelling of the same title in another dataset.

    Args:
        df: DataFrame containing ``column``; the column is overwritten in place.
        column: Name of the column holding the titles.

    Returns:
        The same DataFrame object, with ``df[column]`` normalised.
    """
    # Local import so the block does not depend on a file-level import.
    import unicodedata

    def _normalize(value):
        # Values that are not strings (e.g. NaN for a missing title) pass through.
        if not isinstance(value, str):
            return value
        # NFKD splits an accented letter into base letter + combining mark;
        # dropping the marks keeps the base letter instead of losing it.
        decomposed = unicodedata.normalize("NFKD", value)
        without_marks = "".join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )
        kept = re.sub(r'[^a-z0-9\s]', '', without_marks)
        # split()/join collapses internal whitespace and trims the ends.
        return ' '.join(kept.split())

    df[column] = df[column].str.lower().str.strip().apply(_normalize)
    return df

In [None]:
# Normalise the "title" column of all three tables so they can be joined on it.
rt_movies, movies_metadata, wiki_plots = (
    standardize_titles(frame)
    for frame in (rt_movies, movies_metadata, wiki_plots)
)

In [None]:
# Outer-join the Rotten Tomatoes table with the movies metadata on the
# normalised title, keeping rows that appear in only one of the two tables.
title_key = ['title']
merged_df = pd.merge(
    rt_movies,
    movies_metadata,
    how='outer',
    left_on=title_key,
    right_on=title_key,
)

# Spot check the join on the rows whose title contains "Star Wars".
star_wars_rows = merged_df["title"].str.contains("Star Wars", case=False, na=False)
display(merged_df[star_wars_rows])

# Per-column summary of the merged table (helper from scripts.utilites).
display(column_stats(merged_df))

In [None]:
# Outer-join the running result with the wiki plots table; a row matches only
# when both the title and the director agree.
merged_df = pd.merge(
    merged_df,
    wiki_plots,
    how='outer',
    on=['title', 'director'],
)

# Spot check the join on the rows whose title contains "Star Wars".
star_wars_rows = merged_df["title"].str.contains("Star Wars", case=False, na=False)
display(merged_df[star_wars_rows])


In [None]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

In [None]:
# Drop unlabeled data: remove, in place, the rows in which every one of the
# label columns below is missing (rows with at least one of them are kept).
label_columns = ['box_office', 'wiki_page']
merged_df.dropna(how='all', subset=label_columns, inplace=True)

# Per-column summary after filtering (helper from scripts.utilites).
column_stats(merged_df)

In [None]:
# Write the merged table to disk, omitting the row index.
merged_df.to_csv(path_or_buf="./merged_data/merged.csv", index=False)