# Merge Data Sources

In [None]:
import pandas as pd
import os
from IPython.core.display import Markdown
import re
from utilites import column_stats

# Notebook display settings: no limit on displayed columns, at most 20 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

In [None]:
# Load the three source tables from ./deduped_data in one pass.
rt_movies, movies_metadata, wiki_plots = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "./deduped_data/rotten_tomatoes_movies.csv",
        "./deduped_data/movies_metadata.csv",
        "./deduped_data/wiki_movie_plots.csv",
    )
)

In [None]:
def standardize_column_names(df):
    """Rename every column of ``df`` to snake_case in place and return ``df``.

    For each column name: runs of whitespace, ``/`` and ``-`` become a single
    underscore, an underscore is inserted at each lowercase-to-uppercase
    boundary, repeated underscores are collapsed, and the result is
    lowercased with leading/trailing underscores removed.
    """
    # Applied in order; each entry is (pattern, replacement).
    rewrites = (
        (r'[\s/-]+', '_'),              # separators -> underscore
        (r'([a-z])([A-Z])', r'\1_\2'),  # camelCase boundary -> underscore
        (r'__+', '_'),                  # collapse underscore runs
    )

    def to_snake_case(name):
        for pattern, replacement in rewrites:
            name = re.sub(pattern, replacement, name)
        return name.lower().strip('_')

    df.columns = list(map(to_snake_case, df.columns))
    return df

In [None]:
# Normalise the column names of all three tables to snake_case.
rt_movies, movies_metadata, wiki_plots = map(
    standardize_column_names,
    (rt_movies, movies_metadata, wiki_plots),
)

In [None]:
# Left-join movies_metadata onto rt_movies, matching on title plus release
# date (release_date_theaters on the left, release_date on the right).
merged_df = pd.merge(
    rt_movies,
    movies_metadata,
    how='left',
    left_on=['title', 'release_date_theaters'],
    right_on=['title', 'release_date'],
)

print(merged_df.shape)

In [None]:
# Left-join wiki_plots onto the running result where both title and director match.
merged_df = pd.merge(merged_df, wiki_plots, how='left', on=['title', 'director'])

In [None]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

In [None]:
# column_stats is imported from the project-local `utilites` module;
# presumably it reports per-column statistics for the frame — TODO confirm
# against that module.
column_stats(merged_df)

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the merged table (without the index).
os.makedirs("./merged_data", exist_ok=True)
merged_df.to_csv("./merged_data/merged.csv", index=False)