In [20]:
import pandas as pd

# Load the three tables that need no special parser options; the generator
# reads them in the listed order and unpacks them into their own names.
df_movies, df_reviews, df_wiki = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/rotten_tomatoes_movies.csv',
        'data/rotten_tomatoes_movie_reviews.csv',
        'data/wiki_movie_plots.csv',
    )
)

# The metadata file is parsed with low_memory=False, i.e. in a single pass
# instead of chunk-by-chunk, so dtype inference is done over whole columns.
df_metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)

In [21]:
def column_stats(df):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It may have zero rows or consist only of
        missing values; the statistics that cannot be computed are NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``df`` with these columns:

        * "Data Type"          - dtype of the column
        * "Count"              - number of non-missing values
        * "Missing Values"     - number of missing values
        * "Missing Value Rate" - missing values as a percentage of all rows
        * "Unique Values"      - number of distinct non-missing values
        * "Unique Rate"        - distinct values as a percentage of all rows
        * "Mean", "Median", "Min", "Max" - numeric columns only, NaN elsewhere
        * "Mode"               - first row of ``df.mode()`` for every column
    """
    n_rows = df.shape[0]

    # Compute each intermediate once; they are reused by several statistics.
    missing = df.isna().sum()
    unique = df.nunique()
    numeric = df.select_dtypes(include=["number"])

    # DataFrame.mode() has no rows at all when df is empty or every column is
    # entirely missing, so indexing row 0 unconditionally raises IndexError.
    modes = df.mode()
    if len(modes) > 0:
        first_mode = modes.iloc[0]
    else:
        first_mode = pd.Series(float("nan"), index=df.columns, dtype="object")

    # The numeric-only Series have a shorter index than the others; the
    # DataFrame constructor aligns them and fills the gaps with NaN.
    stats = pd.DataFrame({
        "Data Type": df.dtypes,
        "Count": df.count(),
        "Missing Values": missing,
        "Missing Value Rate": missing / n_rows * 100,
        "Unique Values": unique,
        "Unique Rate": unique / n_rows * 100,
        "Mean": numeric.mean(),
        "Median": numeric.median(),
        "Mode": first_mode,
        "Min": numeric.min(),
        "Max": numeric.max(),
    })
    return stats

In [22]:
# Profile each raw table before any cleaning or joining; the summaries are
# rendered in the same order as the frames are listed here.
display(*(
    column_stats(table)
    for table in (df_movies, df_reviews, df_wiki, df_metadata)
))

Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
audienceScore,float64,73248,70010,48.869871,101,0.070502,55.674967,57.0,50.0,0.0,100.0
boxOffice,object,14743,128515,89.708777,4863,3.394575,,,$1.1M,,
director,object,139041,4217,2.94364,62206,43.422357,,,Unknown Director,,
distributor,object,23001,120257,83.944352,3693,2.577867,,,Paramount Pictures,,
genre,object,132175,11083,7.736392,2912,2.032696,,,Drama,,
id,object,143258,0,0.0,142052,99.158162,,,$5_a_day,,
originalLanguage,object,129400,13858,9.673456,112,0.078181,,,English,,
rating,object,13991,129267,90.233704,10,0.00698,,,R,,
ratingContents,object,13991,129267,90.233704,8353,5.830739,,,['Language'],,
releaseDateStreaming,object,79420,63838,44.56156,4726,3.298943,,,2017-05-22,,


Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
creationDate,object,1444963,0,0.0,8510,0.588942,,,2000-01-01,,
criticName,object,1444963,0,0.0,15510,1.073384,,,Emanuel Levy,,
id,object,1444963,0,0.0,69263,4.79341,,,parasite_2019,,
isTopCritic,bool,1444963,0,0.0,2,0.000138,,,False,,
originalScore,object,1009745,435218,30.119664,1729,0.119657,,,3/5,,
publicatioName,object,1444963,0,0.0,2707,0.18734,,,New York Times,,
reviewId,int64,1444963,0,0.0,1432569,99.142262,9035203.0,2200337.0,7726,1.0,102796154.0
reviewState,object,1444963,0,0.0,2,0.000138,,,fresh,,
reviewText,object,1375738,69225,4.79078,1359771,94.104209,,,Parental Content Review,,
reviewUrl,object,1234038,210925,14.59726,1138350,78.780564,,,http://www.jackiekcooper.com,,


Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
Cast,object,33464,1422,4.076134,32182,92.24904,,,Tom and Jerry,,
Director,object,34886,0,0.0,12593,36.097575,,,Unknown,,
Genre,object,34886,0,0.0,2265,6.492576,,,unknown,,
Origin/Ethnicity,object,34886,0,0.0,24,0.068796,,,American,,
Plot,object,34886,0,0.0,33869,97.08479,,,"(マッスル人参争奪！超人大戦争, Massuru Ninjin Soudatsu! Chou...",,
Release Year,int64,34886,0,0.0,117,0.335378,1981.314252,1988.0,2013.0,1901.0,2017.0
Title,object,34886,0,0.0,32432,92.96566,,,Cinderella,,
Wiki Page,object,34886,0,0.0,34070,97.660953,,,https://en.wikipedia.org/wiki/Digimon_Adventur...,,


Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
adult,object,45466,0,0.0,5,0.010997,,,False,,
belongs_to_collection,object,4494,40972,90.115691,1698,3.734659,,,"{'id': 415931, 'name': 'The Bowery Boys', 'pos...",,
budget,object,45466,0,0.0,1226,2.69652,,,0,,
genres,object,45466,0,0.0,4069,8.949545,,,"[{'id': 18, 'name': 'Drama'}]",,
homepage,object,7782,37684,82.883913,7673,16.876347,,,http://www.georgecarlin.com,,
id,object,45466,0,0.0,45436,99.934017,,,141971,,
imdb_id,object,45449,17,0.037391,45417,99.892227,,,0,,
original_language,object,45455,11,0.024194,92,0.202349,,,en,,
original_title,object,45466,0,0.0,43373,95.39656,,,Alice in Wonderland,,
overview,object,44512,954,2.098271,44307,97.450842,,,No overview found.,,


In [23]:
# Normalise titles (trim surrounding whitespace, lower-case) so the three
# sources share a comparable "title" key. df_wiki keeps its original "Title"
# column and gains a new lower-case "title" column next to it.
df_movies["title"] = df_movies["title"].str.strip().str.lower()
df_wiki["title"] = df_wiki["Title"].str.strip().str.lower()
df_metadata["title"] = df_metadata["title"].str.strip().str.lower()

# Inner-join the three sources on the normalised title.
# NOTE(review): titles are not unique within the inputs (the same title can
# appear on several rows), so this is a many-to-many match and the result can
# contain more rows than any single input. Consider a stricter key (for
# example title plus release year) if one row per film is required.
df_joined = (
    df_movies
    .merge(df_wiki, on="title", how="inner")
    .merge(df_metadata, on="title", how="inner")
)

# Attach the number of reviews per movie id. Both df_movies and df_metadata
# have an "id" column, so after the joins above the df_movies one is "id_x".
# The left join keeps every joined row; review_count is NaN where the id has
# no rows in df_reviews.
review_counts = df_reviews.groupby('id').size().reset_index(name='review_count')
df_joined = df_joined.merge(review_counts, left_on='id_x', right_on='id', how='left')

# The original cell also had a bare `df_joined.head()` before this print; it
# was not the last expression of the cell, so it displayed nothing.
print(df_joined.shape)

(49763, 49)


In [24]:
# Profile the joined table with the same per-column summary used for the
# raw inputs.
display(df_joined.pipe(column_stats))

Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
Cast,object,48101,1662,3.339831,15068,30.279525,,,"Kenneth Branagh, Derek Jacobi, Julie Christie,...",,
Director,object,49763,0,0.0,6213,12.48518,,,Unknown,,
Genre,object,49763,0,0.0,1240,2.491811,,,drama,,
Origin/Ethnicity,object,49763,0,0.0,20,0.040191,,,American,,
Plot,object,49763,0,0.0,15397,30.940659,,,"After losing her mother at a young age, Ella p...",,
Release Year,int64,49763,0,0.0,113,0.227076,1981.188,1990.0,2012.0,1902.0,2017.0
Title,object,49763,0,0.0,14015,28.163495,,,Cinderella,,
Wiki Page,object,49763,0,0.0,15421,30.988887,,,https://en.wikipedia.org/wiki/Cinderella_(2015...,,
adult,object,49763,0,0.0,1,0.00201,,,False,,
audienceScore,float64,37680,12083,24.281092,100,0.200953,57.23769,59.0,50.0,0.0,100.0


In [26]:
# Keep only rows that have a boxOffice value. Reassigning the result (rather
# than mutating with inplace=True) keeps the step explicit; the name
# df_joined is retained because the following cells read it.
df_joined = df_joined.dropna(subset=['boxOffice'])

In [27]:
# Re-profile df_joined in its current state so the effect of the preceding
# filtering step on every column is visible.
display(df_joined.pipe(column_stats))

Unnamed: 0,Data Type,Count,Missing Values,Missing Value Rate,Unique Values,Unique Rate,Mean,Median,Mode,Min,Max
Cast,object,11735,284,2.362925,7144,59.439221,,,Bill Condon (director/screenplay); Evan Spilio...,,
Director,object,12019,0,0.0,3645,30.326982,,,Unknown,,
Genre,object,12019,0,0.0,749,6.2318,,,drama,,
Origin/Ethnicity,object,12019,0,0.0,20,0.166403,,,American,,
Plot,object,12019,0,0.0,7153,59.514103,,,"After losing her mother at a young age, Ella p...",,
Release Year,int64,12019,0,0.0,107,0.890257,1994.003,2000.0,2011.0,1903.0,2017.0
Title,object,12019,0,0.0,6349,52.824694,,,Cinderella,,
Wiki Page,object,12019,0,0.0,7160,59.572344,,,https://en.wikipedia.org/wiki/Cinderella_(2015...,,
adult,object,12019,0,0.0,1,0.00832,,,False,,
audienceScore,float64,11924,95,0.790415,94,0.782095,60.03028,61.0,80.0,0.0,100.0
