# Datasets merging

In [1]:
import pandas as pd
import string
Data_path = 'Data/'

## Load the movies data

In [2]:
# TMDB movie metadata (release_date parsed as datetime) and the cast/crew credits table
info_df = pd.read_csv(Data_path + 'tmdb_5000_movies.csv', parse_dates=['release_date'])
credit_df = pd.read_csv(Data_path + 'tmdb_5000_credits.csv')

# year of the corresponding Oscar ceremony (release year + 1), used later on as a merge key
info_df['ceremony_year'] = info_df['release_date'].apply(lambda released: released.year + 1)

# join the credits onto the metadata by movie id; the credits' title column is
# dropped before the join and the duplicated movie_id key after it
movie_df = (
    info_df
    .merge(credit_df.drop(columns=['title']), how='outer', left_on='id', right_on='movie_id')
    .drop(columns=['movie_id'])
)

Let's check whether there are some NaN values sneaking around.

In [None]:
# check the different production companies : the ten most frequent ones
import ast

(
    movie_df['production_companies']
    # each entry is a string holding a list of dicts; keep only the company names
    .apply(lambda raw: [company['name'] for company in ast.literal_eval(raw)])
    # one row per (movie, company) pair
    .explode()
    .value_counts()
    .head(10)
)

In [3]:
# number of missing values in each column of the merged movie table
movie_df.isna().sum().to_frame(name='nbr of NAN')

Unnamed: 0,nbr of NAN
budget,0
genres,0
homepage,3091
id,0
keywords,0
original_language,0
original_title,0
overview,3
popularity,0
production_companies,0


It looks like two movies are missing their runtime value and one is missing its release date. Let's look up the duration of those two movies on the web and complete the data. We remove the row where the date is missing, as most of the values for this movie are not defined.

In [4]:
# The two movies without a runtime get a duration looked up by hand (google search).
# The values are assigned in the row order of the missing entries.
missing_runtime_rows = movie_df.index[movie_df['runtime'].isna()].to_list()
looked_up_runtimes = [98, 81]  # from google search
movie_df.loc[missing_runtime_rows, 'runtime'] = looked_up_runtimes

In [5]:
# discard every row that has no release date
movie_df = movie_df.dropna(subset=['release_date'])

In [6]:
# same NaN count as before, to confirm the cleaning worked
movie_df.isna().sum().to_frame(name='nbr of NAN')

Unnamed: 0,nbr of NAN
budget,0
genres,0
homepage,3090
id,0
keywords,0
original_language,0
original_title,0
overview,3
popularity,0
production_companies,0


In [7]:
# Store the ceremony year as an integer. It can only be cast once the row
# without a release date (whose year is NaN) has been dropped.
# release_date itself is deliberately left as a datetime: casting it with
# astype(int) would replace the dates by raw integer timestamps, not by years.
movie_df['ceremony_year'] = movie_df['ceremony_year'].astype(int)

## Load Oscars and Golden Globes data

In [8]:
# Oscars table; the unnamed first column of the CSV is discarded
oscar_df = pd.read_csv(Data_path + 'oscars_website.csv').drop(columns=['Unnamed: 0'])

In [9]:
# Golden Globes table; the unnamed first column of the CSV is discarded
GG_df = pd.read_csv(Data_path + 'goldenglobes_website.csv').drop(columns=['Unnamed: 0'])

The titles may differ between the datasets in letter case and in punctuation. To ensure a proper merge, we convert them to lower case and remove the punctuation.

In [10]:
def parse_title(s):
    """
    Normalise a title for matching : lowercase it, then drop every
    character listed in string.punctuation

    INPUT :
        |---- s : [string] title to normalise
    OUTPUT
        |---- [string] lowercased title without punctuation characters
    """
    lowered = s.lower()
    kept_chars = [char for char in lowered if char not in string.punctuation]
    return ''.join(kept_chars)

In [11]:
# normalise the titles of the three sources the same way so that they can be matched
movie_df['title'] = movie_df['title'].apply(parse_title)
oscar_df['Films'] = oscar_df['Films'].apply(parse_title)
GG_df['Films'] = GG_df['Films'].apply(parse_title)

The scraped Golden Globes title data is not perfectly clean: the release year of the movie sometimes appears in the title, and there is a space at the beginning of the title. Finally, for movies starting with _The_, _A_ or _An_, this article is placed at the end of the title instead of at the beginning.

In [12]:
def clean_GoldenGlobes_title(row):
    """
    Clean the title from GoldenGlobes data. Remove surrounding spaces.
    Remove the released year in the name. Move 'the', 'a', 'an', 'la', 'le'
    in front.

    The year is removed both before and after the article is moved, so that
    'purge the 2013' as well as 'purge 2013 the' end up as 'the purge'.
    A title made of a single word is never modified, so a movie whose whole
    title is a year (Ex : '1984') keeps its title.

    INPUT :
        |---- row : [list or pandas Series] Golden globes data row
        |           [Films, Year, Nomination, GoldenGlobes], read by position
    OUTPUT
        |---- s_cleaned : [string] cleaned title
    """
    # read the two first fields by position; a pandas Series needs .iloc for
    # that because plain integer keys on a labelled Series are deprecated
    if hasattr(row, 'iloc'):
        title, year = row.iloc[0], row.iloc[1]
    else:
        title, year = row[0], row[1]

    # the year written in a title is the ceremony year or the one before;
    # a year that is not a number (Ex : NaN) simply matches nothing
    try:
        ceremony_year = int(year)
        year_tokens = {str(ceremony_year), str(ceremony_year - 1)}
    except (TypeError, ValueError, OverflowError):
        year_tokens = set()

    def drop_trailing_blanks(words):
        # empty words appear when the title contains consecutive spaces
        # (Ex : 'title  1985' once the '-' of 'title - 1985' is removed)
        while len(words) > 1 and words[-1] == '':
            words.pop()

    def drop_trailing_year(words):
        # remove the year in title : Ex : title 1985 --> title
        if len(words) > 1 and words[-1] in year_tokens:
            words.pop()
            drop_trailing_blanks(words)

    # strip to remove the spaces around the title
    word_list = title.strip().split(' ')

    drop_trailing_year(word_list)

    # put determinant in front if necessary : Ex : Purge The --> The Purge
    if len(word_list) > 1 and word_list[-1] in ('the', 'a', 'an', 'la', 'le'):
        word_list.insert(0, word_list.pop())
        drop_trailing_blanks(word_list)

    # the year may only become the last word once the determinant is moved
    drop_trailing_year(word_list)

    return ' '.join(word_list)

In [13]:
# clean every Golden Globes title, one row at a time
GG_df['Films'] = GG_df.apply(clean_GoldenGlobes_title, axis='columns')

## Merge the dataframes

Because there are multiple movies with the same name (e.g. _Titanic_ of 1997 and _Titanic_ of 1953), we merge the dataframes on both the movie title and the ceremony year.

In [14]:
# Attach the Oscar results. Matching on the title AND the ceremony year keeps
# same-named movies from different years apart (the two Titanic, the two King Kong).
df = movie_df.merge(
    oscar_df,
    how='left',
    left_on=['ceremony_year', 'title'],
    right_on=['Year', 'Films'],
)
# movies without a matching Oscar row get 0 wins and 0 nominations
df = df.fillna({'Oscars': 0, 'Nominations': 0})

In [15]:
# Attach the Golden Globes results on the same (ceremony year, title) key.
# Column names present on both sides receive the _Oscars / _GoldenGlobes suffixes.
df = df.merge(
    GG_df,
    how='left',
    left_on=['ceremony_year', 'title'],
    right_on=['Year', 'Films'],
    suffixes=['_Oscars', '_GoldenGlobes'],
)
# movies without a matching Golden Globes row get 0 wins and 0 nominations
df = df.fillna({'GoldenGlobes': 0, 'Nominations_GoldenGlobes': 0})

In [16]:
# the award-side key columns duplicate title / ceremony_year once the merges are done
df = df.drop(columns=[
    'Films_Oscars', 'Year_Oscars',
    'Films_GoldenGlobes', 'Year_GoldenGlobes',
])

Then we save the merged dataframe.

In [17]:
# write the fully merged table (movies + Oscars + Golden Globes) to disk
df.to_csv(Data_path + 'merged_data.csv')

Extract nodes attributes

In [18]:
# keep the identifier, the title and the numeric per-movie columns, then save them
nodes_attributes_df = df[[
    'id', 'title',
    'budget', 'popularity', 'revenue', 'runtime',
    'vote_average', 'vote_count',
    'ceremony_year',
    'Oscars', 'Nominations_Oscars',
    'GoldenGlobes', 'Nominations_GoldenGlobes',
]]
nodes_attributes_df.to_csv(Data_path + 'nodes_attributes.csv')

Display the final dataframes

In [19]:
# preview : first three rows of each saved table
print('Whole Dataframe')
display(df.head(3))
print('Nodes attributes Dataframe')
display(nodes_attributes_df.head(3))

Whole Dataframe


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,title,vote_average,vote_count,ceremony_year,cast,crew,Oscars,Nominations_Oscars,GoldenGlobes,Nominations_GoldenGlobes
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,avatar,7.2,11800,2010.0,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",3.0,9.0,2.0,4.0
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,pirates of the caribbean at worlds end,6.9,4500,2008.0,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",0.0,2.0,0.0,0.0
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,spectre,6.3,4466,2016.0,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",1.0,1.0,1.0,1.0


Nodes attributes Dataframe


Unnamed: 0,id,title,budget,popularity,revenue,runtime,vote_average,vote_count,ceremony_year,Oscars,Nominations_Oscars,GoldenGlobes,Nominations_GoldenGlobes
0,19995,avatar,237000000,150.437577,2787965087,162.0,7.2,11800,2010.0,3.0,9.0,2.0,4.0
1,285,pirates of the caribbean at worlds end,300000000,139.082615,961000000,169.0,6.9,4500,2008.0,0.0,2.0,0.0,0.0
2,206647,spectre,245000000,107.376788,880674609,148.0,6.3,4466,2016.0,1.0,1.0,1.0,1.0
