In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Load movies_cln.csv, parsing the release_date column as datetimes at read time.
df = pd.read_csv(
    'movies_cln.csv',
    parse_dates=['release_date'],
)

In [3]:
# Load the credits table (cast, crew and id columns).
cr = pd.read_csv(filepath_or_buffer='credits.csv')

In [4]:
# Preview the first two credit rows to see how cast/crew are stored.
cr.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


### Checking the duplicates

In [5]:
# List every row whose id occurs more than once (keep=False flags all copies),
# ordered by id so the repeated entries sit next to each other.
cr.loc[cr['id'].duplicated(keep=False)].sort_values('id')

Unnamed: 0,cast,crew,id
25885,"[{'cast_id': 12, 'character': 'The Creature', ...","[{'credit_id': '52fe4380c3a36847f80590dd', 'de...",3057
25950,"[{'cast_id': 12, 'character': 'The Creature', ...","[{'credit_id': '52fe4380c3a36847f80590dd', 'de...",3057
33838,"[{'cast_id': 15, 'character': 'Chuck Barris', ...","[{'credit_id': '52fe43e2c3a36847f80760a9', 'de...",4912
5865,"[{'cast_id': 15, 'character': 'Chuck Barris', ...","[{'credit_id': '52fe43e2c3a36847f80760b5', 'de...",4912
9165,"[{'cast_id': 11, 'character': 'Jef Costello', ...","[{'credit_id': '52fe440ac3a36847f807ee01', 'de...",5511
...,...,...,...
25887,"[{'cast_id': 7, 'character': 'Hollander', 'cre...","[{'credit_id': '52fe4da29251416c9111ce5d', 'de...",199591
24163,"[{'cast_id': 2, 'character': 'Ebba', 'credit_i...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...",265189
45275,"[{'cast_id': 2, 'character': 'Ebba', 'credit_i...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...",265189
33196,"[{'cast_id': 1, 'character': 'Jenjira', 'credi...","[{'credit_id': '5448c8efc3a3680fb4001582', 'de...",298721


In [6]:
# Keep only the first row for each id. Reassigning (instead of inplace=True)
# yields the same `cr` while avoiding in-place mutation of the loaded frame.
cr = cr.drop_duplicates(subset='id')

### Checking how the `id` values of the cr and df DataFrames overlap

In [7]:
# Shape of the credit rows whose id also appears in df.
cr.loc[cr['id'].isin(df['id'])].shape

(45425, 3)

In [8]:
# Shape of the credit rows whose id has no match in df.
cr.loc[~cr['id'].isin(df['id'])].shape

(7, 3)

There are 7 rows in the cr DataFrame whose `id` does not appear in the df DataFrame.

In [9]:
# Shape of the movie rows whose id also appears in cr.
df.loc[df['id'].isin(cr['id'])].shape

(45425, 19)

In [10]:
# Shape of the movie rows whose id has no match in cr.
df.loc[~df['id'].isin(cr['id'])].shape

(1, 19)

So there is only 1 row in the df DataFrame whose `id` does not appear in the cr DataFrame.

### Merging the df and cr Dataframe

In [11]:
# Left join: every movie row is kept, and movies without a matching credits
# row get NaN in cast/crew. validate='many_to_one' makes pandas raise a
# MergeError if cr ever contains a repeated id, so the join can never
# silently duplicate movie rows.
df = df.merge(cr, how='left', on='id', validate='many_to_one')

In [12]:
# Preview the merged frame; cast and crew now appear as the last columns.
df.head(2)

Unnamed: 0,belongs_to_collection,budget_mio,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,...,revenue_mio,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,Toy Story Collection,30.0,"Animation,Comedy,Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4v...,Pixar Animation Studios,United States of America,...,373.554033,81.0,English,Released,,Toy Story,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,,65.0,"Adventure,Fantasy,Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKP...,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,...,262.797249,104.0,"English,Français",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."


### Handling the 'cast' column

In [13]:
# Raw cast value of the first row.
df.loc[0, 'cast']

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

The column again holds a stringified list of dicts (Python-literal syntax with single quotes, not valid JSON), so it has to be parsed before use.

In [14]:
# Turn each stringified list of cast dicts into real Python objects.
# Non-string cells (e.g. NaN for movies without credits) pass through unchanged.
df['cast'] = df['cast'].map(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

In [15]:
# First cast entry of the first row, now a dict after parsing.
df.loc[0, 'cast'][0]

{'cast_id': 14,
 'character': 'Woody (voice)',
 'credit_id': '52fe4284c3a36847f8024f95',
 'gender': 2,
 'id': 31,
 'name': 'Tom Hanks',
 'order': 0,
 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}

We only need to extract the 'name' field.

In [16]:
# Number of cast entries in the first row.
len(df.loc[0, 'cast'])

13

In [17]:
# Number of cast members per movie; rows whose cast is not a parsed list
# (no credits matched) stay NaN.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df['cast_size'] = df['cast'].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

In [18]:
# Collapse each list of cast dicts into one comma-separated string of names.
# Cells that are not lists are left as they are.
df['cast'] = df['cast'].map(
    lambda members: ','.join(member['name'] for member in members)
    if isinstance(members, list)
    else members
)

In [19]:
# Preview: cast is now a comma-separated string and cast_size has been added.
df.head(2)

Unnamed: 0,belongs_to_collection,budget_mio,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,cast_size
0,Toy Story Collection,30.0,"Animation,Comedy,Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4v...,Pixar Animation Studios,United States of America,...,81.0,English,Released,,Toy Story,7.7,5415.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",13.0
1,,65.0,"Adventure,Fantasy,Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKP...,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,...,104.0,"English,Français",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",26.0


In [20]:
# Frequency of each cast size, most common first.
df['cast_size'].value_counts()

10.0     2802
8.0      2766
7.0      2749
6.0      2691
5.0      2683
         ... 
183.0       1
165.0       1
151.0       1
130.0       1
109.0       1
Name: cast_size, Length: 151, dtype: int64

### Handling the 'crew' column

In [21]:
# Turn each stringified list of crew dicts into real Python objects.
# Non-string cells (e.g. NaN for movies without credits) pass through unchanged.
df['crew'] = df['crew'].map(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

In [22]:
# Number of crew members per movie; rows whose crew is not a parsed list
# (no credits matched) stay NaN.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df['crew_size'] = df['crew'].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

For the crew we need only the director name

In [23]:
# Collect the names of all crew members whose job is 'Director'.
# Names are joined with ',' (the same separator used for the cast column) so
# co-directors stay distinguishable instead of being fused into one string.
# A crew list with no director yields '', and a non-list crew value yields NaN.
df['director'] = df['crew'].apply(
    lambda x: ','.join(i['name'] for i in x if i['job'] == 'Director')
    if isinstance(x, list)
    else np.nan
)

In [24]:
# An empty string means the crew list had no director; store NaN instead.
df['director'] = df['director'].mask(df['director'].eq(''))

In [25]:
# Preview: crew_size and director have been added.
df.head(2)

Unnamed: 0,belongs_to_collection,budget_mio,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,...,status,tagline,title,vote_average,vote_count,cast,crew,cast_size,crew_size,director
0,Toy Story Collection,30.0,"Animation,Comedy,Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4v...,Pixar Animation Studios,United States of America,...,Released,,Toy Story,7.7,5415.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",13.0,106.0,John Lasseter
1,,65.0,"Adventure,Fantasy,Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKP...,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,...,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",26.0,16.0,Joe Johnston


We can drop the 'crew' column

In [26]:
# Remove the parsed crew column; `columns=` already selects the axis.
df = df.drop(columns='crew')

In [27]:
# Write the result to disk without the row index.
df.to_csv(path_or_buf='movies_final.csv', index=False)