In [None]:
import pandas as pd
import ast
import numpy as np

In [None]:
# Prefix prepended to every CSV path read or written in this notebook.
# Left empty, the file names are used as-is (relative to the working directory).
curr_dir = ""

Load data from files

In [None]:
# Load every raw CSV. Rows that are completely empty are dropped and the index is
# renumbered 0..n-1 because later cells address rows by position.
# reset_index(drop=True) is used so the old index is discarded instead of being
# inserted into the data as a stray 'index' column.
movie_list = pd.read_csv(curr_dir + 'Movie_list.csv', delimiter=',', quotechar='"')
movie_details = (
    pd.read_csv(curr_dir + 'movie_details.csv', delimiter=',', quotechar='"')
    .dropna(how='all')
    .reset_index(drop=True)
)
cast_details = (
    pd.read_csv(curr_dir + 'cast_details.csv', delimiter=',', quotechar='"')
    .dropna(how='all')
    .reset_index(drop=True)
)
# The crew file is read with header=None, so the column names are supplied here.
crew_details = (
    pd.read_csv(
        curr_dir + 'crew_details.csv',
        delimiter=',',
        quotechar='"',
        header=None,
        names=['adult', 'gender', 'id', 'known_for_department', 'name', 'original_name',
               'popularity', 'profile_path', 'credit_id', 'department', 'job', 'movie_id', 'order'],
    )
    .dropna(how='all')
    .reset_index(drop=True)
)
collection_details = (
    pd.read_csv(curr_dir + 'Collection_details.csv', delimiter=',', quotechar='"')
    .dropna(how='all')
    .reset_index(drop=True)
)
genre_list = (
    pd.read_csv(curr_dir + 'data_genre.csv', delimiter=',', quotechar='"')
    .dropna(how='all')
    .reset_index(drop=True)
)

In [None]:
#Removing records with no target label
# A revenue of 0 is treated as "no label". The filter builds a new frame instead of
# mutating in place, and drop=True keeps the old row numbers from being added as an
# extra column while the index is renumbered 0..n-1 for the positional loops below.
movie_details = movie_details[movie_details.revenue != 0].reset_index(drop=True)

Feature Selection

In [None]:
#Keeping only relevant columns from each file
# Each selection is copied explicitly: later cells add and overwrite columns on these
# frames, and writing to an un-copied column slice triggers pandas'
# SettingWithCopyWarning (and may silently fail to update the data).
features_movie_details = movie_details[
    ['adult', 'belongs_to_collection', 'budget', 'genres', 'id', 'production_countries',
     'release_date', 'revenue', 'runtime', 'original_language']
].copy()
features_cast_details = cast_details[
    ['gender', 'known_for_department', 'popularity', 'movie_id', 'order']
].copy()
features_crew_details = crew_details[
    ['gender', 'known_for_department', 'popularity', 'department', 'movie_id', 'order']
].copy()
features_collection_details = collection_details[['id', 'parts']].copy()

Data Processing and Feature Creation

In [None]:
#Creating Collection popularity field by taking the average of popularities of all movies in a collection
def _mean_part_popularity(parts_literal):
    """Return the mean 'popularity' of the movies listed in one 'parts' cell.

    parts_literal is the Python-literal text of a list of dicts, each carrying a
    'popularity' entry. An empty list yields NaN instead of dividing by zero.
    """
    movies = ast.literal_eval(parts_literal)
    if not movies:
        return np.nan
    return sum(movie['popularity'] for movie in movies) / len(movies)


# The mapped Series keeps collection_details' index, so assign() lines the values up
# with the matching rows; assign() also returns a new frame rather than writing into
# a column slice.
features_collection_details = features_collection_details.assign(
    collection_popularity=collection_details['parts'].map(_mean_part_popularity)
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
#One hot encoding of genre field in movie details file.
# The indicator frame is sized from the data (one row per movie, one column per genre
# in genre_list) rather than from hard-coded dimensions, and shares the movie frame's
# index so the concat below lines rows up exactly.
genre_columns = [f'{name}_genre' for name in genre_list.name.unique()]
known_genre_columns = set(genre_columns)
temp_fs_df = pd.DataFrame(0.0, index=features_movie_details.index, columns=genre_columns)

for row_label, genres in features_movie_details.genres.items():
    try:
        row_columns = [entry['name'] + '_genre' for entry in ast.literal_eval(genres)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Missing or malformed genre text: report it and leave the row all zeros.
        print(genres)
        continue

    recognised = [column for column in row_columns if column in known_genre_columns]
    if len(recognised) != len(row_columns):
        # A genre that is absent from genre_list: report the row, keep the known genres.
        print(genres)
    if recognised:
        # One .loc call with (row, columns); chained `df[col].loc[row] = 1` is not
        # guaranteed to write through to the frame.
        temp_fs_df.loc[row_label, recognised] = 1

features_movie_details = pd.concat([features_movie_details, temp_fs_df], axis=1)

In [None]:
#One hot encoding of gender field in cast details file
# Column names are '<gender value>_gender', built from the value's string form (so a
# float 0.0 becomes '0.0_gender'; these are renamed in the crew cell). Using an
# f-string avoids depending on the builtin name `str`.
gender_labels = features_cast_details.gender.map(lambda value: f'{value}_gender')

# get_dummies sizes the result from the data (no hard-coded row/column counts) and
# keeps the cast frame's index; columns are reordered to first-appearance order.
temp_fs_df = pd.get_dummies(gender_labels, dtype=float)[list(gender_labels.unique())]

features_cast_details = pd.concat([features_cast_details, temp_fs_df], axis=1)

In [None]:
#One hot encoding of gender field in crew details file
# Labels are built from int(value), so the columns are always '0_gender', '1_gender', ...
# whether the raw values are ints or floats. A missing gender raises ValueError from
# int(), so bad data is not silently encoded.
gender_labels = features_crew_details.gender.map(lambda value: f'{int(value)}_gender')

# Sized from the data instead of hard-coded dimensions; shares the crew frame's index.
temp_fs_df = pd.get_dummies(gender_labels, dtype=float)[list(gender_labels.unique())]

features_crew_details = pd.concat([features_crew_details, temp_fs_df], axis=1)

# Bring the cast indicator columns (named from float values) in line with the crew
# naming. Labels that are not present are ignored by rename.
features_cast_details = features_cast_details.rename(
    columns={'0.0_gender': '0_gender', '1.0_gender': '1_gender',
             '2.0_gender': '2_gender', '3.0_gender': '3_gender'}
)

In [None]:
#Updating belongs_to_collection field with only the id of collection and creating field Part_of_series
def _collection_id(collection_info):
    """Extract the 'id' entry from one belongs_to_collection cell.

    Values that are not text (for example ids already extracted by an earlier run of
    this cell) are returned unchanged. Text that cannot be parsed as a Python literal,
    or that has no 'id', yields NaN instead of aborting the whole cell.
    """
    if not isinstance(collection_info, str):
        return collection_info
    try:
        return ast.literal_eval(collection_info)['id']
    except (ValueError, SyntaxError, KeyError, TypeError):
        return np.nan


raw_collection = features_movie_details.belongs_to_collection
# Any non-missing value marks the movie as part of a series.
part_of_series = raw_collection.notna()
# na_action='ignore' leaves missing values as NaN without calling the parser.
# The resulting column holds numeric ids plus NaN.
collection_ids = raw_collection.map(_collection_id, na_action='ignore')

unparsed_count = int((part_of_series & collection_ids.isna()).sum())
if unparsed_count:
    print(f'{unparsed_count} belongs_to_collection value(s) could not be parsed; collection id left empty')

# assign() returns a new frame, avoiding chained `df[col].loc[row] = value` writes.
features_movie_details = features_movie_details.assign(
    belongs_to_collection=collection_ids,
    Part_of_series=part_of_series.astype(int),
)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


SyntaxError: ignored

In [None]:
#Due to large number of production countries, this field was left out of the analysis
# Walk every movie's production_countries literal and collect the country names;
# dict.fromkeys removes repeats while keeping first-seen order.
country_names = (
    country['name']
    for raw_countries in movie_details.production_countries
    for country in ast.literal_eval(raw_countries)
)
production_countries = list(dict.fromkeys(country_names))

len(production_countries)

112

In [None]:
#Creating fields Acting relevance in cast details and directing relevance in crew details
# The flag is 1 when the person's known_for_department matches the role, else 0
# (a missing department counts as 0). Only the flag column is written:
# `frame[mask] = 0` would overwrite every column of the non-matching rows,
# including movie_id and order, which the later joins depend on.
features_cast_details = features_cast_details.assign(
    Acting_relevance=(features_cast_details.known_for_department == 'Acting').astype(int)
)

features_crew_details = features_crew_details.assign(
    Directing_relevance=(features_crew_details.known_for_department == 'Directing').astype(int)
)

Merge Datasets and create final dataset

In [None]:
#Adding suffix to column headers to avoid confusion after joins
def _with_suffix(frame, suffix):
    """Return a copy of frame whose column labels all end with suffix."""
    return frame.rename(columns=lambda column: f"{column}{suffix}")


features_movie_details = _with_suffix(features_movie_details, '_main')
features_cast_details = _with_suffix(features_cast_details, '_cast')
features_crew_details = _with_suffix(features_crew_details, '_crew')
features_collection_details = _with_suffix(features_collection_details, '_coll_details')

In [None]:
#Removing duplicates from collection details file to give unique collections
# Flag every row whose collection id was already seen, then keep the unflagged rows
# (i.e. the first occurrence of each id).
is_repeated_collection = features_collection_details.duplicated(subset=['id_coll_details'], keep='first')
features_collection_details = features_collection_details[~is_repeated_collection]

In [None]:
#Joining movie details with collection details
# validate="many_to_one" makes pandas raise if a collection id appears more than once
# on the right-hand side.
overall_Data = features_movie_details.merge(
    features_collection_details,
    left_on='belongs_to_collection_main',
    right_on="id_coll_details",
    how="left",
    validate="many_to_one",
)

#Joining new dataframe with crew details. This was done in three joins to get the all the crew for a movie in the same record
# One left join per `order` value. Columns already present keep their name and the
# newly joined duplicates receive the '_<order>' suffix.
for crew_order in (1, 2, 3):
    crew_at_order = features_crew_details[features_crew_details.order_crew == crew_order]
    overall_Data = overall_Data.merge(
        crew_at_order,
        left_on='id_main',
        right_on="movie_id_crew",
        suffixes=('', f'_{crew_order}'),
        how="left",
    )

#Joining new dataframe with cast details. This was done in five joins to get the all the cast for a movie in the same record
for cast_order in (1, 2, 3, 4, 5):
    cast_at_order = features_cast_details[features_cast_details.order_cast == cast_order]
    overall_Data = overall_Data.merge(
        cast_at_order,
        left_on='id_main',
        right_on='movie_id_cast',
        suffixes=('', f'_{cast_order}'),
        how="left",
    )

In [None]:
#Cleaning the joined dataset and creating final dataset for export
# Keep a single row per movie id (the first one encountered after the joins).
overall_Data = overall_Data.drop_duplicates(subset=['id_main'], keep='first')

# Column names are generated once instead of being spelled out three times.
genre_names = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
               'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
               'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']
movie_columns = ['adult_main', 'budget_main', 'id_main', 'release_date_main', 'revenue_main',
                 'runtime_main', 'original_language_main']
genre_columns = [f'{name}_genre_main' for name in genre_names]
crew_fields = ['popularity', '0_gender', '1_gender', '2_gender', '3_gender', 'Directing_relevance']
cast_fields = ['popularity', '2_gender', '1_gender', '0_gender', '3_gender', 'Acting_relevance']
# The first joined person carries no numeric suffix; later ones carry '_2', '_3', ...
crew_columns = [f'{field}_crew{slot}' for slot in ('', '_2', '_3') for field in crew_fields]
cast_columns = [f'{field}_cast{slot}' for slot in ('', '_2', '_3', '_4', '_5') for field in cast_fields]
people_columns = crew_columns + cast_columns

# .copy() gives an independent frame so the edits below do not act on a slice of overall_Data.
final_data = overall_Data[
    movie_columns + genre_columns + ['collection_popularity_coll_details'] + people_columns
].copy()

# Drop movies for which no cast or crew information was joined at all.
final_data = final_data.dropna(how='all', subset=people_columns)

# The original cell called final_data.set_index('id_main') here and discarded the
# result; that dead statement is removed so id_main stays a regular column, which the
# export cell expects.

#for all numeric fields, replacing nan with 0s
# Done in one assignment: an in-place replace on a column selection is not guaranteed
# to update the frame, and no loop variable named `str` is left behind to shadow the builtin.
final_data[people_columns] = final_data[people_columns].fillna(0)

#Dropping records for which target variable was not available
final_data = final_data[final_data.revenue_main != 0].copy()

#Replacing 0 values of budget with mean of budget
# Cast to float first so the (generally fractional) mean can be stored, then write
# through .loc; `frame[mask]['col'] = value` assigns to a temporary copy and changes nothing.
# NOTE(review): the mean is taken over the whole column, zeros included, as in the
# original expression. Consider averaging only the non-zero budgets.
final_data['budget_main'] = final_data['budget_main'].astype(float)
final_data.loc[final_data.budget_main == 0, 'budget_main'] = final_data['budget_main'].mean()

In [None]:
#Exporting final dataset for model training
# Column order of the exported file: target first, then movie attributes, genre
# indicators, collection popularity, crew features and cast features.
# 'original_language_main' is not part of the export.
export_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
                 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
                 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']
export_crew_fields = ['popularity', '0_gender', '1_gender', '2_gender', '3_gender', 'Directing_relevance']
export_cast_fields = ['popularity', '2_gender', '1_gender', '0_gender', '3_gender', 'Acting_relevance']

export_columns = (
    ['revenue_main', 'adult_main', 'budget_main', 'id_main', 'release_date_main', 'runtime_main']
    + [f'{genre}_genre_main' for genre in export_genres]
    + ['collection_popularity_coll_details']
    + [f'{field}_crew{slot}' for slot in ('', '_2', '_3') for field in export_crew_fields]
    + [f'{field}_cast{slot}' for slot in ('', '_2', '_3', '_4', '_5') for field in export_cast_fields]
)

final_data = final_data.reindex(columns=export_columns)
final_data.to_csv(curr_dir + 'final_data.csv')