In [50]:
import numpy as np
import pandas as pd
import sklearn
import datetime
import os
import json

In [95]:
# Read the two CSV inputs; the relative paths are resolved against the
# kernel's current working directory.
raw_data = pd.read_csv('training.csv')
validation_data = pd.read_csv('validation.csv')

In [96]:
# Print the column names, non-null counts and dtypes of both frames.
raw_data.info()
print("\n")  # visual separator between the two summaries
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              2100 non-null   int64  
 1   cast                  2100 non-null   object 
 2   crew                  2100 non-null   object 
 3   budget                2100 non-null   int64  
 4   genres                2100 non-null   object 
 5   homepage              955 non-null    object 
 6   keywords              2100 non-null   object 
 7   original_language     2100 non-null   object 
 8   original_title        2100 non-null   object 
 9   overview              2100 non-null   object 
 10  production_companies  2100 non-null   object 
 11  production_countries  2100 non-null   object 
 12  release_date          2100 non-null   object 
 13  revenue               2100 non-null   int64  
 14  runtime               2100 non-null   float64
 15  spoken_languages     

In [88]:
# # helper functions
def get_json_tokens(x, tag):
    """Decode a JSON array and pull one field out of every element.

    Parameters
    ----------
    x : str
        JSON text that decodes to an iterable of mappings.
    tag : str
        Key looked up in each decoded element.

    Returns
    -------
    list
        The value found under ``tag`` for each element, in input order.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is not valid JSON.
    KeyError
        If a decoded mapping has no ``tag`` entry.
    """
    return [entry[tag] for entry in json.loads(x)]

def one_hot_encoding(column):
    """Encode a Series of labels (or lists of labels) as indicator columns.

    Each distinct label becomes one integer column. A cell holding a list
    contributes to every label it contains (a label repeated inside one
    list is counted that many times); a scalar cell contributes to exactly
    one column.

    The result is aligned to ``column.index`` instead of a notebook-level
    frame, so the function no longer depends on hidden global state and
    can be applied to any Series.

    Parameters
    ----------
    column : pandas.Series
        Cells are either scalars or list-likes of hashable labels.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``column.index`` (same order) and one integer
        column per distinct label, sorted by label. Rows whose cell is an
        empty list or missing are all zeros.
    """
    # explode() emits one row per label while repeating the source index;
    # scalar cells pass through unchanged and empty lists become NaN.
    labels = column.explode()
    # dtype=int pins 0/1 integers whatever pandas' default dummy dtype is;
    # NaN labels yield an all-zero row. groupby(level=0).sum() replaces the
    # removed ``sum(level=0)`` spelling and folds the rows back per source row.
    dummies = pd.get_dummies(labels, dtype=int).groupby(level=0).sum()
    # Restore the caller's row order and guarantee every source row exists,
    # keeping an integer dtype for any row that has to be filled in.
    return dummies.reindex(column.index, fill_value=0)
    
def get_spoken_language(x):
    """Decode a JSON array of language records into prefixed language codes.

    Parameters
    ----------
    x : str
        JSON text that decodes to an iterable of mappings, each carrying an
        ``'iso_639_1'`` string entry.

    Returns
    -------
    list of str
        ``"spoken_"`` followed by each record's ``'iso_639_1'`` value, in
        input order.
    """
    prefix = "spoken_"
    return [prefix + language['iso_639_1'] for language in json.loads(x)]



In [54]:
# Start the feature frame from the identifier and budget columns, then add
# the genre names parsed out of the JSON-encoded 'genres' column.
df_dataset = raw_data[['movie_id', 'budget']].copy()
df_dataset['genres'] = raw_data['genres'].apply(get_json_tokens, tag='name')

In [57]:
# Build one column per distinct genre name from the parsed genre lists.
genres_dummies = one_hot_encoding(df_dataset['genres'])
# NOTE: this reassigns df_dataset from itself, so re-running the cell
# appends the genre columns a second time.
df_dataset = pd.concat([df_dataset, genres_dummies],axis=1, sort=False) # append the genre columns side by side

In [58]:
# Flag whether a homepage value is present: 1 when it is, 0 when it is null.
df_dataset['has_homepage'] = raw_data['homepage'].notnull().astype('int64')

In [60]:
# Build one column per distinct original-language value. The resulting
# column names are the raw values themselves (no prefix is added here).
original_language_dummies = one_hot_encoding(raw_data["original_language"])
# NOTE: re-running this cell appends the language columns again.
df_dataset = pd.concat([df_dataset, original_language_dummies],axis=1, sort=False) # append the language columns side by side

In [61]:
# Show the feature frame built so far (genre and original-language columns added).
df_dataset

Unnamed: 0,movie_id,budget,genres,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,en,es,fr,it,ja,ko,nl,ru,te,zh
0,19995,237000000,"[Action, Adventure, Fantasy, Science Fiction]",1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,285,300000000,"[Adventure, Fantasy, Action]",1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,206647,245000000,"[Action, Adventure, Crime]",1,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,49026,250000000,"[Action, Crime, Drama, Thriller]",1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
4,49529,260000000,"[Action, Adventure, Science Fiction]",1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,9594,15000000,"[Thriller, Action, Crime, Drama]",1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
2096,4638,12000000,"[Crime, Action, Comedy]",1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2097,13972,16000000,"[Comedy, Drama, Romance]",0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2098,5038,15000000,"[Drama, Romance]",0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [62]:
# Parse the JSON-encoded company records down to a list of company names per row.
df_dataset['production_companies'] = raw_data['production_companies'].apply(get_json_tokens, tag='name')

In [33]:
# The ten most frequent production-company names across all rows.
# explode() flattens the per-row lists directly, avoiding the wide,
# NaN-padded intermediate frame that apply(pd.Series).stack() builds by
# constructing one Series per row. Empty lists become NaN, which
# value_counts() drops, so the counts are unchanged.
list_top_companies = (
    df_dataset['production_companies']
    .explode()
    .value_counts()
    .head(10)
    .index
)

In [66]:
# Encode every production company, then keep only the columns for the
# names collected in list_top_companies.
top_companies_dummies = one_hot_encoding(df_dataset['production_companies'])
top_companies_dummies = top_companies_dummies[list(list_top_companies)]
# NOTE: re-running this cell appends the company columns again.
df_dataset = pd.concat([df_dataset, top_companies_dummies],axis=1, sort=False) # append the company columns side by side

In [68]:
# Parse the JSON-encoded country records down to a list of country names per row.
df_dataset['production_countries'] = raw_data['production_countries'].apply(get_json_tokens, tag='name')

0       [United States of America, United Kingdom]
1                       [United States of America]
2       [United Kingdom, United States of America]
3                       [United States of America]
4                       [United States of America]
                           ...                    
2095                    [United States of America]
2096                              [United Kingdom]
2097                    [United States of America]
2098             [United States of America, Spain]
2099                    [United States of America]
Name: production_countries, Length: 2100, dtype: object

In [69]:
# The fifteen most frequent production-country names across all rows.
# explode() flattens the per-row lists directly, avoiding the wide,
# NaN-padded intermediate frame that apply(pd.Series).stack() builds by
# constructing one Series per row. Empty lists become NaN, which
# value_counts() drops, so the counts are unchanged.
list_top_countries = (
    df_dataset['production_countries']
    .explode()
    .value_counts()
    .head(15)
    .index
)

In [78]:
# Encode every production country, then keep only the columns for the
# names collected in list_top_countries.
countries_dummies = one_hot_encoding(df_dataset['production_countries'])
countries_dummies = countries_dummies[list(list_top_countries)]
# NOTE: re-running this cell appends the country columns again.
df_dataset = pd.concat([df_dataset, countries_dummies],axis=1, sort=False) # append the country columns side by side

In [81]:
# Store the release date as a datetime column, parsed straight from the raw strings.
df_dataset['release_date'] = pd.to_datetime(raw_data['release_date'])

In [84]:
# Derive integer calendar features from 'release_date', one column per
# part, each named 'release_date_<part>'.
date_parts = ["year", "weekday", "month", 'weekofyear', 'day', 'quarter']
release_dt = df_dataset['release_date'].dt
for part in date_parts:
    part_col = 'release_date' + "_" + part  # prefix every derived column with "release_date"
    if part == 'weekofyear':
        # Series.dt.weekofyear is deprecated (and removed in pandas 2.0);
        # the ISO calendar week is the documented replacement. The output
        # column keeps its original name.
        part_values = release_dt.isocalendar().week
    else:
        part_values = getattr(release_dt, part)
    df_dataset[part_col] = part_values.astype(int)

  df_dataset[part_col] = getattr(df_dataset['release_date'].dt, part).astype(int)


In [85]:
# Show the feature frame after the release-date parts have been added.
df_dataset

Unnamed: 0,movie_id,budget,genres,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,Czech Republic,India,Ireland,release_date,release_date_year,release_date_weekday,release_date_month,release_date_weekofyear,release_date_day,release_date_quarter
0,19995,237000000,"[Action, Adventure, Fantasy, Science Fiction]",1,1,0,0,0,0,0,...,0,0,0,2009-12-10,2009,3,12,50,10,4
1,285,300000000,"[Adventure, Fantasy, Action]",1,1,0,0,0,0,0,...,0,0,0,2007-05-19,2007,5,5,20,19,2
2,206647,245000000,"[Action, Adventure, Crime]",1,1,0,0,1,0,0,...,0,0,0,2015-10-26,2015,0,10,44,26,4
3,49026,250000000,"[Action, Crime, Drama, Thriller]",1,0,0,0,1,0,1,...,0,0,0,2012-07-16,2012,0,7,29,16,3
4,49529,260000000,"[Action, Adventure, Science Fiction]",1,1,0,0,0,0,0,...,0,0,0,2012-03-07,2012,2,3,10,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,9594,15000000,"[Thriller, Action, Crime, Drama]",1,0,0,0,1,0,1,...,0,0,0,1991-08-10,1991,5,8,32,10,3
2096,4638,12000000,"[Crime, Action, Comedy]",1,0,0,1,1,0,0,...,0,0,0,2007-02-14,2007,2,2,7,14,1
2097,13972,16000000,"[Comedy, Drama, Romance]",0,0,0,1,0,0,1,...,0,0,0,2008-09-12,2008,4,9,37,12,3
2098,5038,15000000,"[Drama, Romance]",0,0,0,0,0,0,1,...,0,0,0,2008-08-15,2008,4,8,33,15,3


In [86]:
# Copy the runtime column over unchanged (assignment aligns on the row index).
df_dataset['runtime'] = raw_data['runtime']

In [89]:
# Parse the JSON-encoded language records into "spoken_<code>" labels per row.
df_dataset['spoken_languages'] = raw_data['spoken_languages'].apply(get_spoken_language)

In [91]:
# The ten most frequent "spoken_<code>" labels across all rows.
# explode() flattens the per-row lists directly, avoiding the wide,
# NaN-padded intermediate frame that apply(pd.Series).stack() builds by
# constructing one Series per row. Empty lists become NaN, which
# value_counts() drops, so the counts are unchanged.
list_top_spoken_language = (
    df_dataset['spoken_languages']
    .explode()
    .value_counts()
    .head(10)
    .index
)

In [92]:
# Encode every spoken-language label, then keep only the columns for the
# labels collected in list_top_spoken_language.
spoken_dummies = one_hot_encoding(df_dataset['spoken_languages'])
spoken_dummies = spoken_dummies[list(list_top_spoken_language)]
# NOTE: re-running this cell appends the spoken-language columns again.
df_dataset = pd.concat([df_dataset, spoken_dummies],axis=1, sort=False) # append the spoken-language columns side by side

In [93]:
# Show the feature frame after the spoken-language columns have been added.
df_dataset

Unnamed: 0,movie_id,budget,genres,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,spoken_en,spoken_fr,spoken_es,spoken_de,spoken_ru,spoken_it,spoken_zh,spoken_ja,spoken_ar,spoken_pt
0,19995,237000000,"[Action, Adventure, Fantasy, Science Fiction]",1,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,285,300000000,"[Adventure, Fantasy, Action]",1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,206647,245000000,"[Action, Adventure, Crime]",1,1,0,0,1,0,0,...,1,1,1,1,0,1,0,0,0,0
3,49026,250000000,"[Action, Crime, Drama, Thriller]",1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
4,49529,260000000,"[Action, Adventure, Science Fiction]",1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,9594,15000000,"[Thriller, Action, Crime, Drama]",1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
2096,4638,12000000,"[Crime, Action, Comedy]",1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2097,13972,16000000,"[Comedy, Drama, Romance]",0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2098,5038,15000000,"[Drama, Romance]",0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0


spoken_en    0
spoken_fr    0
spoken_es    0
spoken_de    0
spoken_ru    0
spoken_it    0
spoken_zh    0
spoken_ja    0
spoken_ar    0
spoken_pt    0
Name: 1086, dtype: object