In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 

In [9]:
df = pd.read_csv('../Data/movie_details.csv')
df.head()

Unnamed: 0,index,title,release_date,year_num,holiday_season,studio,australian_film,rated,adaptation,franchise,budget_mil,genre_list
0,1,Incredibles 2,6/14/18,2018_24,0,Disney,0,PG,0,1,200,"Animation, Action, Adventure, Comedy, Family, ..."
1,2,Beauty and the Beast,3/23/17,2017_12,1,Disney,0,PG,1,0,160,"Family, Fantasy, Musical, Romance"
2,3,The Greatest Showman,12/26/17,2017_52,1,Fox,0,PG,0,0,84,"Biography, Drama, Musical"
3,4,Black Panther,2/15/18,2018_07,0,Disney,0,PG-13,1,0,202,"Action, Adventure, Sci-Fi"
4,5,Avengers Infinity War,4/25/18,2018_17,1,Disney,0,PG-13,0,1,316,"Action, Adventure, Sci-Fi"


### Holidays in Australia (all states only)

- New Year Day: 1-Jan
- Australia Day: 6-Jan
- Labor Day: 9-Mar
- Good Friday: 10-Apr
- Easter Monday: 13-Apr
- Anzac Day: 25-Apr
- Queens Birthday: 8-Jun
- Christmas Day: 25-Dec
- Boxing Day: 26-Dec

Any movies released 21 days before any of the public holiay labelled as 1 in holiday_season

In [3]:
studio_dummies = pd.get_dummies(df.studio)
df_clean = pd.concat([df, studio_dummies], axis=1)

df_clean.head()

Unnamed: 0,index,title,release_date,year_num,holiday_season,studio,australian_film,rated,adaptation,franchise,...,Bunya,Disney,Fox,PPI,Pinacle,Sony,Trans,UPI,Umbrella,WB
0,1,Incredibles 2,6/14/18,2018_24,0,Disney,0,PG,0,1,...,0,1,0,0,0,0,0,0,0,0
1,2,Beauty and the Beast,3/23/17,2017_12,1,Disney,0,PG,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,The Greatest Showman,12/26/17,2017_52,1,Fox,0,PG,0,0,...,0,0,1,0,0,0,0,0,0,0
3,4,Black Panther,2/15/18,2018_07,0,Disney,0,PG-13,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,Avengers Infinity War,4/25/18,2018_17,1,Disney,0,PG-13,0,1,...,0,1,0,0,0,0,0,0,0,0


In [4]:
rated_dummies = pd.get_dummies(df_clean.rated)
df_clean = pd.concat([df_clean, rated_dummies], axis=1)
df_clean.head()

Unnamed: 0,index,title,release_date,year_num,holiday_season,studio,australian_film,rated,adaptation,franchise,...,PPI,Pinacle,Sony,Trans,UPI,Umbrella,WB,PG,PG-13,R
0,1,Incredibles 2,6/14/18,2018_24,0,Disney,0,PG,0,1,...,0,0,0,0,0,0,0,1,0,0
1,2,Beauty and the Beast,3/23/17,2017_12,1,Disney,0,PG,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,The Greatest Showman,12/26/17,2017_52,1,Fox,0,PG,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Black Panther,2/15/18,2018_07,0,Disney,0,PG-13,1,0,...,0,0,0,0,0,0,0,0,1,0
4,5,Avengers Infinity War,4/25/18,2018_17,1,Disney,0,PG-13,0,1,...,0,0,0,0,0,0,0,0,1,0


In [5]:
def encode_list(df, col):
    """
    PARAM: 
    |df: dataframe to modify
    |col: title of column that contains a string of categorical variables. Eg a movie belonging to multiple genres. 
    
    OUTPUT: updated df
    """
    #Get list of unique genres
    lst = []
    [lst.append(list(x.strip() for x in my_string.split(','))) for my_string in df[col].unique()]
    flat_lst = [item for sublist in lst for item in sublist]
    flat_lst = list(set(flat_lst)) 
    
    #add empty columns first
    df = pd.concat([df,pd.DataFrame(columns=flat_lst)], sort=False)
    df.fillna(0, inplace=True)
    
    #update columns 
    def update(row):
        genre_lst = row['genre_list']
        for x in genre_lst.split(','):
            genre = x.strip()
            row[genre] = 1 
        return row

    df = df.apply(lambda row: update(row) ,axis=1) 
    
    return df
    

df = encode_list(df_clean, 'genre_list')

In [6]:
df.to_csv('../Data/new_movie_details.csv')

In [7]:
df.head()

Unnamed: 0,index,title,release_date,year_num,holiday_season,studio,australian_film,rated,adaptation,franchise,...,Western,Adventure,Crime,History,Mystery,Music,Family,Musical,Animation,Thriller
0,1.0,Incredibles 2,6/14/18,2018_24,0.0,Disney,0.0,PG,0.0,1.0,...,0,1,0,0,0,0,1,0,1,0
1,2.0,Beauty and the Beast,3/23/17,2017_12,1.0,Disney,0.0,PG,1.0,0.0,...,0,0,0,0,0,0,1,1,0,0
2,3.0,The Greatest Showman,12/26/17,2017_52,1.0,Fox,0.0,PG,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,4.0,Black Panther,2/15/18,2018_07,0.0,Disney,0.0,PG-13,1.0,0.0,...,0,1,0,0,0,0,0,0,0,0
4,5.0,Avengers Infinity War,4/25/18,2018_17,1.0,Disney,0.0,PG-13,0.0,1.0,...,0,1,0,0,0,0,0,0,0,0
