## Imports ##

In [1]:
import ast

import numpy as np
import pandas as pd

## Train Data ##

In [2]:
# Read the training labels, then discard the "Unnamed: 0" column
# (presumably a row index written by an earlier export — confirm).
train_csv = pd.read_csv("finegrained_poster_train_data.csv")
train_csv = train_csv.drop(columns="Unnamed: 0")
train_csv

Unnamed: 0,movie,genre
0,100earthquake,['disaster']
1,2012,"['disaster', 'scifi']"
2,advcom-rio,"['adventure', 'comedy']"
3,adventuresinzambezia,['adventure']
4,alitabattleangel,"['action', 'scifi']"
...,...,...
83,thespies,"['comedy', 'spy']"
84,truememoirsofaninternationalassassin,"['action', 'comedy']"
85,underwaterbloodwars,"['action', 'horror']"
86,vampireacademy,"['comedy', 'horror']"


In [3]:
# Genres are stored in the CSV as the text of a Python list
# (e.g. "['action', 'scifi']"). Going through str() first keeps this cell
# re-runnable once the column already holds real lists.
train_csv['genre'] = train_csv['genre'].apply(lambda x: ast.literal_eval(str(x)))

# Genre vocabulary in order of first appearance. explode() flattens the lists
# directly instead of building one pd.Series per row; dropna() discards the
# NaN placeholder that explode() emits for an empty list, so the result matches
# what apply(pd.Series).stack() produced.
all_genres = train_csv['genre'].explode().dropna().unique()
all_genres

array(['disaster', 'scifi', 'adventure', 'comedy', 'action', 'horror',
       'romance', 'spy', 'martialarts'], dtype=object)

In [4]:
# Swap each genre list for a multi-hot vector: one slot per entry of
# all_genres (in that order), 1 when the movie has the genre and 0 otherwise.
train_csv['genre'] = train_csv['genre'].map(
    lambda labels: [int(genre in labels) for genre in all_genres]
)
train_csv

Unnamed: 0,movie,genre
0,100earthquake,"[1, 0, 0, 0, 0, 0, 0, 0, 0]"
1,2012,"[1, 1, 0, 0, 0, 0, 0, 0, 0]"
2,advcom-rio,"[0, 0, 1, 1, 0, 0, 0, 0, 0]"
3,adventuresinzambezia,"[0, 0, 1, 0, 0, 0, 0, 0, 0]"
4,alitabattleangel,"[0, 1, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
83,thespies,"[0, 0, 0, 1, 0, 0, 0, 1, 0]"
84,truememoirsofaninternationalassassin,"[0, 0, 0, 1, 1, 0, 0, 0, 0]"
85,underwaterbloodwars,"[0, 0, 0, 0, 1, 1, 0, 0, 0]"
86,vampireacademy,"[0, 0, 0, 1, 0, 1, 0, 0, 0]"


In [5]:
# Spread the multi-hot vectors into one 0/1 column per genre, with the
# columns named after the entries of all_genres.
multi_hot_train = pd.DataFrame(train_csv['genre'].tolist(), index=train_csv.index)
train_csv[all_genres] = multi_hot_train

# Turn the 'genre' column back into genre names. The names now follow the
# all_genres order rather than the order they had in the source CSV.
train_csv['genre'] = train_csv['genre'].map(
    lambda flags: [name for name, flag in zip(all_genres, flags) if flag]
)
train_csv

Unnamed: 0,movie,genre,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts
0,100earthquake,[disaster],1,0,0,0,0,0,0,0,0
1,2012,"[disaster, scifi]",1,1,0,0,0,0,0,0,0
2,advcom-rio,"[adventure, comedy]",0,0,1,1,0,0,0,0,0
3,adventuresinzambezia,[adventure],0,0,1,0,0,0,0,0,0
4,alitabattleangel,"[scifi, action]",0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
83,thespies,"[comedy, spy]",0,0,0,1,0,0,0,1,0
84,truememoirsofaninternationalassassin,"[comedy, action]",0,0,0,1,1,0,0,0,0
85,underwaterbloodwars,"[action, horror]",0,0,0,0,1,1,0,0,0
86,vampireacademy,"[comedy, horror]",0,0,0,1,0,1,0,0,0


## Test Data ##

In [6]:
# Read the test labels, then discard the "Unnamed: 0" column
# (presumably a row index written by an earlier export — confirm).
test_csv = pd.read_csv("finegrained_poster_test_data.csv")
test_csv = test_csv.drop(columns="Unnamed: 0")
test_csv

Unnamed: 0,movie,genre
0,jumanjiwelcometothejungle,"['adventure', 'comedy']"
1,paul,"['comedy', 'scifi']"
2,shaolinsoccer,"['comedy', 'martialarts']"
3,spectre,"['action', 'spy']"
4,sputnik,"['horror', 'scifi']"
5,thecore,"['disaster', 'scifi']"
6,thediscovery,"['romance', 'scifi']"
7,theedgeofseventeen,['comedy']
8,thefastandthefurioustokyodrift,['action']
9,thefinalmaster,"['action', 'martialarts']"


In [7]:
# Parse the stringified genre lists first, exactly as is done for the training
# data. Without this step `g in x` below would be a substring test against the
# raw CSV text (e.g. "['comedy', 'scifi']"), which wrongly matches whenever one
# genre name is contained in another.
test_csv['genre'] = test_csv['genre'].apply(lambda x: ast.literal_eval(str(x)))

# Multi-hot encode against the vocabulary learned from the training set so the
# vector layout is identical for both splits. A test genre that is absent from
# all_genres has no slot and is therefore left out of the encoding.
test_csv['genre'] = test_csv['genre'].apply(lambda x: [1 if g in x else 0 for g in all_genres])
test_csv

Unnamed: 0,movie,genre
0,jumanjiwelcometothejungle,"[0, 0, 1, 1, 0, 0, 0, 0, 0]"
1,paul,"[0, 1, 0, 1, 0, 0, 0, 0, 0]"
2,shaolinsoccer,"[0, 0, 0, 1, 0, 0, 0, 0, 1]"
3,spectre,"[0, 0, 0, 0, 1, 0, 0, 1, 0]"
4,sputnik,"[0, 1, 0, 0, 0, 1, 0, 0, 0]"
5,thecore,"[1, 1, 0, 0, 0, 0, 0, 0, 0]"
6,thediscovery,"[0, 1, 0, 0, 0, 0, 1, 0, 0]"
7,theedgeofseventeen,"[0, 0, 0, 1, 0, 0, 0, 0, 0]"
8,thefastandthefurioustokyodrift,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"
9,thefinalmaster,"[0, 0, 0, 0, 1, 0, 0, 0, 1]"


In [8]:
# Spread the multi-hot vectors into one 0/1 column per genre, with the
# columns named after the entries of all_genres.
multi_hot_test = pd.DataFrame(test_csv['genre'].tolist(), index=test_csv.index)
test_csv[all_genres] = multi_hot_test

# Turn the 'genre' column back into genre names. The names now follow the
# all_genres order rather than the order they had in the source CSV.
test_csv['genre'] = test_csv['genre'].map(
    lambda flags: [name for name, flag in zip(all_genres, flags) if flag]
)
test_csv

Unnamed: 0,movie,genre,disaster,scifi,adventure,comedy,action,horror,romance,spy,martialarts
0,jumanjiwelcometothejungle,"[adventure, comedy]",0,0,1,1,0,0,0,0,0
1,paul,"[scifi, comedy]",0,1,0,1,0,0,0,0,0
2,shaolinsoccer,"[comedy, martialarts]",0,0,0,1,0,0,0,0,1
3,spectre,"[action, spy]",0,0,0,0,1,0,0,1,0
4,sputnik,"[scifi, horror]",0,1,0,0,0,1,0,0,0
5,thecore,"[disaster, scifi]",1,1,0,0,0,0,0,0,0
6,thediscovery,"[scifi, romance]",0,1,0,0,0,0,1,0,0
7,theedgeofseventeen,[comedy],0,0,0,1,0,0,0,0,0
8,thefastandthefurioustokyodrift,[action],0,0,0,0,1,0,0,0,0
9,thefinalmaster,"[action, martialarts]",0,0,0,0,1,0,0,0,1


## Saving ##

In [9]:
# Write both encoded splits to disk, train first and then test. to_csv is
# called with its defaults, so the DataFrame index is written as the first,
# unnamed column of each file.
for frame, out_path in (
    (train_csv, "finegrained_poster_train_data_multihotencoded.csv"),
    (test_csv, "finegrained_poster_test_data_multihotencoded.csv"),
):
    frame.to_csv(out_path)