In [1]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Print an in-place progress indicator to stdout.

    Writes ``msg`` followed by ``count / total`` formatted as a percentage
    with two decimals, then a carriage return so that the next call
    overwrites the same console line.

    Args:
        msg: Text prefix shown before the percentage.
        count: Number of items processed so far.
        total: Total number of items. When it is 0 the ratio is reported
            as 0.00% instead of raising ZeroDivisionError.
    """
    # Guard the division: an empty workload must not crash the progress display.
    ratio = count / total if total else 0.0
    sys.stdout.write(msg + "{:.2%}\r".format(ratio))
    sys.stdout.flush()
    
def newPath(path):
    """Ensure that the directory ``path`` exists.

    Missing parent directories are created as well, and calling this on an
    already existing directory is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also handles nested paths.
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The deserialized Python object (dict, list, ...).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON into ``dst_path``, replacing any existing file."""
    with open(dst_path, mode='w') as sink:
        json.dump(data, sink)

def writeLog(row):
    """Append ``row`` plus a trailing newline to ``log.txt`` in the working directory."""
    with open('log.txt', mode='a') as log_file:
        line = row + '\n'
        log_file.write(line)

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    Args:
        e: The exception instance to describe.

    Returns:
        A string with the exception class name in brackets followed by the
        exception's first argument. The detail part is empty when the
        exception carries no arguments.
    """
    error_class = e.__class__.__name__  # exception type name
    # Exceptions created without arguments (e.g. ``ValueError()``) have empty
    # ``args``; indexing them would raise IndexError while reporting an error.
    detail = e.args[0] if e.args else ""  # detail message
    return "[{}] {}".format(error_class, detail)

# genresDic.json -> one-hot encoding

In [3]:
# Load the per-movie genre mapping and show how many entries it holds.
genresDic = read_json(src_path='genresDic.json')
len(genresDic)

166

In [4]:
# Load the ordered list of genre names; it is used below as the one-hot column order.
columns = read_json(src_path='./orderdListGenres.json')
print(len(columns), columns)

20 ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [5]:
genresMat = {}
for acc, value in genresDic.items():
    gs = genresDic[acc]
    print(acc, gs)
    glist = []
    for g in columns:
        if g in gs:
            glist.append(1)
        else:
            glist.append(0)
            
    genresMat[acc] = glist

21bridgesmovie ['Action', 'Crime', 'Drama']
47metersdown ['Adventure', 'Drama', 'Horror']
abeautifuldaymovie ['Biography', 'Drama']
abominablemovie ['Adventure', 'Animation', 'Comedy']
adastramovie ['Adventure', 'Drama', 'Mystery']
adogsjourneymovie ['Adventure', 'Comedy', 'Drama']
aftermathmovie ['Drama', 'Romance', 'War']
aftermovie ['Drama', 'Romance']
ahiddenlifefilm ['Biography', 'Drama', 'War']
alitamovie ['Action', 'Adventure', 'Sci-Fi']
amazinggracemov ['Documentary', 'Music']
angelhasfallen ['Action', 'Thriller']
angrybirdsmovie ['Adventure', 'Animation', 'Comedy']
annabellemovie ['Horror', 'Mystery', 'Thriller']
annamovie ['Action', 'Thriller']
apollo11movie ['Documentary', 'History']
arcticdogsmovie ['Adventure', 'Animation', 'Comedy']
arcticmovie ['Adventure', 'Drama']
artofracingmovie ['Comedy', 'Drama', 'Romance']
avengers ['Action', 'Adventure', 'Sci-Fi']
beachbummovie ['Comedy']
bernadettefilm ['Comedy', 'Drama', 'Mystery']
blackandbluemovie ['Action', 'Crime', 'Drama']

In [12]:
# Sanity check against stale kernel state: every movie must carry exactly one
# flag per genre column before the matrix is written out.
assert all(len(flags) == len(columns) for flags in genresMat.values())
len(genresMat)

166

In [7]:
# Persist the one-hot genre vectors, keyed by movie, as JSON.
write_json(data=genresMat, dst_path='./genresMat.json')

In [8]:
# Flatten each (movie, flags) pair into one row: [movie, flag_1, ..., flag_n].
li = [[movie] + flags for movie, flags in genresMat.items()]

In [10]:
# Assemble the one-hot table (movie name + one column per genre), ordered by movie name.
df = (
    pd.DataFrame(li, columns=['movie'] + columns)
    .sort_values(by='movie')
)
df

Unnamed: 0,movie,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,History,Horror,Music,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,21bridgesmovie,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,47metersdown,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,abeautifuldaymovie,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,abominablemovie,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,adastramovie,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,wrinklestheclown,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
162,xmenmovies,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
163,yardiefilm,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
164,yesterdaymovie,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# Save the one-hot encoded genre table as CSV (row index omitted).
df.to_csv('./genresMat.csv', index=False)