# Genre 轉為 Matrix

In [2]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np

In [3]:
# Helper function
def writeProgress(msg, count, total):
    """Write a progress line of the form ``<msg><percent>`` to stdout.

    The line is terminated with a carriage return instead of a newline and
    stdout is flushed immediately.

    Args:
        msg: Text prefix written before the percentage.
        count: Number of items processed so far.
        total: Total number of items. May be 0, in which case 0.00% is shown.
    """
    # Guard the empty-workload case: count/total would raise ZeroDivisionError.
    ratio = count / total if total else 0.0
    sys.stdout.write(msg + "{:.2%}\r".format(ratio))
    sys.stdout.flush()
    
def newPath(path):
    """Ensure that the directory ``path`` exists, creating it when missing.

    Missing parent directories are created as well; an already existing
    directory is left untouched.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the racy isdir()-then-mkdir() sequence.
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, ...).
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON into the file at ``dst_path``.

    An existing file at ``dst_path`` is overwritten.

    Args:
        data: JSON-serializable object to store.
        dst_path: Path of the file to write.
    """
    # Encode as UTF-8 explicitly so the file does not depend on the
    # platform's default locale encoding.
    with open(dst_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile)

## 97部電影

In [3]:
# Load the mapping from each movie account to the list of genres it belongs to.
genresDic = read_json('../genresDic.json')
len(genresDic)

166

In [8]:
# Load the ordered list of genre labels (this cell reads the file; it does not write it).
# The position of a label in this list is the position of its flag in each genre vector.
genres = read_json('../orderdListGenres.json')
print(len(genres), genres)

20 ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [6]:
# Turn every account's genre list into a 0/1 vector aligned with `genres`.
genresMat = {}
for acc, gs in genresDic.items():
    print(acc, gs)
    # One flag per known genre label: 1 if the account has it, 0 otherwise.
    genresMat[acc] = [int(g in gs) for g in genres]

21bridgesmovie ['Action', 'Crime', 'Drama']
47metersdown ['Adventure', 'Drama', 'Horror']
abeautifuldaymovie ['Biography', 'Drama']
abominablemovie ['Adventure', 'Animation', 'Comedy']
adastramovie ['Adventure', 'Drama', 'Mystery']
adogsjourneymovie ['Adventure', 'Comedy', 'Drama']
aftermathmovie ['Drama', 'Romance', 'War']
aftermovie ['Drama', 'Romance']
ahiddenlifefilm ['Biography', 'Drama', 'War']
alitamovie ['Action', 'Adventure', 'Sci-Fi']
amazinggracemov ['Documentary', 'Music']
angelhasfallen ['Action', 'Thriller']
angrybirdsmovie ['Adventure', 'Animation', 'Comedy']
annabellemovie ['Horror', 'Mystery', 'Thriller']
annamovie ['Action', 'Thriller']
apollo11movie ['Documentary', 'History']
arcticdogsmovie ['Adventure', 'Animation', 'Comedy']
arcticmovie ['Adventure', 'Drama']
artofracingmovie ['Comedy', 'Drama', 'Romance']
avengers ['Action', 'Adventure', 'Sci-Fi']
beachbummovie ['Comedy']
bernadettefilm ['Comedy', 'Drama', 'Mystery']
blackandbluemovie ['Action', 'Crime', 'Drama']

In [8]:
# Persist the account -> 0/1 genre vector mapping as JSON, then display it.
write_json(genresMat, '../genresMat.json')
genresMat

## IMDb 2018電影

In [9]:
# Load the genre lists of the IMDb titles.
# NOTE(review): this rebinds `genresDic`, replacing the mapping loaded from '../genresDic.json'.
genresDic = read_json('./input/trainGenreDic.json')
len(genresDic)

1465

In [11]:
# Turn every IMDb title's genre list into a 0/1 vector aligned with `genres`.
# NOTE(review): a label that is not in `genres` gets no column and is silently
# ignored; the recorded output contains e.g. 'Musical' -- confirm this is intended.
genresMat_imdb = {}
for acc, gs in genresDic.items():
    print(acc, gs)
    # One flag per known genre label: 1 if the title has it, 0 otherwise.
    genresMat_imdb[acc] = [int(g in gs) for g in genres]

tt7819668 ['Drama', 'Music', 'Romance']
tt7654144 ['Crime', 'Thriller']
tt7941422 ['Thriller']
tt9490414 ['Action', 'Adventure', 'Fantasy']
tt8798194 ['Music']
tt8486162 ['Comedy', 'Drama', 'Sci-Fi']
tt6245274 ['Sci-Fi']
tt9153044 ['Comedy', 'Drama']
tt9662290 ['Documentary']
tt9013842 ['Animation']
tt4177856 ['Animation', 'Family']
tt7341810 ['Drama', 'History']
tt8061078 ['Action', 'Crime', 'Drama']
tt8358682 ['Action', 'War']
tt6155374 ['Action', 'Adventure', 'Drama']
tt9408030 ['Horror']
tt8887958 ['Comedy']
tt6993200 ['Adventure', 'Comedy', 'Drama']
tt8595480 ['Drama', 'Musical', 'War']
tt8999494 ['Sci-Fi']
tt6714558 ['Horror']
tt8108214 ['Action', 'Comedy', 'Drama']
tt9140104 ['Music']
tt9085536 ['Comedy', 'Drama', 'Horror']
tt6931170 ['Drama', 'History', 'Mystery']
tt4530422 ['Action', 'Adventure', 'Horror']
tt8595434 ['Adventure', 'Animation', 'Comedy']
tt9569584 ['Drama', 'Mystery', 'Thriller']
tt3906724 ['Adventure', 'Drama', 'Family']
tt8141238 ['Drama', 'Family', 'Romance']

In [12]:
# Persist the IMDb title -> 0/1 genre vector mapping as JSON.
write_json(genresMat_imdb, './input/imdbGenresMat.json')
# genresMat

## 合併兩個dict

In [5]:
# Reload both previously saved genre matrices from their JSON files.
genresMat = read_json('../genresMat.json')
genresMat_imdb = read_json('./input/imdbGenresMat.json')

In [8]:
# Combine both mappings into a new dict; on a duplicate key the IMDb entry wins.
merge = {**genresMat, **genresMat_imdb}
len(merge)

1631

In [9]:
# Save the merged genre matrix as JSON.
write_json(merge, './input/mergeGenresMat.json')

## 製作 test 的 ground truth matrix

In [4]:
# Read the test split and collect the values of its `username` column, in file order.
test_data = pd.read_csv('./stopword/test_imdb.csv')
accs = test_data.username.tolist()

In [6]:
# One row per test account: the account name followed by its genre vector.
# A KeyError here means a test account has no entry in `genresMat`.
li = [[acc] + genresMat[acc] for acc in accs]

In [9]:
# Assemble the ground-truth table: 'username' plus one column per genre label.
df = pd.DataFrame(li, columns=['username', *genres])
df

Unnamed: 0,username,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,History,Horror,Music,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,21bridgesmovie,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,47metersdown,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,abeautifuldaymovie,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,abominablemovie,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,adastramovie,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,wrinklestheclown,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
162,xmenmovies,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
163,yardiefilm,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
164,yesterdaymovie,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Export the ground-truth table to CSV without writing the DataFrame index.
df.to_csv('./input/true_df.csv', index = False)