# Genre 轉為 Matrix

In [1]:
# Basic import
import os
import sys
import json
import shutil
import pandas as pd
import numpy as np

In [2]:
# Helper function
def writeProgress(msg, count, total):
    sys.stdout.write(msg + "{:.2%}\r".format(count/total))
    sys.stdout.flush()
    
def newPath(path):
    if not os.path.isdir(path):
        os.mkdir(path)

def read_json(src_path):
    with open(src_path, 'r') as json_file:
        data = json.load(json_file)
    return data

def write_json(data,dst_path):
    with open(dst_path, 'w') as outfile:
        json.dump(data, outfile)

In [3]:
# 讀取固定的類別順序
genres = read_json('../orderedListGenres.json')
print(len(genres), genres)

20 ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [4]:
# 讀取電影所屬的分類
genresMat = read_json('../genresMat.json')
len(genresMat)

165

# Extract Poster

## Copy Poster to Poster/

In [None]:
PATH = '../Profiles_165/'
POSTER = './Poster/'
newPath(POSTER)

In [None]:
li = []
for folder in os.listdir(PATH):
    print(folder)
    src = PATH + folder + '/poster.jpeg'
    name = folder + '.jpg'
    dst = POSTER + name
    shutil.copy(src, dst)
    li.append([src] + [name] + genresMat[folder])

## Filename to genre matrix dataframe

In [None]:
df = pd.DataFrame(li, columns = ['from', 'filename'] + genres).sort_values(by='filename')
df

In [None]:
df.to_csv('./poster2genreMat.csv', index = 0)

# Extract all img and cover in posts

## Copy from Post/ to IGimg/

In [5]:
PATH = '../Posts_165/'
IG_IMG = './IGimg/'
newPath(IG_IMG)

In [6]:
li = []
for folder in os.listdir(PATH):
    print(folder)
    postList = os.listdir(PATH + folder) # all posts for each account
    length = len(postList)
    i = 0
    for postID in postList:
        postPath = PATH + folder + '/' + postID + '/'
        post = read_json(postPath + 'post.json')
        writeProgress('Progress:', i, length)
        
        if post['media_type'] in [1, 8]:
            for file in os.listdir(postPath):
                if 'img' in file:
                    src = postPath + file
                    dst = IG_IMG + '/'
                    count = len(os.listdir(dst))
                    name = folder + '_' + str(count) + '.jpg'
                    shutil.copy(src, dst + name)
                    li.append([src] + [name] + genresMat[folder])
                    
        elif post['media_type'] == 2:
            for file in os.listdir(postPath):
                if 'img' in file:
                    src = postPath + file
                    dst = IG_IMG + '/'
                    count = len(os.listdir(dst))
                    name = folder + '_' + str(count) + '.jpg'
                    shutil.copy(src, dst + name)
                    li.append([src] + [name] + genresMat[folder])
                    
        else:
            print('Unknown media_type', post['media_type'])
            
        i += 1

artofracingmovie
hotelmumbaifilm
lighthousemovie
littlewoodsfilm
fightingwmyfam%
playingwithfire
rocketmanmovie%
catsmovie99.00%
xmenmovies7.62%
scarystoriesmovie
toystory:98.67%
brittanyrunsmov
pomsmovie98.48%
brightburnmovie
shaftmovie8.97%
theprodigymovie
petsematarymovie
missinglinkfilm
midwaymovie.24%
adastramovie53%
beachbummovie9%
peanutbutterfalcon
arcticmovie.45%
disneymaleficent
wrinklestheclown
wonderparkmovie
onthebasisofsex
angelhasfallen%
everybodyknowsmovie
thehighwaymen8%
playmobilthemovie
alitamovie8.04%
harrietfilm.72%
blackchristmas%
overcomermovie%
theupsidefilm8%
littlethemovie%
disneyaladdin5%
lastblackmansf%
lordsofchaosmovie
countdown97.96%
thehummingbirdprojectfilm
booksmart97.22%
blindedbythelightmovie
thenightingalefilm
escaperoom5.00%
aftermathmovie%
hustlemovie.04%
abominablemovie
serenityfilm24%
yesterdaymovie%
zombieland9.07%
queenandslim32%
lalloronamovie%
dontletgomovie%
motherlessbklyn
lionking:98.75%
jojorabbitmovie
hellboymovie25%
longshotmovie0%
the

## Filename to genre matrix dataframe

In [7]:
df = pd.DataFrame(li, columns = ['from', 'filename'] + genres).sort_values(by='filename')
df

Unnamed: 0,from,filename,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,History,Horror,Music,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
22217,../Posts_165/21bridgesmovie/217501830640240439...,21bridgesmovie_22217.jpg,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
22218,../Posts_165/21bridgesmovie/217791736649498418...,21bridgesmovie_22218.jpg,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
22219,../Posts_165/21bridgesmovie/216903847292888510...,21bridgesmovie_22219.jpg,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
22220,../Posts_165/21bridgesmovie/219893585468032379...,21bridgesmovie_22220.jpg,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
22221,../Posts_165/21bridgesmovie/218219116847723486...,21bridgesmovie_22221.jpg,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7839,../Posts_165/zombieland/2137359683396335880_91...,zombieland_7839.jpg,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7840,../Posts_165/zombieland/2137359683396335880_91...,zombieland_7840.jpg,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7841,../Posts_165/zombieland/2137359683396335880_91...,zombieland_7841.jpg,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7842,../Posts_165/zombieland/2218432031066716113_91...,zombieland_7842.jpg,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [8]:
df.to_csv('./filename2genreMat.csv', index = 0)

## Count the number for each genre

In [None]:
df = pd.read_csv('./filename2genreMat.csv')
df

In [None]:
genreCount = pd.DataFrame(df.sum(axis = 0), columns = ['count'])
genreCount = genreCount.drop('filename')
genreCount = genreCount.drop('from')

In [None]:
genreCount = genreCount.sort_values(by = 'count', ascending = True)
genreCount

# Select train and test part

In [None]:
# init 
total = {}
for g in genreCount.index:
    total[g] = 0
print(total)
df = pd.read_csv('./filename2genreMat.csv')
print(df.shape)

In [None]:
from IPython.display import display
trainingset = pd.DataFrame()
testingset = pd.DataFrame()
for g, row in genreCount.iterrows():
    print('current DF:', df.shape)
    print(g)
    
    remain = int(500 - total[g])
    
    newdf = df.loc[df[g] == 1]
    df = df.drop(newdf.index)
    print(newdf.shape)
#     display(newdf)
    
    if remain > 0:
        try:
            traindf = newdf.sample(n = remain)
        except ValueError:
            traindf = newdf
    else:
        continue
    
#     display(traindf)

    print(traindf.shape)
    trainingset = trainingset.append(traindf)
    testingset = testingset.append(newdf.drop(traindf.index))
    
    countdf = pd.DataFrame(traindf.sum(axis = 0), columns = ['count']).drop('filename')
#     display(countdf)
    
    for genre, row in countdf.iterrows():
        total[genre] += row['count']
        
    print(total)
    
    print('==================================================')

In [None]:
total

In [None]:
import matplotlib.pyplot as plt

names = list(total.keys())
values = list(total.values())

plt.figure(figsize=(20, 5))
plt.ylim(0, 2000) 
plt.bar(names, values, align='center', width=0.5)

for a,b in zip(names, values):  
    plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=10)
    
plt.show()

In [None]:
trainingset.to_csv('./Sample/input_df/trainMatrix.csv', index = 0)
trainingset

In [None]:
testingset.to_csv('./Sample/input_df/testMatrix.csv', index = 0)
testingset