In [4]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
#import utils.preprocess

from sklearn.model_selection import train_test_split

In [5]:
import dgl
import torch

Using backend: pytorch


In [6]:
# Number of node types in the graph (the cells below use movie, director, actor).
num_ntypes = 3
# NOTE(review): presumably the output prefix for preprocessed IMDB data; it is
# not referenced by the visible cells — confirm whether it is still needed.
save_prefix = '../../data/preprocessed/IMDB_processed/'

In [23]:
# Read the raw IMDB metadata, then discard every movie that lacks either a
# first-billed actor or a director. The index is rebuilt so that row labels
# are consecutive integers starting at 0.
movies = pd.read_csv('../../data/raw/movie_metadata.csv', encoding='utf-8')
movies = movies.dropna(axis=0, subset=['actor_1_name', 'director_name'])
movies = movies.reset_index(drop=True)

In [24]:
# extract labels, and delete movies with unwanted genres
# 0 for action, 1 for comedy, 2 for drama, -1 for others
# The genres field is a '|'-separated list; the first entry that is one of the
# three target genres decides the label. Movies with none of them (or with a
# missing genres value) keep -1 and are removed from both `movies` and `labels`.
genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}

labels = np.full(len(movies), -1, dtype=int)
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# Indexing labels by movie_idx relies on `movies` having a 0..n-1 index.
for movie_idx, genres in movies['genres'].items():
    if not isinstance(genres, str):
        # missing genres (NaN) cannot be split; treat the movie as "others"
        continue
    for genre in genres.split('|'):
        if genre in genre_to_label:
            labels[movie_idx] = genre_to_label[genre]
            break

unwanted_idx = np.where(labels == -1)[0]
# drop the unlabeled movies and re-index so row labels stay aligned with `labels`
movies = movies.drop(unwanted_idx).reset_index(drop=True)
labels = np.delete(labels, unwanted_idx, 0)

In [25]:
# Collect the distinct director names and the distinct actor names (taken from
# all three actor columns), each as an alphabetically sorted list. A name's
# position in its list is used as its node id in the cells below.
directors = sorted(set(movies['director_name'].dropna()))
actors = sorted(set(
    movies['actor_1_name'].dropna().to_list()
    + movies['actor_2_name'].dropna().to_list()
    + movies['actor_3_name'].dropna().to_list()
))

In [26]:
# build edge lists from data
# Each relation is stored in both directions: (director -> movie, movie -> director)
# and (actor -> movie, movie -> actor). Node ids are positions in the sorted
# `directors` / `actors` lists; movie ids are the row labels of `movies`.
edge_list = {('director', 'directed', 'movie'): [], ('actor', 'played', 'movie'): [],
             ('movie', 'directed_by', 'director'): [], ('movie', 'played_by', 'actor'): []}

# name -> id lookup tables; a dict lookup replaces the per-row linear scans
# (`name in list` followed by `list.index(name)`) over the full name lists
director_to_idx = {name: idx for idx, name in enumerate(directors)}
actor_to_idx = {name: idx for idx, name in enumerate(actors)}

for movie_idx, row in movies.iterrows():
    # .get() returns None for names that are not in the table, which includes
    # missing (NaN) entries in the optional actor columns
    director_idx = director_to_idx.get(row['director_name'])
    if director_idx is not None:
        edge_list[('director', 'directed', 'movie')].append((director_idx, movie_idx))
        edge_list[('movie', 'directed_by', 'director')].append((movie_idx, director_idx))
    for actor_col in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
        actor_idx = actor_to_idx.get(row[actor_col])
        if actor_idx is not None:
            edge_list[('actor', 'played', 'movie')].append((actor_idx, movie_idx))
            edge_list[('movie', 'played_by', 'actor')].append((movie_idx, actor_idx))

In [27]:
# number of distinct actor ids that occur as the source of a 'played' edge
np.unique(np.asarray(edge_list[('actor', 'played', 'movie')])[:, 0]).size

5257

In [28]:
# create dgl heterograph
# The node counts are passed explicitly instead of being inferred from the
# largest id found in the edge lists, so the graph always has exactly one node
# per entry of `movies`, `directors` and `actors` — the same sizes as the
# feature matrices that are attached to the graph afterwards.
g = dgl.heterograph(
    edge_list,
    num_nodes_dict={'movie': len(movies),
                    'director': len(directors),
                    'actor': len(actors)})
print(g)

Graph(num_nodes={'actor': 5257, 'director': 2081, 'movie': 4278},
      num_edges={('actor', 'played', 'movie'): 12828, ('director', 'directed', 'movie'): 4278, ('movie', 'directed_by', 'director'): 4278, ('movie', 'played_by', 'actor'): 12828},
      metagraph=[('actor', 'movie', 'played'), ('movie', 'director', 'directed_by'), ('movie', 'actor', 'played_by'), ('director', 'movie', 'directed')])


In [29]:
# Bag-of-words encoding of every movie's plot keywords.
# min_df=2 keeps only tokens that occur in at least two movies; a movie with no
# plot keywords is vectorized from the empty string. movie_X is a sparse matrix
# with one row per movie.
vectorizer = CountVectorizer(min_df=2)
movie_X = vectorizer.fit_transform(movies['plot_keywords'].fillna('').to_numpy())
print(movie_X)

  (0, 230)	1
  (0, 1125)	1
  (0, 1673)	1
  (0, 1843)	1
  (1, 1170)	1
  (1, 1677)	2
  (1, 2164)	1
  (1, 2064)	1
  (1, 2483)	1
  (2, 343)	1
  (2, 948)	1
  (2, 2425)	1
  (2, 2578)	1
  (2, 2736)	1
  (3, 2736)	1
  (3, 753)	1
  (3, 1380)	1
  (3, 2095)	1
  (3, 1918)	1
  (3, 2085)	1
  (4, 102)	1
  (4, 119)	1
  (4, 546)	1
  (4, 2961)	1
  (4, 1654)	1
  :	:
  (4271, 1041)	1
  (4271, 2746)	1
  (4271, 2769)	1
  (4271, 2822)	1
  (4271, 2770)	1
  (4271, 1888)	1
  (4271, 1387)	1
  (4271, 1431)	1
  (4272, 193)	1
  (4272, 747)	1
  (4272, 1221)	1
  (4272, 1218)	1
  (4272, 1671)	1
  (4274, 129)	1
  (4274, 419)	1
  (4274, 3042)	1
  (4274, 802)	1
  (4274, 468)	1
  (4274, 1712)	1
  (4275, 2140)	1
  (4275, 2749)	1
  (4275, 1102)	1
  (4275, 3029)	1
  (4275, 2828)	1
  (4275, 2113)	1


In [30]:
# build the adjacency matrix for the graph consisting of movies, directors and actors
# 0 for movies, 1 for directors, 2 for actors
# Global node order is [movies | directors | actors]; type_mask records the
# node type of every global index.
dim = len(movies) + len(directors) + len(actors)
type_mask = np.zeros((dim), dtype=int)
type_mask[len(movies):len(movies)+len(directors)] = 1
type_mask[len(movies)+len(directors):] = 2

# offsets of the director and actor sections inside the global node order
director_offset = len(movies)
actor_offset = len(movies) + len(directors)

# name -> id lookup tables; a dict lookup replaces the per-row linear scans
# (`name in list` followed by `list.index(name)`) over the full name lists
director_to_idx = {name: idx for idx, name in enumerate(directors)}
actor_to_idx = {name: idx for idx, name in enumerate(actors)}

# symmetric 0/1 adjacency: movie <-> director and movie <-> actor
adjM = np.zeros((dim, dim), dtype=int)
for movie_idx, row in movies.iterrows():
    # .get() returns None for names that are not in the table, which includes
    # missing (NaN) entries in the optional actor columns
    director_idx = director_to_idx.get(row['director_name'])
    if director_idx is not None:
        adjM[movie_idx, director_offset + director_idx] = 1
        adjM[director_offset + director_idx, movie_idx] = 1
    for actor_col in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
        actor_idx = actor_to_idx.get(row[actor_col])
        if actor_idx is not None:
            adjM[movie_idx, actor_offset + actor_idx] = 1
            adjM[actor_offset + actor_idx, movie_idx] = 1

# assign features to directors and actors as the means of their associated movies' features
# rows: directors followed by actors; columns: movies
adjM_da2m = adjM[len(movies):, :len(movies)]
# Dividing every row by its own sum gives each linked movie the weight
# 1 / (number of linked movies). Broadcasting yields the same values as
# left-multiplying by a diagonal matrix, without materialising that matrix.
adjM_da2m_normalized = adjM_da2m / adjM_da2m.sum(axis=1, keepdims=True)
director_actor_X = scipy.sparse.csr_matrix(adjM_da2m_normalized).dot(movie_X)
director_X = director_actor_X[:len(directors), :]
actor_X = director_actor_X[len(directors):, :]
# features for all nodes, stacked in the global node order
full_X = scipy.sparse.vstack([movie_X, director_actor_X])

In [31]:
# Attach the (densified) feature matrix of every node type to the graph under
# the node-data key 'h_f'.
for node_type, node_feats in (('movie', movie_X),
                              ('actor', actor_X),
                              ('director', director_X)):
    g.nodes[node_type].data['h_f'] = torch.FloatTensor(node_feats.todense())

In [32]:
#save data
# Writes the graph, the labels, the edge lists and the per-node-type feature
# tensors as separate pickle files (protocol 4) under IMDB_DIR.
import os
import pickle

IMDB_DIR='../../data/imdb_preprocessed/'
# make sure the output directory exists before writing into it
os.makedirs(IMDB_DIR, exist_ok=True)

# dense float tensors, keyed by node type
feat={}
feat['movie']=torch.FloatTensor(movie_X.todense())
feat['actor']=torch.FloatTensor(actor_X.todense())
feat['director']=torch.FloatTensor(director_X.todense())

for file_name, payload in (("graph.pickle", g),
                           ("labels.pickle", labels),
                           ("edge_list.pickle", edge_list),
                           ("features.pickle", feat)):
    # the context manager closes (and flushes) each file as soon as it is written
    with open(os.path.join(IMDB_DIR, file_name), "wb") as out_file:
        pickle.dump(payload, out_file, protocol=4)

In [33]:
# Split the movie indices into train / validation / test sets with a fixed
# seed: 400 indices are held out for validation first, then 3478 of the
# remaining ones for testing; what is left is the training set.
rand_seed = 1566911444
train_idx, val_idx = train_test_split(
    np.arange(len(labels)), test_size=400, random_state=rand_seed)
train_idx, test_idx = train_test_split(
    train_idx, test_size=3478, random_state=rand_seed)

# store each index set in ascending order
train_idx, val_idx, test_idx = (
    np.sort(split) for split in (train_idx, val_idx, test_idx))

np.savez(IMDB_DIR + 'train_val_test_idx.npz',
         val_idx=val_idx,
         train_idx=train_idx,
         test_idx=test_idx)