In [1]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split

In [2]:
# number of node types in the graph (movies, directors, actors)
num_ntypes = 3
# path prefix under which every preprocessed artifact is written
save_prefix = 'data/preprocessed_custom/IMDB_processed/'

In [3]:
# Read the raw IMDB metadata and keep only rows that have both a first-billed
# actor and a director; the index is renumbered 0..n-1 afterwards.
required_columns = ['actor_1_name', 'director_name']
movies = (
    pd.read_csv('data/raw/IMDB/movie_metadata.csv', encoding='utf-8')
    .dropna(axis=0, subset=required_columns)
    .reset_index(drop=True)
)

In [4]:
# Print a summary of the filtered frame: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932 entries, 0 to 4931
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4915 non-null   object 
 1   director_name              4932 non-null   object 
 2   num_critic_for_reviews     4888 non-null   float64
 3   duration                   4919 non-null   float64
 4   director_facebook_likes    4932 non-null   float64
 5   actor_3_facebook_likes     4919 non-null   float64
 6   actor_2_name               4928 non-null   object 
 7   actor_1_facebook_likes     4932 non-null   float64
 8   gross                      4152 non-null   float64
 9   genres                     4932 non-null   object 
 10  actor_1_name               4932 non-null   object 
 11  movie_title                4932 non-null   object 
 12  num_voted_users            4932 non-null   int64  
 13  cast_total_facebook_likes  4932 non-null   int64

In [5]:
# Derive one class label per movie from its '|'-separated genre string and
# drop every movie that belongs to none of the target genres.
# The first target genre encountered in the string decides the label:
# Action -> 0, Comedy -> 1, Drama -> 2; anything else is marked -1.
genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}
labels = np.full(len(movies), -1, dtype=int)
for movie_idx, genres in movies['genres'].items():
    labels[movie_idx] = next(
        (genre_to_label[name] for name in genres.split('|') if name in genre_to_label),
        -1,
    )
# positions of movies without a target genre; removed from both labels and frame
unwanted_idx = np.flatnonzero(labels == -1)
labels = np.delete(labels, unwanted_idx, 0)
movies = movies.drop(unwanted_idx).reset_index(drop=True)

In [6]:
# Sorted, de-duplicated name lists; a name's position in its list is later used
# as the node's offset within its node-type range.
directors = sorted(set(movies['director_name'].dropna()))

# actors are pooled from all three billing columns
actor_names = set()
for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
    actor_names.update(movies[actor_column].dropna().to_list())
actors = sorted(actor_names)

In [8]:
# Allocate the node-type mask and an empty adjacency matrix for the joint graph.
# Node ordering is [movies | directors | actors]; type ids are
# 0 for movies, 1 for directors, 2 for actors.
nodes_per_type = [len(movies), len(directors), len(actors)]
dim = sum(nodes_per_type)
type_mask = np.repeat(np.arange(3), nodes_per_type)

# dense (dim x dim) integer matrix, filled in by a later cell
adjM = np.zeros((dim, dim), dtype=int)

print(f'dim: {dim}')

dim: 11616


In [15]:
# Sanity check: number of unique director names (one graph node each).
len(directors)

2081

In [10]:
# Display the current `movies` DataFrame for visual inspection.
movies

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4273,Color,Anthony Vallone,,84.0,2.0,2.0,John Considine,45.0,,Crime|Drama,...,1.0,English,USA,PG-13,3250.0,2005.0,44.0,7.8,,4
4274,Color,Edward Burns,14.0,95.0,0.0,133.0,Caitlin FitzGerald,296.0,4584.0,Comedy|Drama,...,14.0,English,USA,Not Rated,9000.0,2011.0,205.0,6.4,,413
4275,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4276,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16


In [7]:

# Fill the symmetric adjacency matrix: connect every movie node to its director
# node and to each of its (up to three) actor nodes.
#
# Name -> position dictionaries replace the original `name in list` +
# `list.index(name)` pattern, which scanned the whole list twice per lookup.
# Both lists hold unique names, so the dictionary position equals what
# `list.index` returned and the resulting matrix is unchanged.
director_position = {name: pos for pos, name in enumerate(directors)}
actor_position = {name: pos for pos, name in enumerate(actors)}

# global node ids are laid out as [movies | directors | actors]
director_offset = len(movies)
actor_offset = len(movies) + len(directors)

for movie_idx, row in movies.iterrows():
    director_idx = director_position.get(row['director_name'])
    if director_idx is not None:
        adjM[movie_idx, director_offset + director_idx] = 1
        adjM[director_offset + director_idx, movie_idx] = 1
    for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
        # missing (NaN) names are not keys of the lookup and are skipped
        actor_idx = actor_position.get(row[actor_column])
        if actor_idx is None:
            continue
        adjM[movie_idx, actor_offset + actor_idx] = 1
        adjM[actor_offset + actor_idx, movie_idx] = 1

In [8]:
# extract bag-of-word representations of plot keywords for each movie
# movie_X is a sparse matrix (movies x vocabulary); missing keywords count as ''
vectorizer = CountVectorizer(min_df=2)
movie_X = vectorizer.fit_transform(movies['plot_keywords'].fillna('').values)
# assign features to directors and actors as the means of their associated movies' features
# rows: director/actor nodes, columns: movie nodes
adjM_da2m = adjM[len(movies):, :len(movies)]
movies_per_node = adjM_da2m.sum(axis=1)
# Guarded reciprocal: a node with no linked movie gets weight 0 (an all-zero
# feature row) instead of a division-by-zero inf/nan.
inv_movies_per_node = np.divide(
    1.0,
    movies_per_node,
    out=np.zeros(movies_per_node.shape, dtype=float),
    where=movies_per_node > 0,
)
# Row-scale by broadcasting. This yields the same values as
# np.diag(1 / degree).dot(adjM_da2m) without materialising a dense square
# diagonal matrix or running a dense matrix product.
adjM_da2m_normalized = adjM_da2m * inv_movies_per_node[:, np.newaxis]
director_actor_X = scipy.sparse.csr_matrix(adjM_da2m_normalized).dot(movie_X)
# stack in node order: movies first, then directors and actors
full_X = scipy.sparse.vstack([movie_X, director_actor_X])

In [9]:
# Metapath schemas to extract; entry i lists the metapaths that start and end
# at node type i (0 = movie, 1 = director, 2 = actor).
expected_metapaths = [
    [(0, 1, 0), (0, 2, 0)],
    [(1, 0, 1), (1, 0, 2, 0, 1)],
    [(2, 0, 2), (2, 0, 1, 0, 2)]
]
# make sure one output directory per node type exists before anything is written
for ntype in range(num_ntypes):
    pathlib.Path(f'{save_prefix}{ntype}').mkdir(parents=True, exist_ok=True)
for ntype in range(num_ntypes):
    metapaths = expected_metapaths[ntype]
    # output path stem per metapath, e.g. '<save_prefix>0/0-1-0'
    file_stems = [
        f'{save_prefix}{ntype}/' + '-'.join(str(node_type) for node_type in metapath)
        for metapath in metapaths
    ]

    # metapath-based neighbor pairs for this node type
    neighbor_pairs = utils.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, metapaths)

    # one networkx graph per metapath, saved as an adjacency list
    G_list = utils.preprocess.get_networkx_graph(neighbor_pairs, type_mask, ntype)
    for G, stem in zip(G_list, file_stems):
        nx.write_adjlist(G, stem + '.adjlist')

    # node indices along every metapath instance, one array per metapath
    all_edge_metapath_idx_array = utils.preprocess.get_edge_metapath_idx_array(neighbor_pairs)
    for stem, edge_metapath_idx_array in zip(file_stems, all_edge_metapath_idx_array):
        np.save(stem + '_idx.npy', edge_metapath_idx_array)

# Persist the graph-wide artifacts.
# adjacency matrix over all nodes, stored in sparse CSR form
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# one feature matrix per node type (movies, directors, actors)
for ntype in range(num_ntypes):
    rows_of_type = np.flatnonzero(type_mask == ntype)
    scipy.sparse.save_npz(f'{save_prefix}features_{ntype}.npz', full_X[rows_of_type])
# node type id of every node
np.save(save_prefix + 'node_types.npy', type_mask)
# genre label of every movie
np.save(save_prefix + 'labels.npy', labels)

# Train/validation/test split over movie indices: 400 indices are held out for
# validation first, then 3478 of the remainder become the test set; both
# splits reuse the same fixed seed.
rand_seed = 1566911444
all_movie_idx = np.arange(len(labels))
train_idx, val_idx = train_test_split(all_movie_idx, test_size=400, random_state=rand_seed)
train_idx, test_idx = train_test_split(train_idx, test_size=3478, random_state=rand_seed)
# store each index set in ascending order
train_idx = np.sort(train_idx)
val_idx = np.sort(val_idx)
test_idx = np.sort(test_idx)
np.savez(
    save_prefix + 'train_val_test_idx.npz',
    val_idx=val_idx,
    train_idx=train_idx,
    test_idx=test_idx,
)

(17446, 3)
(95102, 3)
(4278, 3)
(95102, 5)
(38476, 3)
(156928, 5)
