# Graph construction

In [1]:
import numpy as np
import pandas as pd
import pickle
import itertools
import matplotlib.pyplot as plt
import networkx as nx

Loading the similarity matrices and the dataframe

In [2]:
# Directory holding the pickled inputs, and the similarity matrices to read from it.
Data_path = 'Data/'
names = ['keywords', 'genre', 'crew', 'cast']

# sim_mat maps each name to the object unpickled from '<Data_path>csim_<name>'.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load files produced by a trusted pipeline.
sim_mat = {}
for name in names:
    pickle_path = Data_path+'csim_'+name
    with open(pickle_path, 'rb') as src:
        loaded = pickle.load(src)
    sim_mat[name] = loaded
    print(f'>>> Loading {name} similaryty matrix of shape {loaded.shape}')

>>> Loading keywords similaryty matrix of shape (4803, 4803)
>>> Loading genre similaryty matrix of shape (4803, 4803)
>>> Loading crew similaryty matrix of shape (4803, 4803)
>>> Loading cast similaryty matrix of shape (4803, 4803)


Combine the 4 similarity matrices with equal weights (**to be optimized**) and prune the entries whose combined similarity is below 0.25

In [15]:
# Simple way: give every similarity matrix the same weight, sum them, then
# zero out the entries whose combined similarity falls below the threshold.
prune_threshold = 0.25  # combined similarities below this value are set to 0

# Single source of truth for the weights: the same list is applied to the
# matrices and stored in weight_dict, so the two can never disagree.
equal_weights = [1 / len(names)] * len(names)

adj_mat = np.zeros(sim_mat['keywords'].shape)
for wi, name in zip(equal_weights, names):
    adj_mat += wi*sim_mat[name].values

final_adjacency_mat = np.where(adj_mat < prune_threshold, 0, adj_mat)
weight_dict = {'names':names, 'weights':equal_weights}

In [None]:
# Gridsearch way: try every combination of weights (one weight per similarity
# matrix) that sums to 1, build the pruned graph for each combination, and
# record its average clustering coefficient and normalized-Laplacian spectrum.
names = list(sim_mat.keys()) # name of similarity matrix
mat_dim = sim_mat['keywords'].shape # shape of them
prune_threshold = 0.25 # threshold to prune the summed adjacency matrix

# Candidate values for each weight: one identical range per similarity matrix
weight_range = [np.arange(0, 1.01, 0.25) for _ in names]
weights_list = [] # weight combinations that were evaluated
clustering_coef = [] # average clustering coefficient of each combination
eigenvals = [] # normalized-Laplacian eigenvalues of each combination

j = 0
for w in itertools.product(*weight_range): # iterate over all combination of weights
    # Tolerant comparison: an exact `== 1` test silently drops valid
    # combinations as soon as the step is not exactly representable (e.g. 0.1).
    if not np.isclose(sum(w), 1.0):
        continue

    j += 1
    print(f'>>> Combination n°{j} with weights {w}')
    weights_list.append(w)

    adj_mat = np.zeros(mat_dim)
    for wi, name in zip(w, names): # sum the similarity matrices
        adj_mat += wi*sim_mat[name].values

    # Prune the weak entries, then build the graph. from_numpy_array replaces
    # from_numpy_matrix, which was removed in networkx 3.0.
    pruned_adj_mat = np.where(adj_mat < prune_threshold, 0, adj_mat)
    G = nx.from_numpy_array(pruned_adj_mat)
    C = nx.average_clustering(G) # compute the average clustering coefficient
    eigenvals.append(nx.linalg.spectrum.normalized_laplacian_spectrum(G))

    print(f'\t--> Clustering coefficient : {C}')
    clustering_coef.append(C)

Selecting the best combination of weights

In [None]:
# Select the weight combination whose eigenvalues have the smallest sum, then
# rebuild and prune the adjacency matrix with those weights.
# np.array(eigenvals) has one row per tested combination, so summing over
# axis=1 already yields a 1-D array; argmin must therefore be called without
# an axis argument (axis=1 raises AxisError on a 1-D array).
eigenval_sums = np.array(eigenvals).sum(axis=1)
idx = int(np.argmin(eigenval_sums)) # where the sum of eigenvalues is minimal
weights = weights_list[idx]

final_adjacency_mat = np.zeros(mat_dim)
for wi, name in zip(weights, names): # sum the similarity matrices
    final_adjacency_mat += wi*sim_mat[name].values

# Zero out the entries whose combined similarity is below the threshold
final_adjacency_mat = np.where(final_adjacency_mat < prune_threshold, 0, final_adjacency_mat)
weight_dict = {'names':names, 'weights':weights}

Save the adjacency matrix and the weights 

In [17]:
def _pickle_to_data_dir(obj, file_name, message_prefix):
    """Pickle `obj` to Data_path+file_name and print where it was written.

    The confirmation line is `message_prefix` followed by the target path.
    """
    target = Data_path+file_name
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(message_prefix+target)


# Persist the pruned adjacency matrix first, then the weights that produced it.
_pickle_to_data_dir(final_adjacency_mat, 'Adjacency_matrix.pickle', 'Adjacency matrix saved at ')
_pickle_to_data_dir(weight_dict, 'Adjacency_weights.pickle', 'Adjacency weights saved at ')

Adjacency matrix saved at Data/Adjacency_matrix.pickle
Adjacency weights saved at Data/Adjacency_weights.pickle
