# Graph construction

In [1]:
import numpy as np
import pandas as pd
import pickle
import itertools
import matplotlib.pyplot as plt
import networkx as nx

Loading the similarity matrices and the dataframe

In [2]:
Data_path = 'Data/'

# One pickled similarity matrix per feature, stored as `csim_<feature>`
# inside Data_path and collected in `sim_mat` keyed by feature name.
names = ['keywords', 'genre', 'crew', 'cast']
sim_mat = {}
for name in names:
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserialising -- only point this at files you produced yourself.
    with open(Data_path+'csim_'+name, 'rb') as src:
        loaded = pickle.load(src)
    sim_mat[name] = loaded
    # The loaded object must expose `.shape` (it is also accessed through
    # `.values` further down, so presumably a pandas DataFrame -- confirm).
    print(f'>>> Loading {name} similarity matrix with shape {loaded.shape}')

>>> Loading keywords similarity matrix with shape (4803, 4803)
>>> Loading genre similarity matrix with shape (4803, 4803)
>>> Loading crew similarity matrix with shape (4803, 4803)
>>> Loading cast similarity matrix with shape (4803, 4803)


**Alternative 1** Combine the 4 similarity matrices with equal weights (**to be optimized**) and prune the entries whose combined similarity is below 0.25

In [3]:
# Simple way: weighted sum of the similarity matrices using the same weight
# for every feature, followed by pruning of the weak entries.
equal_weights = [0.25, 0.25, 0.25, 0.25]  # one weight per entry of `names`

adj_mat = np.zeros(sim_mat['keywords'].shape)
for name, wi in zip(names, equal_weights):
    adj_mat += wi*sim_mat[name].values

# Entries strictly below 0.25 are zeroed out; everything else keeps its
# combined similarity value.
final_adjacency_mat = np.where(adj_mat < 0.25, 0, adj_mat)

# Keep the weights next to the feature names they apply to.
weight_dict = {'names':names, 'weights':list(equal_weights)}

**Alternative 2** Grid search the best combination of weights (weights that sum up to 1): for each candidate combination, build the weighted adjacency matrix, prune it, and record the metrics of the resulting graph

In [4]:
# Gridsearch way
# For every combination of weights on a 0.125 grid that sums to 1, build the
# weighted sum of the similarity matrices, prune it, turn it into a graph and
# record three metrics: average clustering coefficient, normalized-Laplacian
# spectrum and size of the largest connected component.
names = list(sim_mat.keys()) # name of similarity matrix
mat_dim = sim_mat['keywords'].shape # shape of them
prune_threshold = 0.25 # threshold to prune the summed adjacency matrix

# Range of weight to test: the same grid for every similarity matrix.
# The array is only read by itertools.product, so sharing it is safe.
weight_range = [np.arange(0.125, 1.0, 0.125)] * len(names)

weights_list = [] # weight tuples that were evaluated
clustering_coef = [] # average clustering coefficient per combination
eigenvals = [] # normalized Laplacian spectrum per combination
giant_comp_size = [] # number of nodes in the largest connected component

j = 0
for w in itertools.product(*weight_range): # iterate over all combination of weights
    # Keep only the combinations whose weights sum up to 1. np.isclose avoids
    # relying on exact float equality, which would silently drop valid
    # combinations if the grid step were not exactly representable.
    if not np.isclose(sum(w), 1.0):
        continue

    j += 1
    print(f'>>> Combination n°{j} with weights {w}')
    weights_list.append(w)

    adj_mat = np.zeros(mat_dim)
    for wi, name in zip(w, names): # sum the similarity matrices
        adj_mat += wi*sim_mat[name].values

    # Prune adjacency matrix: entries below the threshold become non-edges.
    # NOTE(review): if the similarity matrices have a non-zero diagonal, the
    # graph keeps self-loops, which enter the Laplacian spectrum -- confirm
    # this is intended.
    pruned_adj = np.where(adj_mat < prune_threshold, 0, adj_mat)

    # nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is
    # the supported equivalent for a plain ndarray.
    G = nx.from_numpy_array(pruned_adj)

    C = nx.average_clustering(G) # compute the average clustering coefficient

    # nx.connected_component_subgraphs was removed in networkx 2.4; the size
    # of the largest node set returned by connected_components is the same
    # number without building a subgraph.
    n_gc = len(max(nx.connected_components(G), key=len))

    # Full spectrum of the normalized Laplacian; only consumed by the
    # eigenvalue-based selection criterion (commented out in the next cell).
    eigenvals.append(nx.normalized_laplacian_spectrum(G))
    giant_comp_size.append(n_gc)
    clustering_coef.append(C)

    print(f'\t--> Clustering coefficient : {C} \n\t--> Giant component size : {n_gc}')

>>> Combination n°1 with weights (0.125, 0.125, 0.125, 0.625)
	--> Clustering coefficient : 0.0497875005370321 
	--> Giant component size : 19
>>> Combination n°2 with weights (0.125, 0.125, 0.25, 0.5)
	--> Clustering coefficient : 0.06932720621040421 
	--> Giant component size : 20
>>> Combination n°3 with weights (0.125, 0.125, 0.375, 0.375)
	--> Clustering coefficient : 0.09436658490374918 
	--> Giant component size : 24
>>> Combination n°4 with weights (0.125, 0.125, 0.5, 0.25)
	--> Clustering coefficient : 0.12299657238574094 
	--> Giant component size : 33
>>> Combination n°5 with weights (0.125, 0.125, 0.625, 0.125)
	--> Clustering coefficient : 0.15422879052267055 
	--> Giant component size : 45
>>> Combination n°6 with weights (0.125, 0.25, 0.125, 0.5)
	--> Clustering coefficient : 0.2854848597415816 
	--> Giant component size : 4128
>>> Combination n°7 with weights (0.125, 0.25, 0.25, 0.375)
	--> Clustering coefficient : 0.29668934374189415 
	--> Giant component size : 4114
>

Selecting the best combination of weights

In [None]:
# Pick the weight combination to keep. The active criterion is the largest
# giant component; the two commented lines are the alternative criteria
# (smallest eigenvalue sum, highest average clustering coefficient).
#idx = np.argmin(np.array(eigenvals).sum(axis=1)) # where the sum of eigenvalues is minimal
#idx = np.argmax(np.array(clustering_coef))
idx = np.argmax(giant_comp_size)
weights = weights_list[idx]

# Rebuild the weighted sum of the similarity matrices for the chosen weights.
combined = np.zeros(mat_dim)
for name, wi in zip(names, weights):
    combined += wi*sim_mat[name].values

# Same pruning rule as in the grid search: entries below the threshold are
# set to 0, the others keep their combined similarity.
final_adjacency_mat = np.where(combined < prune_threshold, 0, combined)
weight_dict = {'names':names, 'weights':weights}

Save the adjacency matrix and the weights 

In [None]:
# Persist the pruned adjacency matrix and the weights that produced it as two
# separate pickle files inside Data_path.
adjacency_file = Data_path+'Adjacency_matrix.pickle'
weights_file = Data_path+'Adjacency_weights.pickle'

with open(adjacency_file, 'wb') as handle:
    pickle.dump(final_adjacency_mat, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Adjacency matrix saved at '+adjacency_file)

with open(weights_file, 'wb') as handle:
    pickle.dump(weight_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Adjacency weights saved at '+weights_file)