# Machine Learning

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

import random

import dgl.function as fn
from dgl import DGLGraph

import time

import torch
import torch.nn as nn
import torch.nn.functional as F

### Loading the similarity matrices and generate graph

In [2]:
Data_path = 'Data/'

sim_mat = {}
names = ['keywords', 'genre', 'crew', 'cast']
for name in names:
    with open(Data_path+'csim_'+name, 'rb') as src:
        sim_mat[name] = pickle.load(src)
        print(f'>>> Loading {name} similarity matrix with shape {sim_mat[name].shape}')

>>> Loading keywords similarity matrix with shape (4802, 4802)
>>> Loading genre similarity matrix with shape (4802, 4802)
>>> Loading crew similarity matrix with shape (4802, 4802)
>>> Loading cast similarity matrix with shape (4802, 4802)


**Alternative 1** Combine the 4 similarity matrices equitably and pruned those with a similarity below 0.25

In [3]:
# Simple way
adj_mat = sim_mat['keywords'].copy()
for name in names[1:]:
    adj_mat = adj_mat.add(sim_mat[name], fill_value=0)
adj_mat = adj_mat/4    

threshold = 0.25
adj_mat[adj_mat < threshold] = 0

# Generate graph
G = DGLGraph(graph_data=adj_mat.values)

### Loading features and labels

In [10]:
features = ['budget','popularity','revenue','runtime','vote_average','vote_count','Nominations_GoldenGlobes']

IMDB_path = Data_path + 'nodes_attributes.csv'
IMDB = pd.read_csv(IMDB_path)
IMDB.drop(columns = ['Unnamed: 0','id'], inplace=True)
IMDB.set_index('title',inplace=True)

# features
IMDB_feat = IMDB[features]
tensor_feat = torch.FloatTensor(IMDB_feat.values)

# labels
IMDB_nom = IMDB['Nominations_Oscars'].copy()
IMDB_nom.loc[IMDB_nom > 0] = 1
tensor_nom = torch.LongTensor(IMDB_nom.values)

### Generate masks

In [5]:
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=0)

for prov_index, test_index in sss.split(tensor_feat, tensor_nom):
    prov_mask = prov_index
    test_mask = test_index

for train_index, val_index in sss.split(tensor_feat[prov_mask], tensor_nom[prov_mask]):
    train_mask = train_index
    val_mask = val_index

In [6]:
class LaplacianPolynomial(nn.Module):
    def __init__(self,
                 in_feats: int,
                 out_feats: int,
                 k: int,
                 dropout_prob: float,
                 norm=True):
        super().__init__()
        self._in_feats = in_feats
        self._out_feats = out_feats
        self._k = k
        self._norm = norm
        # Contains the weights learned by the Laplacian polynomial
        self.pol_weights = nn.Parameter(torch.Tensor(self._k + 1))
        # Contains the weights learned by the logistic regression (without bias)
        self.logr_weights = nn.Parameter(torch.Tensor(in_feats, out_feats))
        self.dropout = nn.Dropout(p=dropout_prob)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialize learnable parameters."""
        torch.manual_seed(0)
        torch.nn.init.xavier_uniform_(self.logr_weights, gain=0.01)
        torch.nn.init.normal_(self.pol_weights, mean=0.0, std=1e-3)

    def forward(self, graph, feat):
        r"""Compute graph convolution.

        Notes
        -----
        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
          dimensions, :math:`N` is the number of nodes.
        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
          the same shape as the input.

        Parameters
        ----------
        graph (DGLGraph) : The graph.
        feat (torch.Tensor): The input feature

        Returns
        -------
        (torch.Tensor) The output feature
        """
        feat = self.dropout(feat)
        graph = graph.local_var()
        
        # D^(-1/2)
        norm = torch.pow(graph.in_degrees().float().clamp(min=1), -0.5)
        shp = norm.shape + (1,) * (feat.dim() - 1)
        norm = torch.reshape(norm, shp)
        
        # mult W first to reduce the feature size for aggregation.
        feat = torch.matmul(feat, self.logr_weights) # X*Teta

        result = self.pol_weights[0] * feat.clone() # a0*L^0*X*Teta <-- fisrt polynomial weight a0 * L^0 * x

        for i in range(1, self._k + 1): # get the next polynomial coefficient (a1*L^1, a2*L^2, ..... ak*L^k) 
            old_feat = feat.clone()
            if self._norm:
                feat = feat * norm
            graph.ndata['h'] = feat
            # Feat is not modified in place
            graph.update_all(fn.copy_src(src='h', out='m'),
                             fn.sum(msg='m', out='h')) # update all nodes with msg function copy_src (get data from source node) and reduce function sum
            if self._norm:
                graph.ndata['h'] = graph.ndata['h'] * norm

            feat = old_feat - graph.ndata['h']
            result += self.pol_weights[i] * feat

        return result

    def extra_repr(self):
        """Set the extra representation of the module,
        which will come into effect when printing the model.
        """
        summary = 'in={_in_feats}, out={_out_feats}'
        summary += ', normalization={_norm}'
        return summary.format(**self.__dict__)

Once we have are model ready we just need to create a function that performs one step of our training loop, and another one that evaluates our model.

In [7]:
def train(model, g, features, labels, loss_fcn, train_mask, optimizer):
    model.train()  # Activate dropout
    
    logits = model(g, features) # prediction
    loss = loss_fcn(logits[train_mask], labels[train_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

def evaluate(model, g, features, labels, mask):
    model.eval()  # Deactivate dropout
    with torch.no_grad():
        logits = model(g, features)[mask]  # only compute the evaluation set
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        acc = correct.item() * 1.0 / len(labels)
        f1 = f1_score(labels, indices)
        #acc = torch.sum((logits.round() == labels).diagonal()).item() * 1.0 / len(labels)
        return f1, acc

Choose the training parameters.

In [8]:
in_feats = len(features)
output = 2
pol_order = 3
lr = 0.02
weight_decay = 5e-6
n_epochs = 1000
p_dropout = 0.2

And train the classifier end to end.

In [11]:
model = LaplacianPolynomial(in_feats, output, pol_order, p_dropout)

loss_fcn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)

dur = []
for epoch in range(n_epochs):
    if epoch >= 3:
        t0 = time.time()
    loss = train(model, G, tensor_feat, tensor_nom, loss_fcn, train_mask, optimizer)
    if epoch >= 3:
        dur.append(time.time() - t0)
        
    f1, acc = evaluate(model, G, tensor_feat, tensor_nom, val_mask)
    
    if epoch%50 == 0:
        print("Epoch {:05d} | Time(s) {:.4f} | Train Loss {:.4f} | Val f1 {:.4%} | Val Accuracy {:.4%}". format(
                epoch+1, np.mean(dur), loss.item(), f1, acc))

print()
f1, acc = evaluate(model, G, tensor_feat, tensor_nom, test_mask)
print("Test Accuracy {:.4%}".format(acc))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch 00001 | Time(s) nan | Train Loss 395.2351 | Val f1 35.0993% | Val Accuracy 49.0247%
Epoch 00051 | Time(s) 0.0065 | Train Loss 240.5186 | Val f1 31.4530% | Val Accuracy 47.8544%
Epoch 00101 | Time(s) 0.0070 | Train Loss 3.4966 | Val f1 25.6983% | Val Accuracy 65.4096%
Epoch 00151 | Time(s) 0.0067 | Train Loss 0.7440 | Val f1 25.5319% | Val Accuracy 59.0377%
Epoch 00201 | Time(s) 0.0065 | Train Loss 0.6160 | Val f1 9.5238% | Val Accuracy 77.7633%
Epoch 00251 | Time(s) 0.0065 | Train Loss 0.6144 | Val f1 15.8416% | Val Accuracy 77.8934%
Epoch 00301 | Time(s) 0.0064 | Train Loss 0.6328 | Val f1 1.2048% | Val Accuracy 78.6736%
Epoch 00351 | Time(s) 0.0063 | Train Loss 0.6147 | Val f1 10.2273% | Val Accuracy 79.4538%
Epoch 00401 | Time(s) 0.0062 | Train Loss 0.6182 | Val f1 2.3529% | Val Accuracy 78.4135%
Epoch 00451 | Time(s) 0.0061 | Train Loss 0.6281 | Val f1 20.8333% | Val Accuracy 75.2926%
Epoch 00501 | Time(s) 0.0061 | Train Loss 0.6161 | Val f1 20.1754% | Val Accuracy 76.3329%
E

In [None]:
logits = model(G, tensor_feat)[val_mask]