# Predicting 8 new movies

In [1]:
import numpy as np
import pandas as pd
import ast
import pickle

import sklearn.metrics
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_fscore_support, accuracy_score
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import random

import dgl.function as fn
from dgl import DGLGraph
import dgl.nn.pytorch as dgl_nn
import dgl.transform as dgl_transform

import time

import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Directory (relative to the notebook) that is prefixed to every data file name loaded below
Data_path = '../Data/'

In [2]:
def get_list(df, col, min_nbr=0, key='name'):
    """
    List the distinct values stored under `key` in the dictionaries of column `col`,
    keeping only the values that appear strictly more than `min_nbr` times.
    INPUT
        |---- df [pandas Dataframe] the dataframe with the data
        |---- col [string] name of the column to inspect; every cell is a string holding
        |         a Python literal list of dictionaries (parsed with ast.literal_eval)
        |---- min_nbr [int] a value is kept only if its number of appearances is > min_nbr
        |---- key [string] dictionary key whose value is extracted from each dictionary
    OUTPUT
        |---- l [list] list of the kept values, ordered as returned by value_counts
        |         (most frequent first)
    """
    # Only `col` is needed: parse it directly instead of copying the whole frame and
    # requiring an unrelated 'title' column to exist.
    parsed = df[col].apply(lambda x: [value[key] for value in ast.literal_eval(x)])
    # One row per extracted value; an empty list becomes NaN, which value_counts ignores.
    counts = parsed.explode().value_counts()
    return list(counts[counts > min_nbr].index)

def add_dummy_features(df, col, min_nbr, key='name'):
    """
    Replace the column `col` of the dataframe df by one indicator column per value
    appearing strictly more than min_nbr times in that column.
    INPUT
        |---- df [pandas Dataframe] the dataframe with the data
        |---- col [string] name of the column to dummify; every cell is a string holding
        |         a Python literal list of dictionaries (parsed with ast.literal_eval)
        |---- min_nbr [int] a value is kept only if its number of appearances is > min_nbr
        |---- key [string] dictionary key whose value is extracted from each dictionary
    OUTPUT
        |---- df [pandas Dataframe] the input columns without `col`, followed by one integer
        |         column per kept value (sorted by value) counting how many times the value
        |         appears in the row
    """
    # parse every cell once: list of the `key` entries of its dictionaries
    X = df[col].apply(lambda x: [value[key] for value in ast.literal_eval(x)])
    # values appearing more than min_nbr times (same rule as get_list); a set gives O(1) lookups
    counts = X.explode().value_counts()
    kept = set(counts[counts > min_nbr].index)
    # keep only the retained values
    X = X.apply(lambda values: [val for val in values if val in kept])
    # Work on row positions so that a non-unique index in df cannot merge distinct rows.
    # explode() yields one line per value (NaN for an empty list, i.e. an all-zero dummy row);
    # groupby(level=0).sum() folds the lines back to one row per original row. It replaces
    # DataFrame.sum(level=0, axis=1), which no longer exists in pandas >= 2.0.
    exploded = X.reset_index(drop=True).explode()
    dummies = pd.get_dummies(exploded, dtype=int).groupby(level=0).sum()
    dummies = dummies.reindex(range(len(df)), fill_value=0)
    dummies.index = df.index
    # add the new features to the dataframe
    return pd.concat([df, dummies], axis=1).drop(columns=[col])

def do_standardisation(data, train_mask):
    '''
    DESCRIPTION: standardise every feature column with a StandardScaler whose statistics
                 are estimated on the training rows only
    INPUT:
        |--- data: [pandas Dataframe] feature matrix (one row per sample, one column per feature)
        |--- train_mask: [list] positional indices of the train samples used to fit the scaler
    OUTPUT:
        |--- data: [pandas Dataframe] all rows of the feature matrix after scaling, with the
        |          same column names as the input and a fresh default index
    '''
    # fit on the training rows only, so the other rows do not influence the scaling statistics
    train_values = data.iloc[train_mask].to_numpy()
    scaler = StandardScaler().fit(train_values)
    # apply the fitted scaling to every row
    scaled_values = scaler.transform(data.to_numpy())
    return pd.DataFrame(data=scaled_values, columns=data.columns)

def train(model, g, features, labels, train_mask, loss_fcn, optimizer):
    """ 
    DESCRIPTION : Run one optimisation step of the model on the training set and report
                  the training metrics of the forward pass used for that step
    INPUT:
        |--- model: [] classification model to train
        |--- g: [DGLgraph] DeepGraphLearning graph object
        |--- features: [FloatTensor] 2D tensor containing samples' features
        |--- labels: [LongTensor] 1D tensor containing samples' labels (0-1)
        |--- train_mask: [np.array] indices of training set
        |--- loss_fcn: pytorch loss function chosen for model training
        |--- optimizer: pytorch model optimizer 
    OUTPUT:
        |--- loss: [Tensor] scalar loss of this step, detached from the autograd graph
        |          (use loss.item() to get a python float)
        |--- precision: [float] precision of class 1 on the training set
        |--- recall: [float] recall of class 1 on the training set
        |--- f1: [float] f1 score of class 1 on the training set
        |--- support: [int] number of training samples whose true label is 1
        |--- acc: [float] classification accuracy on the training set
    """
    model.train()

    pred = model(g, features)[train_mask]
    target = labels[train_mask]
    loss = loss_fcn(pred, target)

    # predicted class = index of the largest score; metrics need no gradient
    _, indices = torch.max(pred.detach(), dim=1)
    predicted = indices.numpy()
    acc = sklearn.metrics.accuracy_score(target, predicted, normalize=True, sample_weight=None)
    pre, rec, f1, sup = precision_recall_fscore_support(target, predicted)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Return a detached loss: callers store it every epoch, and an attached tensor would
    # keep a reference to the autograd graph of each step alive.
    return loss.detach(), pre[1], rec[1], f1[1], sup[1], acc
    
def evaluate(model, g, features, mask, labels):
    """ 
    DESCRIPTION : Score the model on the samples selected by mask, without updating it
    INPUT:
        |--- model: [] classification model to evaluate
        |--- g: [DGLgraph] DeepGraphLearning graph object
        |--- features: [FloatTensor] 2D tensor containing samples' features
        |--- mask: [np.array] indices of the samples to evaluate on
        |--- labels: [LongTensor] 1D tensor containing samples' labels (0-1)
    OUTPUT:
        |--- acc: [float] classification accuracy
        |--- precision: [float] precision of class 1
        |--- recall: [float] recall of class 1
        |--- f1: [float] f1 score of class 1
        |--- support: [int] number of evaluated samples whose true label is 1
        |--- C: [np.array] confusion matrix
    """
    model.eval()

    with torch.no_grad():
        scores = model(g, features)[mask]
        targets = labels[mask]
        # predicted class = index of the largest score of each sample
        predicted = torch.max(scores, dim=1)[1].numpy()
        acc = accuracy_score(targets, predicted)
        pre, rec, f1, sup = precision_recall_fscore_support(targets, predicted)
        C = confusion_matrix(targets, predicted)

    return acc, pre[1], rec[1], f1[1], sup[1], C

## Load Cosine Similarity Matrix

In [3]:
# NOTE: pickle.load can execute arbitrary code while loading; only open files from a trusted source.
with open(Data_path + 'csim_all_2020', 'rb') as src:
    sim_mat = pickle.load(src)
print(f'>>> Loading csim_all_2020 similarity matrix with shape {sim_mat.shape}')

>>> Loading csim_all_2020 similarity matrix with shape (4810, 4810)


Construct the Adjacency matrix : pruning at 0.25

In [4]:
# Adjacency matrix: a similarity is kept only when it is strictly above 0.25,
# every entry <= 0.25 is set to 0
A = np.where(sim_mat.values <= 0.25, 0, sim_mat.values)

Create the dgl graph

In [5]:
# Build the DGL graph from the pruned adjacency matrix and pass it through add_self_loop
G = dgl_transform.add_self_loop(DGLGraph(graph_data=A))

## Load the attributes and build the features, labels and index dataframes

In [6]:
# Load the merged attribute table; the first column of the CSV file is used as the index
df = pd.read_csv(Data_path + 'merged_data_2020.csv', index_col=0)

In [7]:
# features: numeric attributes plus the dummified 'genres' column ('title' is dropped afterwards)
features_df = add_dummy_features(
    df[['title', 'budget', 'genres', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']],
    'genres',
    min_nbr=0,
)
features_df = features_df.drop(columns='title').reset_index(drop=True)
# labels
labels_df = df[['Nominations', 'Awards']].reset_index(drop=True)
# index names
nodes_name = df[['title']].reset_index(drop=True)

In [8]:
# create the binary label: every positive nomination count is set to 1, other values are left unchanged
IMDB_nom = labels_df['Nominations'].copy()
IMDB_nom.loc[IMDB_nom > 0] = 1
# Checking class imbalance
IMDB_nom.value_counts() # class 1 is 1074 of the 4802 counted movies, i.e. about 22.37 %

0.0    3728
1.0    1074
Name: Nominations, dtype: int64

## Log-Transform of some features

In [9]:
# Log-transform of the budget, popularity, revenue and vote_count features.
# Non-positive entries are masked to NaN before taking the log, and every NaN of the
# result is then replaced by 0.
feat_names = ['budget', 'popularity', 'revenue', 'vote_count']
features_to_transform = features_df[feat_names]
transformed_feat = features_df.copy()
transformed_feat.loc[:, feat_names] = np.log(
    features_to_transform.mask(features_to_transform <= 0)
).fillna(0)

--> The features are stored in `transformed_feat` and the labels in `IMDB_nom`.

## Split the data in train and test sets
Train on the 4802 nodes for which the nomination status is known.
<br>Test whether the 8 popular movies of 2019 are predicted as nominated.

In [10]:
# positions 0..4801 are the training nodes, every remaining position is a test node
train_mask = np.arange(4802)
test_mask = np.arange(4802, A.shape[0])

## Standardize

In [11]:
# Standardization of feature matrix: the scaler is fitted on the training rows only
# and then applied to every row (see do_standardisation)
transformed_feat_std = do_standardisation(transformed_feat, train_mask)

## Create Tensors 

In [12]:
# standardized features as a float tensor
tensor_data_std = torch.FloatTensor(transformed_feat_std.to_numpy())
# binary labels as a long (integer) tensor
tensor_labels = torch.LongTensor(IMDB_nom.to_numpy())

## Training the three models

### Logistic Regression

In [13]:
# Fit the logistic regression on the standardized training features
clf = LogisticRegression(C=1, random_state=0, solver='lbfgs')
clf = clf.fit(transformed_feat_std.iloc[train_mask].to_numpy(), IMDB_nom.values[train_mask])
train_pred = clf.predict(transformed_feat_std.iloc[train_mask].to_numpy())

# Train set results (per-class arrays; index 1 is the nominated class)
tr_pre_log, tr_rec_log, tr_f1_log, tr_sup_log = precision_recall_fscore_support(
    IMDB_nom.values[train_mask], train_pred
)
tr_acc_log = sklearn.metrics.accuracy_score(IMDB_nom.values[train_mask], train_pred)

print('Training set:')
print(f'>>> Precision: {tr_pre_log[1]:0.4}')
print(f'>>> Recall: {tr_rec_log[1]:0.4}')
print(f'>>> F1: {tr_f1_log[1]:0.4}')
print(f'>>> Support: {tr_sup_log[1]}')
print('')

Training set:
>>> Precision: 0.6909
>>> Recall: 0.487
>>> F1: 0.5713
>>> Support: 1074



### Graph Filter + Logistic Regression

In [14]:
class LaplacianPolynomial(nn.Module):
    """Polynomial graph filter of order k applied to linearly projected node features.

    The module learns k + 1 polynomial coefficients (`pol_weights`) and a bias-free
    projection matrix of shape (in_feats, out_feats) (`logr_weights`).
    """

    def __init__(self,
                 in_feats: int,
                 out_feats: int,
                 k: int,
                 dropout_prob: float,
                 norm=True):
        super().__init__()
        self._in_feats, self._out_feats = in_feats, out_feats
        self._k = k
        self._norm = norm
        # k + 1 coefficients of the polynomial (orders 0..k)
        self.pol_weights = nn.Parameter(torch.Tensor(k + 1))
        # projection weights of the logistic regression (no bias term)
        self.logr_weights = nn.Parameter(torch.Tensor(in_feats, out_feats))
        self.dropout = nn.Dropout(p=dropout_prob)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the learnable parameters after resetting the global torch seed to 0."""
        torch.manual_seed(0)
        torch.nn.init.xavier_uniform_(self.logr_weights, gain=0.01)
        torch.nn.init.normal_(self.pol_weights, mean=0.0, std=1e-3)

    def forward(self, graph, feat):
        r"""Apply dropout, project the features and filter them on the graph.

        Notes
        -----
        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
          dimensions, :math:`N` is the number of nodes.
        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
          the same shape as the input.

        Parameters
        ----------
        graph (DGLGraph) : The graph.
        feat (torch.Tensor): The input feature

        Returns
        -------
        (torch.Tensor) The weighted sum over orders 0..k of the filtered features
        """
        graph = graph.local_var()
        feat = self.dropout(feat)

        # in_degree^(-1/2) per node (degree clamped to >= 1), shaped to broadcast over features
        inv_sqrt_deg = graph.in_degrees().float().clamp(min=1).pow(-0.5)
        inv_sqrt_deg = inv_sqrt_deg.reshape(inv_sqrt_deg.shape + (1,) * (feat.dim() - 1))

        # project first, so the aggregation runs on out_feats columns
        current = torch.matmul(feat, self.logr_weights)

        # order-0 term of the polynomial
        result = self.pol_weights[0] * current.clone()

        for order in range(1, self._k + 1):
            previous = current.clone()
            # every node sums the (optionally degree-scaled) features of its source nodes
            graph.ndata['h'] = current * inv_sqrt_deg if self._norm else current
            graph.update_all(fn.copy_src(src='h', out='m'),
                             fn.sum(msg='m', out='h'))
            aggregated = graph.ndata['h']
            if self._norm:
                aggregated = aggregated * inv_sqrt_deg

            # features of this order = previous order minus its neighbourhood aggregate
            current = previous - aggregated
            result += self.pol_weights[order] * current

        return result

    def extra_repr(self):
        """Return the extra text shown for this module when the model is printed."""
        return 'in={}, out={}, normalization={}'.format(
            self._in_feats, self._out_feats, self._norm)

In [15]:
# Best Model based on F1 score
pol_order = 3 
lr = 0.3
weight_decay = 5e-05
n_epochs = 1000 
p_dropout = 0.2 
n_classes = 2
in_feats=tensor_data_std.shape[1]

true_ratio = 1074/4802 # <-- fraction of Nominations
# class 0 is weighted by the fraction of class 1 and vice versa, to rebalance classes
weights_loss = torch.FloatTensor([true_ratio, 1-true_ratio])

# Training Laplacian Polynomial Graph Filter & Logistic Regression with best hyperparameters
model_GF_LR = LaplacianPolynomial(in_feats, n_classes, pol_order, p_dropout)

loss_fcn = torch.nn.CrossEntropyLoss(weight=weights_loss)
optimizer = torch.optim.Adam(model_GF_LR.parameters(),lr=lr,weight_decay=weight_decay)

losses_logF = []
f1_tr = []
for epoch in range(n_epochs):
    loss, pre_tr_logF, rec_tr_logF, f1_tr_logF, sup_tr_logF, acc_tr_logF = train(model_GF_LR, G, tensor_data_std, tensor_labels, train_mask, loss_fcn, optimizer)
    # store a plain float (as the GCN training loop does) rather than the loss tensor itself
    losses_logF.append(loss.item())
    f1_tr.append(f1_tr_logF)

    # report the training metrics every 50 epochs
    if (epoch+1)%50 == 0:
        print("Epoch {:04d} | Train Loss {:.4f} | Train precision {:.4%} | Train recall {:.4%} | Train F1 {:.4%}". format(epoch+1, loss.item(), pre_tr_logF, rec_tr_logF, f1_tr_logF))

Epoch 0050 | Train Loss 0.5129 | Train precision 39.8190% | Train recall 90.1304% | Train F1 55.2354%
Epoch 0100 | Train Loss 0.5069 | Train precision 39.1093% | Train recall 89.9441% | Train F1 54.5147%
Epoch 0150 | Train Loss 0.5053 | Train precision 39.8512% | Train recall 89.7579% | Train F1 55.1961%
Epoch 0200 | Train Loss 0.5053 | Train precision 39.8758% | Train recall 89.6648% | Train F1 55.2021%
Epoch 0250 | Train Loss 0.5114 | Train precision 39.7605% | Train recall 89.6648% | Train F1 55.0915%
Epoch 0300 | Train Loss 0.5031 | Train precision 40.1490% | Train recall 90.3166% | Train F1 55.5874%
Epoch 0350 | Train Loss 0.5118 | Train precision 40.3253% | Train recall 90.0372% | Train F1 55.7028%
Epoch 0400 | Train Loss 0.5063 | Train precision 40.1224% | Train recall 91.5270% | Train F1 55.7889%
Epoch 0450 | Train Loss 0.5044 | Train precision 39.5559% | Train recall 89.5717% | Train F1 54.8774%
Epoch 0500 | Train Loss 0.5090 | Train precision 39.7709% | Train recall 90.5028% 

### Graph Convolutional Network

In [16]:
class Linear_GNN(nn.Module):
    """Linear input layer, eleven GraphConv layers (ReLU), then a two-layer linear head.

    The forward pass returns log-probabilities (log_softmax over dimension 1).
    """

    def __init__(self, in_feats: int, out_feats: int, first_layer_size: int, hidden_size: int):
        super().__init__()
        self._in_feats = in_feats
        self._out_feats = out_feats
        self._first_layer_size = first_layer_size
        self._hidden_size = hidden_size

        layer_size = 128
        self.linear = nn.Linear(self._in_feats, self._first_layer_size)
        # gcn1 maps first_layer_size -> layer_size, gcn2..gcn11 map layer_size -> layer_size.
        # The layers are stored under the attribute names gcn1 ... gcn11.
        for idx in range(1, 12):
            in_size = self._first_layer_size if idx == 1 else layer_size
            setattr(self, 'gcn{}'.format(idx),
                    dgl_nn.conv.GraphConv(in_size, layer_size, activation=F.relu))
        self.linear1 = nn.Linear(layer_size, self._hidden_size)
        self.linear2 = nn.Linear(self._hidden_size, self._out_feats)

    def forward(self, graph, feat):
        """Return the per-node log-probabilities for the features `feat` on `graph`."""
        h = F.relu(self.linear(feat))
        # apply gcn1 ... gcn11 in order
        for idx in range(1, 12):
            h = getattr(self, 'gcn{}'.format(idx))(graph, h)
        h = F.relu(self.linear1(h))
        return F.log_softmax(self.linear2(h), dim=1)

In [17]:
# Best Model based on F1 score
# -- model sizes
in_feats = tensor_data_std.shape[1]
first_layer_size = 16
hidden_size = 512
out_feats = 2
# -- optimisation
n_epochs = 1000
learning_rate = 1e-3
weight_decay = 0
# -- class weights of the loss (class 0 weighted by the fraction of class 1 and vice versa)
true_ratio = 1074 / 4802  # <-- fraction of Nominations
weights_loss = torch.FloatTensor([true_ratio, 1 - true_ratio])  # to rebalance classes

In [18]:
# Training Linear GNN model with best hyperparameters
model_GCN = Linear_GNN(in_feats, out_feats, first_layer_size, hidden_size)

loss_fcn = torch.nn.CrossEntropyLoss(weight=weights_loss)
optimizer = torch.optim.Adam(
    model_GCN.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
losses_tr_GNN = []

for epoch in range(n_epochs):
    loss, pre_tr_GNN, rec_tr_GNN, f1_tr_GNN, sup_tr_GNN, acc_tr_GNN = train(
        model_GCN, G, tensor_data_std, tensor_labels, train_mask, loss_fcn, optimizer
    )
    losses_tr_GNN.append(loss.item())

    # only report every 50th epoch
    if (epoch + 1) % 50 != 0:
        continue
    print("Epoch {:04d} | Train Loss {:.4f} | Train precision {:.4%} | Train recall {:.4%} | Train F1 {:.4%}". format(epoch+1, loss.item(), pre_tr_GNN, rec_tr_GNN, f1_tr_GNN))

  'precision', 'predicted', average, warn_for)


Epoch 0050 | Train Loss 0.5921 | Train precision 36.4827% | Train recall 72.6257% | Train F1 48.5679%
Epoch 0100 | Train Loss 0.5817 | Train precision 39.8086% | Train recall 65.8287% | Train F1 49.6140%
Epoch 0150 | Train Loss 0.5883 | Train precision 36.7128% | Train recall 76.5363% | Train F1 49.6227%
Epoch 0200 | Train Loss 0.5802 | Train precision 40.3182% | Train recall 68.4358% | Train F1 50.7421%
Epoch 0250 | Train Loss 0.5579 | Train precision 42.2424% | Train recall 69.4600% | Train F1 52.5352%
Epoch 0300 | Train Loss 0.5490 | Train precision 43.3995% | Train recall 69.1806% | Train F1 53.3381%
Epoch 0350 | Train Loss 0.5770 | Train precision 38.2495% | Train recall 73.6499% | Train F1 50.3501%
Epoch 0400 | Train Loss 0.5460 | Train precision 46.0224% | Train recall 65.1769% | Train F1 53.9499%
Epoch 0450 | Train Loss 0.5400 | Train precision 45.1151% | Train recall 67.5047% | Train F1 54.0843%
Epoch 0500 | Train Loss 0.5400 | Train precision 45.8097% | Train recall 67.6909% 

## Predict new Movies

In [19]:
# Predict the test nodes with the logistic regression
LR_pred = clf.predict(transformed_feat_std.iloc[test_mask].to_numpy())
LR_pred_df = pd.DataFrame(data=LR_pred, index=nodes_name.iloc[test_mask,0], columns=['Nominated'])
# Assign the relabelled column back instead of calling replace(inplace=True) on an attribute
# access: that chained in-place form does not update the frame under pandas Copy-on-Write.
LR_pred_df['Nominated'] = LR_pred_df['Nominated'].replace([1, 0], ['yes', 'no'])
LR_pred_df

Unnamed: 0_level_0,Nominated
title,Unnamed: 1_level_1
Joker,yes
1917,yes
Once Upon a Time... in Hollywood,yes
The Irishman,yes
Parasite,yes
Jojo Rabbit,no
Little Women,yes
Mariage Story,yes


In [20]:
# Switch to evaluation mode before predicting: train() leaves the model in training mode,
# where the dropout layer randomly zeroes input features and makes predictions stochastic.
model_GF_LR.eval()
with torch.no_grad():  # inference only, no gradient tracking needed
    GF_LR_pred = model_GF_LR(G, tensor_data_std)[test_mask].argmax(dim=1)
GF_LR_pred_df = pd.DataFrame(data=GF_LR_pred.numpy(), index=nodes_name.iloc[test_mask,0], columns=['Nominated'])
# Assign the relabelled column back instead of calling replace(inplace=True) on an attribute
# access: that chained in-place form does not update the frame under pandas Copy-on-Write.
GF_LR_pred_df['Nominated'] = GF_LR_pred_df['Nominated'].replace([1, 0], ['yes', 'no'])
GF_LR_pred_df

Unnamed: 0_level_0,Nominated
title,Unnamed: 1_level_1
Joker,yes
1917,yes
Once Upon a Time... in Hollywood,yes
The Irishman,yes
Parasite,yes
Jojo Rabbit,yes
Little Women,yes
Mariage Story,yes


In [21]:
# Predict in evaluation mode and without gradient tracking (inference only)
model_GCN.eval()
with torch.no_grad():
    GCN_pred = model_GCN(G, tensor_data_std)[test_mask].argmax(dim=1)
GCN_pred_df = pd.DataFrame(data=GCN_pred.numpy(), index=nodes_name.iloc[test_mask,0], columns=['Nominated'])
# Assign the relabelled column back instead of calling replace(inplace=True) on an attribute
# access: that chained in-place form does not update the frame under pandas Copy-on-Write.
GCN_pred_df['Nominated'] = GCN_pred_df['Nominated'].replace([1, 0], ['yes', 'no'])
GCN_pred_df

Unnamed: 0_level_0,Nominated
title,Unnamed: 1_level_1
Joker,no
1917,yes
Once Upon a Time... in Hollywood,yes
The Irishman,yes
Parasite,yes
Jojo Rabbit,yes
Little Women,yes
Mariage Story,yes
