### Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import copy

import csv
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os

# Functions

### Loading Data

Implementation of the function that allows to load the data in the csv files:

In [2]:
def load_data_matrix(path, room_size, padding, No_choice):
    """
    Returns two tensors, that contains the inputs and the outputs.
    The matrix contains 1 for every available seat, and 0 otherwise. 
    
    path: A string that contains the name of the file
    room_size: the size of one side the room (the biggest possible if various size)
    padding: the padding needed if various size
    No_choice: True if there is the possible to choose no seat
    """
    
    # Opening file:
    csvfile = open(path, "r")
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    # Initialization of arrays:
    inputs=[]
    outputs=[]

    # Taking the header and the index of useful columns:
    header = next(reader) 
    
    ind_ncols = header.index('Cond_NCol')
    ind_nrows = header.index('Cond_NRows')
    ind_row_available = header.index('RowNumber_Avail')
    ind_col_available = header.index('ColNumber_Avail')
    ind_chosen = header.index('Chosen')
    
    
    previous, nb_inputs = (-1,-1,-1), 0
    for row in reader:
        if previous != (int(row[0]),int(row[1]),int(row[2])):
            nb_inputs += 1
            previous = (int(row[0]),int(row[1]),int(row[2]))
            
            new_input = [[0 for _ in range(int(row[ind_ncols]))] for _ in range(int(row[ind_nrows]))]
            inputs.append(new_input)
            
            new_output = [[0 for _ in range(int(row[ind_ncols]))] for _ in range(int(row[ind_nrows]))]
            outputs.append(new_output)
                        
        inputs[-1][int(row[ind_row_available])-1][int(row[ind_col_available])-1] = 1
        
        # Update the output with the chosen place, except if it's on (0,0), which signify no place chosen:
        if(int(row[ind_chosen])==1) and int(row[ind_row_available])!= 0:
            outputs[-1][int(row[ind_row_available])-1][int(row[ind_col_available])-1] = 1

    print("Load", nb_inputs, "examples as matrices \n")
    
    torch_inputs = []
    torch_outputs = []

    if padding != 0:
        
        pad1 = nn.ZeroPad2d((padding,padding,padding,padding))
        pad2 = nn.ZeroPad2d((padding,padding,0,0))
        pad3 = nn.ZeroPad2d((0,0,padding,padding))
        
        old_room_size = room_size - 2*padding

        for i in range(len(inputs)):
            # Applying paddings in function of the room size :
            
            if len(inputs[i]) == old_room_size:
                if len(inputs[i][0]) == old_room_size: # Padding on both directions
                    torch_inputs.append(pad1(torch.Tensor(inputs[i])).view(room_size,room_size,1))
                    torch_outputs.append(pad1(torch.Tensor(outputs[i])).view(-1))   
                    
                else: # Padding only on the rows
                    torch_inputs.append(pad3(torch.Tensor(inputs[i])).view(room_size,room_size,1))
                    torch_outputs.append(pad3(torch.Tensor(outputs[i])).view(-1))     
                    
            elif len(inputs[i][0]) == old_room_size: # Padding only on the columns
                torch_inputs.append(pad2(torch.Tensor(inputs[i])).view(room_size,room_size,1))
                torch_outputs.append(pad2(torch.Tensor(outputs[i])).view(-1))     
                
            else: # No padding
                torch_inputs.append(torch.Tensor(inputs[i]).view(room_size,room_size,1))
                torch_outputs.append(torch.Tensor(outputs[i]).view(-1))
                
        torch_inputs, torch_outputs = torch.stack(torch_inputs), torch.stack(torch_outputs)
    
    else:
        torch_inputs = torch.Tensor(inputs).view(nb_inputs,int(room_size),int(room_size),1)
        torch_outputs = torch.Tensor(outputs).view(nb_inputs,room_size*room_size)
        
    if No_choice:
        no_choice_output = torch.Tensor([1 if torch.sum(o)== 0 else 0 for o in torch_outputs]).view(-1,1)
        torch_outputs = torch.cat((torch_outputs, no_choice_output), dim=1)
    
    return torch_inputs, torch_outputs

### Implementation of CNN :

The architecture that we choose to implement is a CNN, with convolutionnal layers and a fully connected layer at the end.
We also apply a mask after the linear layer, in order to do a prediction only on the available seats.

For the convolutionnal layers, we use 3x3 kernels with padding of 1 to keep the same dimension. 
The Number of layers and the number of channels (that are the same on each layers) are hyperparameters that can be changed. 

In [3]:
class CNN(nn.Module):
    def __init__(self, room_size, nb_channels, nb_conv_layers, No_choice):
        
        super(CNN, self).__init__()
        
        self.room_size = room_size
        self.nb_channels = nb_channels
        self.nb_conv_layers = nb_conv_layers
        
        self.output_size = self.room_size*self.room_size
        self.No_choice = No_choice
        
        if No_choice:
            self.output_size = self.room_size*self.room_size+1
        
        self.conv_layers = nn.ModuleList([nn.Sequential(nn.Conv2d(in_channels=1, out_channels=self.nb_channels, 
                                                                 kernel_size=3, stride=1, padding=1),
                                                       nn.BatchNorm2d(self.nb_channels))])
        
        if self.nb_conv_layers > 1:
            self.conv_layers.extend([nn.Sequential(nn.Conv2d(in_channels=self.nb_channels, out_channels=self.nb_channels, 
                                                                 kernel_size=3, stride=1, padding=1),
                                                   nn.BatchNorm2d(self.nb_channels))
                                     for i in range(self.nb_conv_layers-1)])
            
        self.fc1 = nn.Linear(self.nb_channels*self.room_size*self.room_size, self.output_size)
        
    def forward(self, input, mask):
        x = input.transpose(1,3)
        for l in self.conv_layers:
            x = l(x)
        x = self.fc1(x.view(-1,self.nb_channels*self.room_size*self.room_size))
        x = x * mask
        return x
    
    def predict(self, x):
        predictions = [] 
        with torch.no_grad():
            for inputs in x:
                inputs = inputs.to(device)
                
                masks = inputs.view(-1,self.room_size*self.room_size)
                if self.No_choice:
                    masks = torch.cat((masks, torch.ones(masks.shape[0],1).to(device)), dim=1)
                masks = masks.to(device)
                
                outputs = self(inputs, masks)
                predictions.extend(outputs)
        return np.asarray(predictions).reshape(-1)

    def evaluate(self, loader):
        with torch.no_grad():
            correct1, correct5 = 0, 0
            for inputs, labels in loader:
                inputs, labels = inputs.to(device), labels.to(device)
                
                masks = inputs.view(-1,self.room_size*self.room_size)
                if self.No_choice:
                    masks = torch.cat((masks, torch.ones(masks.shape[0],1).to(device)), dim=1)
                masks = masks.to(device)
                
                output = self(inputs, masks)
                labels = torch.max(labels, 1)[1]
                top5 = torch.sort(output, dim = 1, descending = True)[1][:,0:4]
                ci1 = 0
                ci5 = 0
                for i in range(labels.shape[0]):
                    if labels[i] == top5[i,0]:
                        ci1 += 1
                    if labels[i] in top5[i]:
                        ci5 += 1
                correct1 += ci1/labels.shape[0]
                correct5 += ci5/labels.shape[0]
            return correct1/len(loader), correct5/len(loader)
        
    def train(self, train_set, valid_set, 
              patience = 10, max_it = 10000, verbose = True):
        
        counter, train_loss, val_loss = 0, None, None
        best_val_acc5, best_val_acc1 = -1, -1
        history = [-1, -1, -1]
        
        if verbose:
            print("{:5s} | {:10s} | {:5s} | {:5s}".format(
                "epoch", "train_loss", "top1", "top5"))
        for epoch in range(max_it):
            running_loss = 0
            # early stopping
            counter += 1
            if counter > patience - 1:
                break
            for i, data in enumerate(train_set, 1):
                # get the inputs
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                index_labels = torch.max(labels, 1)[1].to(device)
                
                inputs, labels = inputs.to(device), labels.to(device)
                
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward + backward + optimize
                
                masks = inputs.view(-1,self.room_size*self.room_size)
                if self.No_choice:
                    masks = torch.cat((masks, torch.ones(masks.shape[0],1).to(device)), dim=1)
                masks = masks.to(device)                
                outputs = self.forward(inputs, masks)

                train_loss = loss_fn(outputs, index_labels)
                running_loss += train_loss.item()
                train_loss.backward()
                optimizer.step()
                
            val_acc1, val_acc5 = self.evaluate(valid_set)
            #train_acc5 = self.evaluate(train_set)[1]
            if verbose:
                print(
                    "{:5d} | {:10.5f} | {:5.2%} | {:5.2%}".format(
                        epoch, running_loss / len(trainloader_mat),
                        val_acc1, val_acc5),
                    end="")
            if val_acc5 > best_val_acc5:
                counter = 0
                best_val_acc5 = val_acc5
                torch.save(self.state_dict(), "best_model")
                history[0] = running_loss / len(train_set)
                history[1] = val_acc1
                history[2] = val_acc5
                if verbose:
                    print("\tsaved!", end="")
            if verbose:
                print("")
        self.load_state_dict(torch.load("best_model"))
        return history

#### Implementation of a function that allows to treat the case when a pair of seats has to be predicted:

In [4]:
def keep_left_seat(inputs,outputs,room_size,No_choice):
    update_inputs = copy.deepcopy(inputs.view(-1,room_size,room_size))
    
    if No_choice:
        nc = outputs[:,-1].view(-1,1)
        update_outputs = copy.deepcopy(outputs[:,:-1].view(-1,room_size,room_size))
    else:
        update_outputs = copy.deepcopy(outputs.view(-1,room_size,room_size))
        
    update_inputs = update_inputs * torch.cat((update_inputs[:,:,1:], torch.zeros(update_inputs.shape[0],room_size,1)),2)
    
    update_outputs = update_outputs * torch.cat((update_outputs[:,:,1:], torch.zeros(update_outputs.shape[0],room_size,1)),2)

    update_inputs = update_inputs.view(-1,room_size,room_size,1)
    update_outputs = update_outputs.view(-1,room_size*room_size)
    
    if No_choice:
        update_outputs = torch.cat((update_outputs, nc),1)
    
    return update_inputs, update_outputs

# Study of datasets

### Informations of the data

For each file, we need its name, and also, the size of the room, the necessary padding, and we also need to know if there's a no-choice option and if it's pairs of seats.

To study one file, we just have to choose the corresponding index on the array.

In [5]:
path = "Studies ALL - ML and ANALYSIS/"

files_train = ["PS_ConcertData_Study2_FlatFile_INSAMPLE.csv", 
               "PS_Movie_Singles_Study4_CF_24ch_ForcedChoices_INSAMPLE_FlatFileForML.csv", 
               "PS_Movie_Singles_Study4_CF_24ch_withNonChoice_INSAMPLE_FlatFileForML.csv",
               "PS_Movie_Singles_Study4_NCF_24ch_ForcedChoices_INSAMPLE_FlatFileForML.csv",
               "PS_Movie_Singles_Study4_NCF_24ch_withNonChoice_INSAMPLE_FlatFileForML.csv",
               "PS_Concert_Couple_Study3_FC_INSAMPLE_FlatFileForML.csv", 
               "PS_Concert_Couple_Study3_NC_INSAMPLE_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_FC_3032ch_INSAMPLE_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_NC_3032ch_INSAMPLE_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_NC_3032ch_INSAMPLE_75density_FlatFileForML.csv",
              ]

files_valid = ["PS_ConcertData_Study2_FlatFile_HOLDOUT.csv",
               "PS_Movie_Singles_Study4_CF_24ch_ForcedChoices_HOLDOUT_FlatFileForML.csv", 
               "PS_Movie_Singles_Study4_CF_24ch_withNonChoice_HOLDOUT_FlatFileForML.csv",
               "PS_Movie_Singles_Study4_NCF_24ch_ForcedChoices_HOLDOUT_FlatFileForML.csv",
               "PS_Movie_Singles_Study4_NCF_24ch_withNonChoice_HOLDOUT_FlatFileForML.csv",
               "PS_Concert_Couple_Study3_FC_HOLDOUT_FlatFileForML.csv", 
               "PS_Concert_Couple_Study3_NC_HOLDOUT_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_FC_3032ch_HOLDOUT_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_NC_3032ch_HOLDOUT_FlatFileForML.csv",
               "PS_Movie_Couple_Study5_NC_3032ch_HOLDOUT_75density_FlatFileForML.csv"
              ]

room_size_list = [20, 12, 12, 12, 12, 20, 20, 12, 12, 12]

padding_list = [5, 0, 0, 0, 0, 4, 4, 0, 0, 0]

no_choice_list = [False, False, True, False, True, False, True, False, True, True]

couple_list = [False, False, False, False, False, True, True, True, True, True]

### Hyperparameters

We can change here the file to study, and also the hyperparameters of the CNN.

In [6]:
# Index of the file to study:
ind_file = 9

print("INSAMPLE : ", files_train[ind_file])
print("HOLDOUT : ", files_valid[ind_file])
print("ROOM SIZE : ", room_size_list[ind_file])
print("PADDING : ", padding_list[ind_file])
print("NO CHOICE OPTION : ", no_choice_list[ind_file])
print("PAIRS OF SEATS : ", couple_list[ind_file])


# Number of convolutional layers:
nb_channels = 1

# Number of channels for each convolutional layers:
nb_conv_layers = 1

# Batch size for the Neural Network:
batch_size = 32

# Learning rate for ADAM optimizer:
lr_opt = 1e-4

INSAMPLE :  PS_Movie_Couple_Study5_NC_3032ch_INSAMPLE_75density_FlatFileForML.csv
HOLDOUT :  PS_Movie_Couple_Study5_NC_3032ch_HOLDOUT_75density_FlatFileForML.csv
ROOM SIZE :  12
PADDING :  0
NO CHOICE OPTION :  True
PAIRS OF SEATS :  True


### Data loading

In [7]:
print("INSAMPLE : ", files_train[ind_file])

x_train_mat, y_train_mat = load_data_matrix(path = path+files_train[ind_file], 
                                            room_size = room_size_list[ind_file], 
                                            padding = padding_list[ind_file],
                                            No_choice = no_choice_list[ind_file])

print("HOLDOUT : ", files_valid[ind_file])
x_valid_mat, y_valid_mat = load_data_matrix(path=path+files_valid[ind_file], 
                                            room_size = room_size_list[ind_file], 
                                            padding = padding_list[ind_file],
                                            No_choice = no_choice_list[ind_file])
if couple_list[ind_file]:
    x_train_mat, y_train_mat = keep_left_seat(x_train_mat, y_train_mat,
                                              room_size = room_size_list[ind_file],
                                              No_choice = no_choice_list[ind_file])
    x_valid_mat, y_valid_mat = keep_left_seat(x_valid_mat, y_valid_mat,
                                              room_size = room_size_list[ind_file],
                                              No_choice = no_choice_list[ind_file])

train_mat = torch.utils.data.TensorDataset(x_train_mat, y_train_mat)
trainloader_mat = torch.utils.data.DataLoader(train_mat, batch_size=batch_size, shuffle=True)

valid_mat = torch.utils.data.TensorDataset(x_valid_mat, y_valid_mat)
validloader_mat = torch.utils.data.DataLoader(valid_mat, batch_size=batch_size, shuffle=False)

INSAMPLE :  PS_Movie_Couple_Study5_NC_3032ch_INSAMPLE_75density_FlatFileForML.csv
Load 3004 examples as matrices 

HOLDOUT :  PS_Movie_Couple_Study5_NC_3032ch_HOLDOUT_75density_FlatFileForML.csv
Load 699 examples as matrices 



In [8]:
model = CNN(room_size = room_size_list[ind_file], 
            nb_channels = nb_channels, 
            nb_conv_layers = nb_conv_layers,
            No_choice = no_choice_list[ind_file])

optimizer = torch.optim.Adam(model.parameters(), lr=lr_opt)
loss_fn = nn.CrossEntropyLoss()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

history_cnn = model.train(trainloader_mat, validloader_mat)

epoch | train_loss | top1  | top5 
    0 |    4.96272 | 11.33% | 42.33%	saved!
    1 |    4.91753 | 11.89% | 43.64%	saved!
    2 |    4.87172 | 12.46% | 44.21%	saved!
    3 |    4.82420 | 13.06% | 45.77%	saved!
    4 |    4.77370 | 12.92% | 47.67%	saved!
    5 |    4.72091 | 13.94% | 48.64%	saved!
    6 |    4.66422 | 15.07% | 49.49%	saved!
    7 |    4.60537 | 15.36% | 51.08%	saved!
    8 |    4.54292 | 16.52% | 51.79%	saved!
    9 |    4.47745 | 17.23% | 52.76%	saved!
   10 |    4.40779 | 17.09% | 54.60%	saved!
   11 |    4.33503 | 17.37% | 55.03%	saved!
   12 |    4.25802 | 18.22% | 56.33%	saved!
   13 |    4.17479 | 18.79% | 56.62%	saved!
   14 |    4.08567 | 19.50% | 57.61%	saved!
   15 |    3.98881 | 20.24% | 57.67%	saved!
   16 |    3.88205 | 20.81% | 58.94%	saved!
   17 |    3.76496 | 20.98% | 60.11%	saved!
   18 |    3.63875 | 20.55% | 61.55%	saved!
   19 |    3.50325 | 22.09% | 63.59%	saved!
   20 |    3.35959 | 22.40% | 65.01%	saved!
   21 |    3.21267 | 22.82% | 65.18%	save

### Training :

# Results

In [9]:
top1_train, top5_train= model.evaluate(trainloader_mat)
top1_valid, top5_valid = model.evaluate(validloader_mat)

print("Top 1 prediction for train set (in %): ", top1_train)
print("Top 5 prediction for train set (in %): ", top5_train)

print("\nTop 1 prediction for valid set (in %): ", top1_valid)
print("Top 5 prediction for valid set (in %): ", top5_valid)

Top 1 prediction for train set (in %):  0.4178381458966565
Top 5 prediction for train set (in %):  0.9244395896656535

Top 1 prediction for valid set (in %):  0.25547138047138046
Top 5 prediction for valid set (in %):  0.70864898989899
