In [None]:
%pdb

In [2]:
from __future__ import print_function
from __future__ import division
import torch
import sys
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import copy
from torch import autograd
from torch.autograd import Variable
import scipy.misc
from sklearn.metrics import confusion_matrix
from torch.utils.data import Dataset, DataLoader
from torchvision.utils import make_grid
import pickle as pkl
print("PyTorch Version: ",torch.__version__)
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy

PyTorch Version:  1.5.1


In [3]:
class MatrixFactorization(pl.LightningModule):

    def __init__(self,total_q,total_s,K=100, lr=1e-3):
        super(MatrixFactorization, self).__init__()
        self.K = K
        self.lr = lr
        self.Q = torch.nn.Embedding(total_q,self.K)
        self.U = torch.nn.Embedding(total_s,self.K)
    
    def get_qvector(self,questions):
        ans = self.Q(torch.LongTensor(questions))
        return ans
    
    def get_uvector(self,users):
        ans = self.U(torch.LongTensor(users))
        return ans

    def get_score(self,qvectors,uvectors):
        q_unsq = torch.unsqueeze(qvectors, 1)
        u_unsq = torch.unsqueeze(uvectors, 2)
        score = torch.bmm(q_unsq,u_unsq)
        score = torch.squeeze(score)
        return score
    
    def forward(self, questions,users):
        return self.get_score( self.get_qvector(questions), self.get_uvector(users) )

    def training_step(self, batch, batch_nb):
        questions, users, answers = batch
        criterion = nn.MSELoss()
        loss = criterion(self(questions,users), answers)
        return loss
        
    def validation_step(self, batch, batch_idx):
        questions, users, answers = batch
        scores = self(questions,users)
        criterion = nn.MSELoss()
        loss = criterion(scores, answers)
        preds = torch.sign(scores)
        acc = accuracy(preds, answers)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        # Here we just reuse the validation_step for testing
        return self.validation_step(batch, batch_idx)
    
    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), self.lr)

In [4]:
class Question_Ans(Dataset):
    def __init__(self, df, mode='train'):
        self.df = df
        self.questionid = self.df['QuestionId'].values
        self.userid = self.df['UserId'].values
        self.ans = self.df['IsCorrect'].values
        
        self.ans = 2*self.ans - 1
        self.length=len(self.ans)
        
        if(mode=='train'):
            start=int(0*self.length)
            end=int(0.8*self.length)
        elif(mode=='val'):
            start=int(0.8*self.length)
            end=int(1*self.length)
        else:
            start = 0
            end = self.length
            
        self.questionid = self.questionid[start:end]
        self.userid = self.userid[start:end]
        self.ans = self.ans[start:end]
        self.length=len(self.ans)
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        qid = self.questionid[idx]
        uid = self.userid[idx]
        ans = self.ans[idx]
        return qid,uid,ans

In [5]:
data_file = "../Data/data/train_data/train_task_1_2.csv"
batch_size = 32
lr = 0.0001
num_epochs = 5
K = 100
total_q = 28000
total_s = 119000

In [6]:
df = pd.read_csv(data_file)

In [7]:
train_dataset = Question_Ans(df,mode='train')
val_dataset = Question_Ans(df,mode='val')

train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset,batch_size=batch_size, shuffle=False)

In [8]:
# Init our model
my_model = MatrixFactorization(total_q,total_s,K,lr)

# Initialize a trainer
# trainer = pl.Trainer(gpus=None, max_epochs=num_epochs, progress_bar_refresh_rate=20, default_root_dir='./weights/pytl_grad_descent/')
trainer = pl.Trainer(gpus=None, max_epochs=num_epochs, progress_bar_refresh_rate=20)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


In [None]:
# Train the model ⚡
trainer.fit(my_model, train_dataloader, val_dataloader)


  | Name | Type      | Params
-----------------------------------
0 | Q    | Embedding | 2.8 M 
1 | U    | Embedding | 11.9 M
-----------------------------------
14.7 M    Trainable params
0         Non-trainable params
14.7 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

RuntimeError: index -1 is out of bounds for dimension 0 with size 3

> [0;32m/Users/ayushd/opt/anaconda3/envs/pytl/lib/python3.7/site-packages/pytorch_lightning/metrics/functional/classification.py[0m(198)[0;36mstat_scores_multiple_classes[0;34m()[0m
[0;32m    196 [0;31m        [0mmatch_false[0m [0;34m=[0m [0;36m1[0m [0;34m-[0m [0mmatch_true[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    197 [0;31m[0;34m[0m[0m
[0m[0;32m--> 198 [0;31m        [0mtps[0m[0;34m.[0m[0mscatter_add_[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0mpred[0m[0;34m,[0m [0mmatch_true[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    199 [0;31m        [0mfps[0m[0;34m.[0m[0mscatter_add_[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0mpred[0m[0;34m,[0m [0mmatch_false[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    200 [0;31m        [0mfns[0m[0;34m.[0m[0mscatter_add_[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0mtarget[0m[0;34m,[0m [0mmatch_false[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
