In [1]:
import numpy as np
from scipy.sparse import rand as sprand
import torch
import pandas as pd
import pickle as pkl

In [2]:
base_dir = '../Data/'

input_dir = base_dir + "data/train_data"
train_data = base_dir + "data/train_data/train_task_1_2.csv"
test_data_1 = base_dir + "data/test_data/test_public_answers_task_1.csv"

In [3]:
df = pd.read_csv(train_data)

In [4]:
rows = df['UserId'].values
cols = df['QuestionId'].values
data = df['IsCorrect'].values

In [3]:
# Make up some random explicit feedback ratings
# and convert to a numpy array
n_users = 120000
n_items = 28000

In [4]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        return torch.sum(self.user_factors(user) * self.item_factors(item))

In [5]:
model = MatrixFactorization(n_users, n_items, n_factors=20)

In [8]:
loss_func = torch.nn.MSELoss()

In [24]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # learning rate

In [123]:
print(len(rows))

15867850


In [124]:
n_epochs = 10

In [125]:
# rows, cols = ratings.nonzero()
p = np.random.permutation(len(rows))
rows, cols, data = rows[p], cols[p], data[p]
train_on = 10000
rows, cols, data = rows[:train_on], cols[:train_on], data[:train_on]

for i in range(n_epochs):
    epoch_loss = 0
    for row, col, d in zip(*(rows,cols,data)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Turn data into tensors
        ans = torch.FloatTensor([d])
        row = torch.LongTensor([row])
        col = torch.LongTensor([col])

        # Predict and calculate loss
        prediction = model(row, col)
    #     print(prediction)
        loss = loss_func(prediction, ans)
        epoch_loss += loss

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()
    print(epoch_loss)

tensor(206834.9531, grad_fn=<AddBackward0>)
tensor(171390.9844, grad_fn=<AddBackward0>)
tensor(142705.9219, grad_fn=<AddBackward0>)
tensor(119313.7812, grad_fn=<AddBackward0>)
tensor(100118.2891, grad_fn=<AddBackward0>)
tensor(84282.6328, grad_fn=<AddBackward0>)
tensor(71159.1719, grad_fn=<AddBackward0>)
tensor(60239.6016, grad_fn=<AddBackward0>)
tensor(51121.8398, grad_fn=<AddBackward0>)
tensor(43483.8125, grad_fn=<AddBackward0>)


In [6]:
df_test = pd.read_csv(test_data_1)

In [7]:
rows = df_test['UserId'].values
cols = df_test['QuestionId'].values
data = df_test['IsCorrect'].values

In [8]:
print(model.user_factors)
print(model.item_factors)

print(model.user_factors(torch.LongTensor([1.0])))
print(model.user_factors(torch.LongTensor([5518.0])))
print(model.user_factors(torch.LongTensor([1.0])) * model.user_factors(torch.LongTensor([5518.0])))

torch.sum(model.user_factors(torch.LongTensor([1.0])) * model.user_factors(torch.LongTensor([5518.0])))

Embedding(120000, 20, sparse=True)
Embedding(28000, 20, sparse=True)
tensor([[-1.9246,  0.9071, -0.1746,  0.3423, -0.4570,  1.0863,  0.0396, -0.0787,
         -0.3062,  2.0754, -0.4159, -0.5939,  1.5841,  0.3556, -0.6180,  0.8134,
         -0.5118,  1.3355, -1.1820, -0.1426]], grad_fn=<EmbeddingBackward>)
tensor([[ 1.1567, -2.7684, -0.6423, -0.9644,  0.5254,  1.0724,  0.3142,  2.4482,
         -0.3979,  1.1756,  0.4415, -0.4369, -0.9037, -1.3411, -2.2563,  2.2586,
         -0.7399, -0.0851,  0.6647, -0.2716]], grad_fn=<EmbeddingBackward>)
tensor([[-2.2261, -2.5111,  0.1121, -0.3301, -0.2401,  1.1649,  0.0125, -0.1926,
          0.1218,  2.4399, -0.1836,  0.2595, -1.4316, -0.4769,  1.3944,  1.8372,
          0.3787, -0.1137, -0.7857,  0.0387]], grad_fn=<MulBackward0>)


tensor(-0.7318, grad_fn=<SumBackward0>)

In [9]:
q_path = "../weights/simple_fact_q_best_cpu.pkl"

with open(q_path,'rb') as f:
    model.item_factors = pkl.load(f)

In [10]:
u_path = "../weights/simple_fact_u_best_cpu.pkl"

with open(u_path,'rb') as f:
    model.user_factors = pkl.load(f)

In [11]:
print(model.item_factors.num_embeddings)
print(model.user_factors.num_embeddings)

28000
119000


In [12]:
len(rows)

1983481

In [13]:
test_on = 100000
rows, cols, data = rows[:test_on], cols[:test_on], data[:test_on]

In [21]:
conf_matrix = [[0,0],[0,0]]

for row, col, d in zip(*(rows,cols,data)):
    
    # Turn data into tensors
    ans = torch.FloatTensor([d])
    row = torch.LongTensor([row])
    col = torch.LongTensor([col])

    # Predict and calculate loss
#     print(ans,row,col)
    prediction = model(row, col)
#     print(prediction)
    if(prediction > 0) :
        pred = 1
    else :
        pred = 0
    
#     print(ans)
    conf_matrix[pred][int(ans)] += 1

    # Backpropagate
#     loss.backward()

    # Update the parameters
#     optimizer.step()

In [22]:
conf_matrix

[[17837, 32292], [17593, 32278]]