In [114]:
import numpy as np
from scipy.sparse import rand as sprand
import torch
import pandas as pd

In [115]:
base_dir = '../Data/'

input_dir = base_dir + "data/train_data"
train_data = base_dir + "data/train_data/train_task_1_2.csv"
test_data_1 = base_dir + "data/test_data/test_public_answers_task_1.csv"

In [116]:
df = pd.read_csv(train_data)

In [117]:
rows = df['UserId'].values
cols = df['QuestionId'].values
data = df['IsCorrect'].values

In [118]:
# Make up some random explicit feedback ratings
# and convert to a numpy array
n_users = 120000
n_items = 28000

In [119]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        return torch.sum(self.user_factors(user) * self.item_factors(item))

In [120]:
model = MatrixFactorization(n_users, n_items, n_factors=20)

In [121]:
loss_func = torch.nn.MSELoss()

In [122]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # learning rate

In [123]:
print(len(rows))

15867850


In [124]:
n_epochs = 10

In [125]:
# rows, cols = ratings.nonzero()
p = np.random.permutation(len(rows))
rows, cols, data = rows[p], cols[p], data[p]
train_on = 10000
rows, cols, data = rows[:train_on], cols[:train_on], data[:train_on]

for i in range(n_epochs):
    epoch_loss = 0
    for row, col, d in zip(*(rows,cols,data)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Turn data into tensors
        ans = torch.FloatTensor([d])
        row = torch.LongTensor([row])
        col = torch.LongTensor([col])

        # Predict and calculate loss
        prediction = model(row, col)
    #     print(prediction)
        loss = loss_func(prediction, ans)
        epoch_loss += loss

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()
    print(epoch_loss)

tensor(206834.9531, grad_fn=<AddBackward0>)
tensor(171390.9844, grad_fn=<AddBackward0>)
tensor(142705.9219, grad_fn=<AddBackward0>)
tensor(119313.7812, grad_fn=<AddBackward0>)
tensor(100118.2891, grad_fn=<AddBackward0>)
tensor(84282.6328, grad_fn=<AddBackward0>)
tensor(71159.1719, grad_fn=<AddBackward0>)
tensor(60239.6016, grad_fn=<AddBackward0>)
tensor(51121.8398, grad_fn=<AddBackward0>)
tensor(43483.8125, grad_fn=<AddBackward0>)


In [126]:
df_test = pd.read_csv(test_data_1)

In [143]:
rows = df_test['UserId'].values
cols = df_test['QuestionId'].values
data = df_test['IsCorrect'].values

In [144]:
print(model.user_factors)
print(model.item_factors)

print(model.user_factors(torch.LongTensor([1.0])))
print(model.user_factors(torch.LongTensor([5518.0])))
print(model.user_factors(torch.LongTensor([1.0])) * model.user_factors(torch.LongTensor([5518.0])))

torch.sum(model.user_factors(torch.LongTensor([1.0])) * model.user_factors(torch.LongTensor([5518.0])))

Embedding(120000, 20, sparse=True)
Embedding(28000, 20, sparse=True)
tensor([[ 0.0966, -0.4210,  0.2270,  0.0751, -0.8795,  0.5202, -0.5232,  0.7825,
         -1.7775,  0.9210,  0.4521, -0.0296, -1.6099, -0.7808, -0.0815,  1.3386,
          1.3659, -0.3038,  0.9441, -1.9193]], grad_fn=<EmbeddingBackward>)
tensor([[ 0.9906,  0.4780, -2.9694, -0.5586,  1.8096, -0.5160, -0.4232,  0.4326,
          0.7676,  1.2132,  1.2593,  0.3399, -0.7448, -0.3514,  0.0400, -0.1369,
         -0.0539,  0.2454,  0.2397, -0.4897]], grad_fn=<EmbeddingBackward>)
tensor([[ 0.0957, -0.2012, -0.6739, -0.0420, -1.5916, -0.2684,  0.2214,  0.3385,
         -1.3644,  1.1173,  0.5693, -0.0100,  1.1990,  0.2744, -0.0033, -0.1832,
         -0.0736, -0.0745,  0.2263,  0.9399]], grad_fn=<MulBackward0>)


tensor(0.4958, grad_fn=<SumBackward0>)

In [145]:
len(rows)

1983481

In [146]:
test_on = 1000000
rows, cols, data = rows[:test_on], cols[:test_on], data[:test_on]

In [147]:
conf_matrix = [[0,0],[0,0]]

for row, col, d in zip(*(rows,cols,data)):
    # Set gradients to zero
    optimizer.zero_grad()
    
    # Turn data into tensors
    ans = torch.FloatTensor([d])
    row = torch.LongTensor([row])
    col = torch.LongTensor([col])

    # Predict and calculate loss
#     print(ans,row,col)
    prediction = model(row, col)
#     print(prediction)
    if(prediction > 0.5) :
        pred = 1
    else :
        pred = 0
    
#     print(ans)
    conf_matrix[pred][int(ans)] += 1

    # Backpropagate
#     loss.backward()

    # Update the parameters
#     optimizer.step()

In [148]:
conf_matrix

[[195027, 351397], [161612, 291964]]