In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from utils import build_dense_graph

In [3]:
# Root directory holding the train/test CSV files.
DATA_PATH = '/opt/ml/input/data/'

# Explicit narrow integer dtypes for the id / label / tag columns.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Load each split, parse Timestamp as datetime, and order rows by user and
# then by time; the index is rebuilt so it matches the sorted order.
train = (
    pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)
test = (
    pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [13]:
# Encode KnowledgeTag into contiguous integer ids (KTag) using ONE vocabulary
# shared by train and test. Factorizing each frame separately assigns codes by
# rank within that frame, so the same KnowledgeTag can end up with different
# KTag values whenever the two frames do not contain exactly the same tags.
all_tags = pd.concat([train['KnowledgeTag'], test['KnowledgeTag']], ignore_index=True)
tag_codes, _ = pd.factorize(all_tags, sort=True)

# Rows were concatenated train-first, so split the codes back by length.
train['KTag'] = tag_codes[:len(train)]
test['KTag'] = tag_codes[len(train):]

In [16]:
# Inspect the rows whose encoded tag id is 0.
train[train['KTag'].eq(0)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,KTag
464,0,A080078003,A080000078,0,2020-08-29 00:25:36,23,0
465,0,A080078004,A080000078,0,2020-08-29 00:25:40,23,0
466,0,A080078005,A080000078,0,2020-08-29 00:26:05,23,0
469,0,A080078008,A080000078,0,2020-08-29 00:26:11,23,0
2364,5,A080077003,A080000077,1,2020-08-21 19:52:36,23,0
...,...,...,...,...,...,...,...
2262760,7215,A080077007,A080000077,1,2020-08-31 06:31:40,23,0
2263669,7266,A080078008,A080000078,0,2020-08-12 06:01:36,23,0
2263670,7266,A080078003,A080000078,0,2020-08-12 06:06:06,23,0
2263672,7266,A080078004,A080000078,0,2020-08-12 06:08:09,23,0


In [19]:
# Right-pad variable-length sequences with -1 up to the longest one;
# batch_first=True puts the batch dimension first in the result.
pad_sequence(
    [
        torch.tensor([1, 2], dtype=torch.long),
        torch.tensor([1, 2, 3], dtype=torch.long),
    ],
    batch_first=True,
    padding_value=-1,
)

tensor([[ 1,  2, -1],
        [ 1,  2,  3]])

In [2]:
# 3x3 identity matrix (ones on the diagonal, zeros elsewhere).
torch.eye(3, 3)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [7]:
# Concatenate two 2x2 tensors along dim=1 (columns), giving a 2x4 tensor.
torch.cat(
    [
        torch.tensor([[1, 2], [3, 4]]),
        torch.tensor([[1, 2], [3, 4]]),
    ],
    dim=1,
)

tensor([[1, 2, 1, 2],
        [3, 4, 3, 4]])

In [14]:
# A negative padding_idx is counted from the end of the table, so with
# 10 embeddings padding_idx=-1 designates index 9 as the padding row: its
# vector is all zeros. It does NOT make a literal -1 token a valid input.
embedding = torch.nn.Embedding(10, 3, padding_idx=-1)

# Named `token_ids` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
token_ids = torch.LongTensor([[0, 9, 0, 5]])
print(embedding(token_ids))

tensor([[[ 0.6195,  1.1019,  0.7366],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.6195,  1.1019,  0.7366],
         [-0.1482,  1.4594, -1.5008]]], grad_fn=<EmbeddingBackward0>)


### ASSISTments data

In [30]:
file_path = 'data/assistment_test15.csv'
df = pd.read_csv(file_path)

# Fail fast if any column the rest of the notebook relies on is absent.
# Columns are checked in a fixed order, so the first missing one is reported.
for required_column in ("skill_id", "correct", "user_id"):
    if required_column not in df.columns:
        raise KeyError(f"The column '{required_column}' was not found on {file_path}")

In [31]:
# Number of rows per user, most active users first.
df['user_id'].value_counts(sort=True, ascending=False)

70740    445
70363    413
70699    309
71179    190
70684    174
70709     90
71066     82
80119     53
71205     32
78091     29
54318     24
64532     18
71215     17
52613      7
84381      4
Name: user_id, dtype: int64

In [32]:
# Keep only the rows that have a skill_id.
df = df.dropna(subset=['skill_id'])

In [25]:
# Count of distinct skills (missing values are not counted).
df['skill_id'].nunique(dropna=True)

74

In [29]:
# Count of distinct users (missing values are not counted).
df['user_id'].nunique(dropna=True)

15

In [33]:
# Rows per user in the current df, most active users first.
df['user_id'].value_counts(sort=True, ascending=False)

70740    368
70363    326
70699    226
70684    161
71179    149
70709     83
71066     62
80119     47
71205     32
78091     27
54318     24
64532     18
71215     15
52613      7
84381      4
Name: user_id, dtype: int64

In [20]:
# How many different users appear in df.
df['user_id'].nunique(dropna=True)

15

In [1]:
# Joining two lists builds a new list; the operands are left unchanged.
a = [2, 3]
b = [4]
c = [*a, *b]

# One list per line: a, then b, then the combined list.
print(a, b, c, sep='\n')

[2, 3]
[4]
[2, 3, 4]
