In [3]:
import numpy as np
import pandas as pd
from config import Config
from sklearn.model_selection import train_test_split

In [4]:
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}

train = pd.read_csv(Config.TRAIN_FILE, dtype=dtype, parse_dates=['Timestamp'])
train = train.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
test = pd.read_csv(Config.TEST_FILE, dtype=dtype, parse_dates=['Timestamp'])
test = test.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

In [5]:
train_df = pd.read_csv(Config.TRAIN_FILE, usecols=[
                        0, 1, 3, 4, 5], dtype=dtype, parse_dates=['Timestamp'])
test_df = pd.read_csv(Config.TEST_FILE, usecols=[
                        0, 1, 3, 4, 5], dtype=dtype, parse_dates=['Timestamp'])


In [7]:
train_df['userID'].value_counts()

730     1860
481     1847
1112    1777
394     1774
926     1773
        ... 
7396      14
7414      13
7390      13
7252      13
7441       9
Name: userID, Length: 6698, dtype: int64

In [16]:
test_df.loc[test_df['answerCode'] != -1]

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,1,2020-01-09 10:56:31,2626
1,3,A050023002,1,2020-01-09 10:56:57,2626
2,3,A050023003,0,2020-01-09 10:58:31,2625
3,3,A050023004,0,2020-01-09 10:58:36,2625
4,3,A050023006,0,2020-01-09 10:58:43,2623
...,...,...,...,...,...
260108,7439,A040197006,1,2020-08-21 07:39:45,2132
260109,7439,A040130001,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,1,2020-10-14 23:08:02,8244


In [7]:
pd.concat([train_df, test_df]).reset

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,1,2020-03-24 00:17:11,7224
1,0,A060001002,1,2020-03-24 00:17:14,7225
2,0,A060001003,1,2020-03-24 00:17:22,7225
3,0,A060001004,1,2020-03-24 00:17:29,7225
4,0,A060001005,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...
260109,7439,A040130001,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,1,2020-10-14 23:09:31,8244


In [12]:
train_split, valid_split = train_test_split(train_df, test_size=0.1)

In [13]:
train_split

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
432861,604,A070120007,0,2020-09-01 23:12:13,9660
1975772,4056,A050126003,1,2020-09-23 05:53:58,5289
2063263,4527,A030167001,1,2020-10-20 23:25:39,1727
715769,1034,A020086010,0,2020-05-08 03:09:42,7938
1248716,1978,A080047006,0,2020-06-17 22:20:27,4804
...,...,...,...,...,...
1218647,1913,A030013004,0,2020-04-17 12:12:14,7308
2240917,6513,A070001002,1,2020-01-21 05:45:53,608
1278797,2051,A020090002,1,2020-07-30 12:04:57,7944
2070582,4575,A030007005,1,2020-08-11 08:24:54,307


In [14]:
valid_split

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
2020327,4270,A060170004,0,2020-10-23 14:19:48,1494
2017970,4262,A050177006,0,2020-08-28 05:46:36,5544
1148513,1775,A080051006,0,2020-05-27 04:51:01,4959
664302,952,A020124002,1,2020-09-25 04:44:24,8018
814119,1185,A010062002,0,2020-08-06 06:45:54,6453
...,...,...,...,...,...
1762474,3235,A020096004,1,2020-07-14 11:10:28,7944
1394227,2296,A050073005,0,2020-08-06 12:03:41,3828
18710,25,A070128004,0,2020-11-17 05:53:59,9065
394039,539,A070005006,1,2020-04-27 21:19:06,3794


In [7]:
set(test['KnowledgeTag']) - set(train['KnowledgeTag'])

set()

In [8]:
train['KnowledgeTag'].nunique()

912

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userID            int16         
 1   assessmentItemID  object        
 2   testId            object        
 3   answerCode        int8          
 4   Timestamp         datetime64[ns]
 5   KnowledgeTag      int16         
dtypes: datetime64[ns](1), int16(2), int8(1), object(2)
memory usage: 62.7+ MB


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260114 entries, 0 to 260113
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   userID            260114 non-null  int16         
 1   assessmentItemID  260114 non-null  object        
 2   testId            260114 non-null  object        
 3   answerCode        260114 non-null  int8          
 4   Timestamp         260114 non-null  datetime64[ns]
 5   KnowledgeTag      260114 non-null  int16         
dtypes: datetime64[ns](1), int16(2), int8(1), object(2)
memory usage: 7.2+ MB


In [None]:
train.loc[:, ['userID', 'Timestamp']].groupby('userID').diff(periods=-1)["Timestamp"].apply(lambda x: -x)

In [34]:
a = train_df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff(periods=1)['Timestamp'].fillna(pd.Timedelta(seconds=0)).apply(lambda x: x.total_seconds())

In [35]:
a

0                0.0
1                3.0
2                8.0
3                7.0
4                7.0
             ...    
2266581         24.0
2266582    6632178.0
2266583         11.0
2266584         46.0
2266585         73.0
Name: Timestamp, Length: 2266586, dtype: float64

In [50]:
a.astype(np.int32).max()

25602295

In [37]:
a.apply(lambda x: x if x <= 600.0 else 600.0)

0            0.0
1            3.0
2            8.0
3            7.0
4            7.0
           ...  
2266581     24.0
2266582    600.0
2266583     11.0
2266584     46.0
2266585     73.0
Name: Timestamp, Length: 2266586, dtype: float64

In [53]:
train_group = train_df[["userID", "assessmentItemID", "answerCode", "KnowledgeTag"]]\
    .groupby("userID")\
    .apply(lambda r: (r.assessmentItemID.values, r.answerCode.values,
                         r.KnowledgeTag.values))

In [8]:
test_group = test_df[["userID", "assessmentItemID", "answerCode", "KnowledgeTag"]]\
    .groupby("userID")\
    .apply(lambda r: (r.assessmentItemID.values, r.answerCode.values,
                         r.KnowledgeTag.values))

In [9]:
len(test_group)

744

In [64]:
train_group

userID
0       ([A060001001, A060001002, A060001003, A0600010...
1       ([A040013001, A040013002, A040013003, A0400130...
2       ([A030050001, A030050002, A030050003, A0300500...
5       ([A080001001, A080001002, A080001003, A0800010...
6       ([A030016001, A030016002, A030016003, A0300160...
                              ...                        
7436    ([A050095001, A050095002, A050095003, A0500950...
7437    ([A040072001, A040072002, A040072003, A0400720...
7438    ([A080002001, A080002002, A080002003, A0800020...
7440    ([A050096001, A050096002, A050096005, A0500960...
7441    ([A030071001, A030071002, A030071003, A0300710...
Length: 6698, dtype: object

In [69]:
t, v = train_test_split(train_group, test_size=0.1)

In [70]:
t

userID
7423    ([A030170001, A030170002, A030170003, A0301700...
401     ([A070001001, A070001002, A070001003, A0700010...
6533    ([A050001001, A050001002, A050001003, A0500010...
5862    ([A050159001, A050159002, A050159003, A0501590...
4564    ([A070071001, A070071002, A070071003, A0700710...
                              ...                        
2741    ([A040004001, A040004002, A040004003, A0400040...
1778    ([A030017001, A030017002, A030017003, A0300170...
5558    ([A040086001, A040086002, A040086003, A0400860...
5594    ([A040041001, A040041002, A040041003, A0400410...
649     ([A040027001, A040027002, A040027003, A0400270...
Length: 6028, dtype: object

In [71]:
v

userID
3370    ([A050046001, A050046002, A050046003, A0500460...
4845    ([A080003001, A080003002, A080003003, A0800030...
3193    ([A050002001, A050002002, A050002003, A0500020...
6924    ([A080004001, A080004002, A080004003, A0800040...
4331    ([A090003001, A090003003, A090003002, A0900030...
                              ...                        
6092    ([A050107001, A050107002, A050107003, A0501070...
1704    ([A070001001, A070001002, A070001003, A0700010...
6107    ([A030016001, A030016002, A030016003, A0300160...
3778    ([A070003001, A070003002, A070003003, A0700030...
1803    ([A060081001, A060081002, A060081003, A0600810...
Length: 670, dtype: object

In [67]:
test_group

userID
3       ([A050023001, A050023002, A050023003, A0500230...
4       ([A040001001, A040001002, A040001003, A0400010...
13      ([A060011001, A060011002, A060011003, A0600110...
17      ([A040007001, A040007002, A040007003, A0400070...
26      ([A060011001, A060011002, A060011003, A0600110...
                              ...                        
7395    ([A030076001, A030076002, A030076003, A0300760...
7404    ([A020124001, A020124002, A020124003, A0201240...
7416    ([A080002001, A080002002, A080002003, A0800020...
7417    ([A010093001, A010093002, A010093003, A0100930...
7439    ([A040003001, A040003002, A040003003, A0400030...
Length: 744, dtype: object

In [76]:
pd.concat([t, test_group])

userID
7423    ([A030170001, A030170002, A030170003, A0301700...
401     ([A070001001, A070001002, A070001003, A0700010...
6533    ([A050001001, A050001002, A050001003, A0500010...
5862    ([A050159001, A050159002, A050159003, A0501590...
4564    ([A070071001, A070071002, A070071003, A0700710...
                              ...                        
7395    ([A030076001, A030076002, A030076003, A0300760...
7404    ([A020124001, A020124002, A020124003, A0201240...
7416    ([A080002001, A080002002, A080002003, A0800020...
7417    ([A010093001, A010093002, A010093003, A0100930...
7439    ([A040003001, A040003002, A040003003, A0400030...
Length: 6772, dtype: object

In [17]:
pd.Timedeltatest_df['Timestamp']

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,1,2020-01-09 10:56:31,2626
1,3,A050023002,1,2020-01-09 10:56:57,2626
2,3,A050023003,0,2020-01-09 10:58:31,2625
3,3,A050023004,0,2020-01-09 10:58:36,2625
4,3,A050023006,0,2020-01-09 10:58:43,2623
...,...,...,...,...,...
260109,7439,A040130001,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,1,2020-10-14 23:09:31,8244


In [65]:
test_df

Unnamed: 0,userID,assessmentItemID,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,1,2020-01-09 10:56:31,2626
1,3,A050023002,1,2020-01-09 10:56:57,2626
2,3,A050023003,0,2020-01-09 10:58:31,2625
3,3,A050023004,0,2020-01-09 10:58:36,2625
4,3,A050023006,0,2020-01-09 10:58:43,2623
...,...,...,...,...,...
260109,7439,A040130001,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,1,2020-10-14 23:09:31,8244


In [77]:
test_group.copy()

userID
3       ([A050023001, A050023002, A050023003, A0500230...
4       ([A040001001, A040001002, A040001003, A0400010...
13      ([A060011001, A060011002, A060011003, A0600110...
17      ([A040007001, A040007002, A040007003, A0400070...
26      ([A060011001, A060011002, A060011003, A0600110...
                              ...                        
7395    ([A030076001, A030076002, A030076003, A0300760...
7404    ([A020124001, A020124002, A020124003, A0201240...
7416    ([A080002001, A080002002, A080002003, A0800020...
7417    ([A010093001, A010093002, A010093003, A0100930...
7439    ([A040003001, A040003002, A040003003, A0400030...
Length: 744, dtype: object

In [10]:
import torch

In [12]:
torch.triu(torch.ones(4, 4), diagonal=1).to(dtype=torch.bool)

tensor([[False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True],
        [False, False, False, False]])

In [13]:
a = np.array([[1, 1], [2, 2], [3, 3]])
a

array([[1, 1],
       [2, 2],
       [3, 3]])

In [14]:
np.insert(a, 1, 5)

array([1, 5, 1, 2, 2, 3, 3])

In [15]:
np.insert(a, 1, 5, axis=1)

array([[1, 5, 1],
       [2, 5, 2],
       [3, 5, 3]])

In [7]:
test['userID'].value_counts()

584     1620
1348    1379
260     1335
617     1333
203     1318
        ... 
5965      15
7033      15
7404      15
7416      15
7417      15
Name: userID, Length: 744, dtype: int64

In [8]:
exe_ids = np.zeros(10, dtype=int)

In [10]:
set(train['assessmentItemID']) == set(test['assessmentItemID'])

True

In [11]:
train['assessmentItemID'].nunique()

9454

In [16]:
a, b = pd.factorize(train['assessmentItemID'], sort=True)

In [19]:
len(b)

9454