In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

In [2]:
def percentile(s):
    return round(np.sum(s) / len(s)*100, 2)

In [3]:
def avgLV(data):
    tags = data['KnowledgeTag'].unique()
    LV = []
    
    for tid in tags:
        LV.append(data[data['KnowledgeTag']==tid]['userLV_Tag'].iloc[0])
    return round(np.mean(LV))

### 데이터 로드

In [4]:
warnings.filterwarnings(action='ignore')

In [5]:
print(f'Loading Data ...')
total = pd.read_csv('../data/total_data.csv')
train = total[total['answerCode']>=0]
print(f'Total Data: {len(total)}')
print(f'Feature Engineering Data (only answerCode >= 0 -> train data): {len(train)}')

# 1. 태그별 정답률 & 유저별 태그 정답률
print(f'Calculating correct tagRatio and userBytagRatio ...')
train['tagRatio'] = train.groupby('KnowledgeTag').answerCode.transform(percentile)
train['userBytagRatio'] = train.groupby(['userID', 'KnowledgeTag']).answerCode.transform(percentile)

# 2.유저별 정답률
print(f' Calculating correct userRatio ...')
train['userRatio'] = train.groupby('userID').answerCode.transform(percentile)
print(f'Done!!\n')

feature_by_test = train.drop(['assessmentItemID', 'Timestamp', 'testId', 'answerCode'], axis=1)

Loading Data ...
Total Data: 2513268
Feature Engineering Data (only answerCode >= 0 -> train data): 2513268
Calculating correct tagRatio and userBytagRatio ...
 Calculating correct userRatio ...
Done!!



### KnowledgeTag Level

In [6]:
tag_mean_ratio = round(feature_by_test.groupby('KnowledgeTag').mean()['tagRatio'].mean(), 2)
diff =  feature_by_test['tagRatio'] - tag_mean_ratio
pre_std = int(diff.min()//10)*10
last_std = int(diff.max()//10+1)*10
LV = (last_std - pre_std)//10

print('Labeling for Tag Level...')
print(f'tag mean ratio: {tag_mean_ratio}%')
print(f'diff min({round(diff.min(), 2)}%), diff max({round(diff.max(), 2)}%)\n')

Labeling for Tag Level...
tag mean ratio: 61.65%
diff min(-44.05%), diff max(35.59%)



In [7]:
for std in tqdm(range(pre_std+10, last_std+1, 10)):
    print(f'{pre_std:3d}% < diff <= {std:3d}%  |  Level: {LV}')
    idx = (pre_std < diff) & (diff <= std)
    feature_by_test.loc[idx, 'tagLV'] = LV
    pre_std = std
    LV -= 1

print()
print('Done!!')
print(f'Check all Tag Level: {sorted(feature_by_test["tagLV"].unique())}\n')

100%|██████████| 9/9 [00:00<00:00, 94.74it/s]

-50% < diff <= -40%  |  Level: 9
-40% < diff <= -30%  |  Level: 8
-30% < diff <= -20%  |  Level: 7
-20% < diff <= -10%  |  Level: 6
-10% < diff <=   0%  |  Level: 5
  0% < diff <=  10%  |  Level: 4
 10% < diff <=  20%  |  Level: 3
 20% < diff <=  30%  |  Level: 2
 30% < diff <=  40%  |  Level: 1

Done!!
Check all Tag Level: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]






### User Level by KnowledgeTag

In [8]:
diff =  feature_by_test['userBytagRatio'] - feature_by_test['tagRatio']
pre_std = int(diff.min()//10)*10
last_std = int(diff.max()//10+1)*10
LV = 1

print('Labeling for User Level by Tag...')
print(f'diff min({round(diff.min(), 2)}%), diff max({round(diff.max(), 2)}%)\n')

Labeling for User Level by Tag...
diff min(-97.24%), diff max(82.4%)



In [9]:
for std in tqdm(range(pre_std+10, last_std+1, 10)):
    print(f'[DEBUG] {pre_std:3d}% < diff <= {std:3d}%  |  Level: {LV}')
    idx = (pre_std < diff) & (diff <= std)
    feature_by_test.loc[idx, 'userLV'] = LV
    pre_std = std
    LV += 1

print()
print('Done!!')
print(f'Check all User Level: {sorted(feature_by_test["userLV"].unique())}\n')

 58%|█████▊    | 11/19 [00:00<00:00, 100.00it/s]

[DEBUG] -100% < diff <= -90%  |  Level: 1
[DEBUG] -90% < diff <= -80%  |  Level: 2
[DEBUG] -80% < diff <= -70%  |  Level: 3
[DEBUG] -70% < diff <= -60%  |  Level: 4
[DEBUG] -60% < diff <= -50%  |  Level: 5
[DEBUG] -50% < diff <= -40%  |  Level: 6
[DEBUG] -40% < diff <= -30%  |  Level: 7
[DEBUG] -30% < diff <= -20%  |  Level: 8
[DEBUG] -20% < diff <= -10%  |  Level: 9
[DEBUG] -10% < diff <=   0%  |  Level: 10
[DEBUG]   0% < diff <=  10%  |  Level: 11
[DEBUG]  10% < diff <=  20%  |  Level: 12
[DEBUG]  20% < diff <=  30%  |  Level: 13
[DEBUG]  30% < diff <=  40%  |  Level: 14
[DEBUG]  40% < diff <=  50%  |  Level: 15


100%|██████████| 19/19 [00:00<00:00, 101.61it/s]

[DEBUG]  50% < diff <=  60%  |  Level: 16
[DEBUG]  60% < diff <=  70%  |  Level: 17
[DEBUG]  70% < diff <=  80%  |  Level: 18
[DEBUG]  80% < diff <=  90%  |  Level: 19

Done!!
Check all User Level: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]






### Labeling for Total User Level

In [10]:
print('Labeling for Total User Level (tagLV x userLV) ...')
feature_by_test['userLV_Tag'] = feature_by_test['tagLV']*feature_by_test['userLV']
print('Done!!')
print(f'Num of User Level: {len(feature_by_test["userLV_Tag"].unique())}')
print(f'Check all User Level: {sorted(feature_by_test["userLV_Tag"].unique())}\n')

warnings.filterwarnings(action='default')

Labeling for Total User Level (tagLV x userLV) ...
Done!!
Num of User Level: 67
Check all User Level: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 14.0, 15.0, 16.0, 18.0, 20.0, 21.0, 22.0, 24.0, 25.0, 27.0, 28.0, 30.0, 32.0, 33.0, 35.0, 36.0, 39.0, 40.0, 42.0, 44.0, 45.0, 48.0, 49.0, 50.0, 52.0, 54.0, 55.0, 56.0, 60.0, 63.0, 64.0, 65.0, 66.0, 70.0, 72.0, 75.0, 77.0, 78.0, 80.0, 81.0, 84.0, 88.0, 90.0, 91.0, 96.0, 98.0, 104.0, 105.0, 112.0, 119.0, 120.0, 126.0, 136.0, 144.0, 171.0]



In [36]:
print(f'Labeling for User Average Level ...')
user_grouby = feature_by_test.groupby('userID').apply(avgLV)
for uid in tqdm(user_grouby.index):
    feature_by_test.loc[feature_by_test['userID']==uid, 'userLV_Tag_avg'] = user_grouby.loc[uid]
levels = feature_by_test['userLV_Tag_avg'].unique()
print(f'Done!!')
print(f'Num of User AVG Levels: {len(levels)}')
print(f'Level(min): {min(levels)}, Level(max): {max(levels)}')
print(f'Check all User AVG Level: {sorted(levels)}\n')

Labeling for User Average Level ...


100%|██████████| 6844/6844 [00:42<00:00, 161.33it/s]

Done!!
Num of User AVG Levels: 70
Level(min): 11.0, Level(max): 91.0
Check all User AVG Level: [11.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 84.0, 91.0]






### Merge with Total Dataset

In [37]:
print(f'Merge with Total Dataset ...\n')
idx = feature_by_test.index
columns = ['tagLV', 'userLV_Tag_avg', 'userLV_Tag']
for column in tqdm(columns):
    total.loc[idx, column] = feature_by_test[column]

Merge with Total Dataset ...



100%|██████████| 3/3 [00:00<00:00,  9.90it/s]


In [39]:
print(f'Feature Engineering to Inference data ...')
test = total[total['answerCode']<0]
for idx in tqdm(test.index):
    uid = test.loc[idx, 'userID']
    tagid = test.loc[idx, 'KnowledgeTag']
    total.loc[idx, 'tagLV'] = total[total['KnowledgeTag']==tagid]['tagLV'].iloc[0]
    avg = total[total['userID']==uid]['userLV_Tag_avg'].iloc[0]
    lv = total[(total['userID']==uid)&(total['KnowledgeTag']==tagid)]['userLV_Tag'].iloc[0]
    if np.isnan(lv): # 유저가 tag를 한 번도 풀어본 적이 없는 경우, 유저의 평균 lv를 넣어줌 
        total.loc[idx, 'userLV_Tag'] = avg
    else:
        total.loc[idx, 'userLV_Tag'] = lv
    total.loc[idx, 'userLV_Tag_avg'] = avg

Feature Engineering to Inference data ...


0it [00:00, ?it/s]


### Save

In [40]:
print(f'Saving ...')
total = total.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)
total.to_csv('../data/total_data_2.csv', index=False)
print('Done!!')
print(f'Check your "../data/total_data.csv"')

Saving ...
Done!!
Check your "../data/total_data.csv"


In [41]:
total.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'same_item_cnt', 'prior_elapsed',
       'current_elapsed', 'timeClass', 'day_diff', 'Bigcat', 'user_avg',
       'test_avg', 'item_avg', 'Bigcat_avg', 'tag_avg', 'user_time_avg',
       'test_time_avg', 'item_time_avg', 'Bigcat_time_avg', 'tag_time_avg',
       'user_std', 'test_std', 'item_std', 'Bigcat_std', 'tag_std',
       'user_correct_answer', 'user_total_answer', 'user_Cumacc',
       'user_Bigcat_correct_answer', 'user_Bigcat_total_answer',
       'user_Bigcat_Cumacc', 'user_current_avg', 'user_current_time_avg',
       'Bigcat_class', 'assess_count', 'tag_count', 'item_seq',
       'user_retCount_correct_answer', 'user_retCount', 'user_retCumacc',
       'elo', 'KnowledgeTag_elo', 'Bigcat_elo', 'tagLV', 'userLV_Tag_avg',
       'userLV_Tag'],
      dtype='object')

In [42]:
total.isna().sum()

userID                          0
assessmentItemID                0
testId                          0
answerCode                      0
Timestamp                       0
KnowledgeTag                    0
train                           0
same_item_cnt                   0
prior_elapsed                   0
current_elapsed                 0
timeClass                       0
day_diff                        0
Bigcat                          0
user_avg                        0
test_avg                        0
item_avg                        0
Bigcat_avg                      0
tag_avg                         0
user_time_avg                   0
test_time_avg                   0
item_time_avg                   0
Bigcat_time_avg                 0
tag_time_avg                    0
user_std                        0
test_std                        0
item_std                        0
Bigcat_std                      0
tag_std                         0
user_correct_answer             0
user_total_ans