# AutoGluon

- [docs](https://auto.gluon.ai/scoredebugweight/index.html#)

---

In [None]:
# !pip install autogluon

In [28]:
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np
import os

## 1. Data Load


In [12]:
# data 경로 설정
data_path = '/content/drive/MyDrive/DKT/data'

# Read train data
train_data = TabularDataset(os.path.join(data_path, "train_data_3PL.csv"))
train_data.tail()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Dffclt,Dscrmn,Gussng
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.767458,0.882364,0.123318
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,-0.277564,5.384278,0.099105
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,-0.267161,10.26359,0.035658
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,-0.229779,1.516802,0.513883
2266585,7441,A040165004,A040000165,1,2020-08-21 01:08:49,8836,-0.113676,1.90103,0.013027


## 2. Feature Engineering

In [13]:
import pandas as pd

def feature_engineering(df):

    # 유저별로 정렬
    df.sort_values(by=['userID', 'Timestamp'], inplace=True)

    # 데이터 타입 변경
    dtype = {
        'userID': 'int16',
        'answerCode': 'int8',
        'KnowledgeTag': 'int16'
    }
    df = df.astype(dtype)

    # 'Timestamp' 열을 날짜/시간 형식으로 파싱
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')

    # testTag 추가
    df['testTag'] = df['testId'].apply(lambda x: x[2]).astype('int16')

    # 유저별로 정답 누적 횟수 계산, 결측치 0
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_correct_answer'].fillna(0, inplace=True)

    # 유저별로 제출 누적 횟수 계산
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()

    # 유저별로 누적 정답률 계산, 결측치 0.75
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']
    df['user_acc'].fillna(0.75, inplace=True)

    # userID별 정답률 추가
    df['user_sum'] = df.groupby('userID')['answerCode'].transform('sum')
    df['user_mean'] = df.groupby('userID')['answerCode'].transform('mean')

    # assessmentItemID별 정답률 추가
    df['assessment_sum'] = df.groupby('assessmentItemID')['answerCode'].transform('sum')
    df['assessment_mean'] = df.groupby('assessmentItemID')['answerCode'].transform('mean')

    # testId별 정답률 추가
    df['test_sum'] = df.groupby('testId')['answerCode'].transform('sum')
    df['test_mean'] = df.groupby('testId')['answerCode'].transform('mean')

    # KnowledgeTag별 정답률 추가
    df['knowledgeTag_sum'] = df.groupby('KnowledgeTag')['answerCode'].transform('sum')
    df['knowledgeTag_mean'] = df.groupby('KnowledgeTag')['answerCode'].transform('mean')

    # testTag별 정답률 추가
    df['testTag_sum'] = df.groupby('testTag')['answerCode'].transform('sum')
    df['testTag_mean'] = df.groupby('testTag')['answerCode'].transform('mean')

    # 상대적 정답률
    df['relative_answer_assessment'] = df['answerCode'] - df.groupby('assessmentItemID')['answerCode'].transform('mean')

    # 유저별 상대적 정답률 평균 - 학습 수준 레벨
    df['relative_answer_mean'] = df.groupby('userID')['relative_answer_assessment'].transform('mean')

    # 유저가 문항을 푼 시간
    df['time_to_solve'] = df.groupby(['userID', 'testId'])['Timestamp'].diff().dt.total_seconds().shift(-1)

    # 결측치 이전 행 값으로 채움
    df['time_to_solve'].fillna(method='ffill', inplace=True)

    # 최대값 1000으로 처리 이상치 처리
    df.loc[df['time_to_solve'] >= 1000, 'time_to_solve'] = 1000

    # 문항당 문제를 푸는 평균시간
    # 문항당 문제를 푸는 평균시간 - 유저의 문제 푸는 시간을 처리하여, 문제 난이도 처리
    df['time_to_solve_mean'] = df.groupby(['userID', 'testId'])['time_to_solve'].transform('mean')

    df['assessmentItemID_time_to_solve_mean'] = df.groupby('assessmentItemID')['time_to_solve'].transform('mean')
    df['assessmentItemID_time_level'] = df['assessmentItemID_time_to_solve_mean'] - df['time_to_solve']

    # clip(0, 255)는 메모리를 위해 uint8 데이터 타입을 쓰기 위함
    df['prior_assessment_frequency'] = df.groupby(['userID', 'assessmentItemID']).cumcount().clip(0, 255)

    # 각 태그별로 이전에 몇번 풀었는지
    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()

    # 시험지 태그별 학년별 몇번 풀었는지
    df['prior_testTag_frequency'] = df.groupby(['userID', 'testTag']).cumcount()

    return df


In [14]:
train_df = feature_engineering(train_data)

  df['time_to_solve'].fillna(method='ffill', inplace=True)


In [15]:
# FE

FEATS = [
 'KnowledgeTag',
 'Dffclt',
 'Dscrmn',
 'Gussng',
 'testTag',
 'user_correct_answer',
 'user_total_answer',
 'user_acc',
 'user_mean',
 'relative_answer_mean',
 'time_to_solve',
 'time_to_solve_mean',
 'assessmentItemID_time_level',
 'prior_testTag_frequency',

 'answerCode' # target
 ]

In [16]:
train_df = train_df[FEATS]
train_df.tail

<bound method NDFrame.tail of          KnowledgeTag    Dffclt     Dscrmn    Gussng  testTag  \
0                7224 -2.017182  20.079513  0.052178        6   
1                7225 -1.723821   4.616495  0.056888        6   
2                7225 -0.167255  18.583456  0.754422        6   
3                7225  0.496282  39.877030  0.946875        6   
4                7225 -1.335100   6.965071  0.237969        6   
...               ...       ...        ...       ...      ...   
2266581           438  0.767458   0.882364  0.123318        3   
2266582          8836 -0.277564   5.384278  0.099105        4   
2266583          8836 -0.267161  10.263590  0.035658        4   
2266584          8836 -0.229779   1.516802  0.513883        4   
2266585          8836 -0.113676   1.901030  0.013027        4   

         user_correct_answer  user_total_answer  user_acc  user_mean  \
0                        0.0                  0  0.750000   0.630872   
1                        1.0                 

---

## 3. Train

In [17]:
save_path = 'ag'

label = "answerCode"
predictor = TabularPredictor(label=label, problem_type="binary", path=save_path).fit(train_df, presets=["best_quality"] , time_limit = 600)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: ag/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 150s
AutoGluon will save models to "ag/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sat Nov 18 15:31:17 UTC 2023
CPU Count:          2
Memory Avail:      

In [18]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  binary
AutoGluon identified the following types of features:
('float', []) : 10 | ['Dffclt', 'Dscrmn', 'Gussng', 'user_correct_answer', 'user_acc', ...]
('int', [])   :  4 | ['KnowledgeTag', 'testTag', 'user_total_answer', 'prior_testTag_frequency']


In [19]:
predictor.model_names()

['LightGBMXT_BAG_L1',
 'WeightedEnsemble_L2',
 'LightGBMXT_BAG_L2',
 'WeightedEnsemble_L3']

In [20]:
predictor.model_best

'WeightedEnsemble_L2'

## 4. model summary

In [21]:
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                 model  score_val eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    LightGBMXT_BAG_L1   0.775288    accuracy       8.949361  113.226200                8.949361         113.226200            1       True          1
1  WeightedEnsemble_L2   0.775288    accuracy      12.562845  113.869070                3.613484           0.642870            2       True          2
2  WeightedEnsemble_L3   0.775288    accuracy      13.816945  273.249957                4.867584         160.023757            3       True          4
3    LightGBMXT_BAG_L2   0.654378    accuracy      10.095284  175.856515                1.145922          62.630316            2       True          3
Number of models trained: 4
Types of models trained:
{'StackerEnsembleModel_LGB', 'WeightedEnsembleModel'}
Bagging used: True  (with 8 folds)
Multi-layer stack-ensembling used: True  

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L3': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBMXT_BAG_L1': 0.7752880323093851,
  'WeightedEnsemble_L2': 0.7752880323093851,
  'LightGBMXT_BAG_L2': 0.654378435232548,
  'WeightedEnsemble_L3': 0.7752880323093851},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'LightGBMXT_BAG_L1': ['LightGBMXT_BAG_L1'],
  'WeightedEnsemble_L2': ['WeightedEnsemble_L2'],
  'LightGBMXT_BAG_L2': ['LightGBMXT_BAG_L2'],
  'WeightedEnsemble_L3': ['WeightedEnsemble_L3']},
 'model_fit_times': {'LightGBMXT_BAG_L1': 113.22619962692261,
  'WeightedEnsemble_L2': 0.6428701877593994,
  'LightGBMXT_BAG_L2': 62.63031578063965,
  'WeightedEnsemble_L3': 160.02375721931458},
 'model_pred_times': {'LightGBMXT_BAG_L1': 8.949361085891724,
  'WeightedEnsemble_L2': 3.6134843826293945,
  'LightGBMXT_BAG_L2': 1.145922422409

## 5. Predict

In [22]:
# data 경로 설정
data_path = '/content/drive/MyDrive/DKT/data'

# Read test data
test_data = TabularDataset(os.path.join(data_path, "test_data_3PL.csv"))
test_df = feature_engineering(test_data)
test_df = test_df[FEATS]

Loaded data from: /content/drive/MyDrive/DKT/data/test_data_3PL.csv | Columns = 9 / 9 | Rows = 260114 -> 260114
  df['time_to_solve'].fillna(method='ffill', inplace=True)


In [24]:
test_df = test_df[test_df['answerCode'] == -1]
test_df.tail()

Unnamed: 0,KnowledgeTag,Dffclt,Dscrmn,Gussng,testTag,user_correct_answer,user_total_answer,user_acc,user_mean,relative_answer_mean,time_to_solve,time_to_solve_mean,assessmentItemID_time_level,prior_testTag_frequency,answerCode
260052,10615,0.550984,12.82007,0.1805534,4,7.0,23,0.304348,0.25,-0.38221,2.0,2.8,112.410256,9,-1
260067,7636,-1.266479,2.958324,1.186053e-09,3,7.0,14,0.5,0.4,-0.319444,107.0,65.2,-60.833333,4,-1
260082,10402,-1.430409,1.836024,8.723365e-05,5,7.0,14,0.5,0.4,-0.220571,24.0,17.75,17.16,3,-1
260097,10402,-1.430409,1.836024,8.723365e-05,5,2.0,14,0.142857,0.066667,-0.645399,21.0,21.0,20.16,3,-1
260113,8832,0.052693,13.85092,0.3003161,4,11.0,15,0.733333,0.625,-0.108488,32.0,38.4,53.619048,15,-1


In [25]:
len(test_df)

744

In [26]:
predictor.leaderboard(test_df, extra_metrics=['accuracy', 'balanced_accuracy', 'log_loss'])

Unnamed: 0,model,score_test,accuracy,balanced_accuracy,log_loss,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L1,0.0,0.0,0.666667,-1.176081,0.775288,accuracy,0.102684,8.949361,113.2262,0.102684,8.949361,113.2262,1,True,1
1,WeightedEnsemble_L3,0.0,0.0,0.666667,-1.176081,0.775288,accuracy,0.104364,13.816945,273.249957,0.00168,4.867584,160.023757,3,True,4
2,WeightedEnsemble_L2,0.0,0.0,0.666667,-1.176081,0.775288,accuracy,0.105025,12.562845,113.86907,0.002341,3.613484,0.64287,2,True,2
3,LightGBMXT_BAG_L2,0.0,0.0,0.5,-1.654749,0.654378,accuracy,0.150822,10.095284,175.856515,0.048137,1.145922,62.630316,2,True,3


In [27]:
# 이진 분류 예측

y_pred = predictor.predict(test_df)
y_pred

1035      1
1706      1
3023      0
4283      1
4670      0
         ..
260052    0
260067    1
260082    1
260097    1
260113    1
Name: answerCode, Length: 744, dtype: int8

In [30]:
# 확률 예측

y_probabilities = predictor.predict_proba(test_df)
y_probabilities

Unnamed: 0,0,1
1035,0.395996,0.604004
1706,0.253983,0.746017
3023,0.587725,0.412275
4283,0.277558,0.722442
4670,0.575960,0.424040
...,...,...
260052,0.728652,0.271348
260067,0.222105,0.777895
260082,0.258163,0.741837
260097,0.281351,0.718649


In [31]:
# 1일 확률 가져오기

probability_of_1 = y_probabilities.iloc[:, 1]
probability_of_1

1035      0.604004
1706      0.746017
3023      0.412275
4283      0.722442
4670      0.424040
            ...   
260052    0.271348
260067    0.777895
260082    0.741837
260097    0.718649
260113    0.574031
Name: 1, Length: 744, dtype: float64

In [None]:
# 결과를 파일로 저장

output_dir = 'output/'

write_path = os.path.join(output_dir, "submission_WeightedEnsemble_L3_.csv")

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction: {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(probability_of_1):
        w.write('{},{}\n'.format(id, p))
        print(id, p)