In [1]:
import pandas as pd
pd.pandas.set_option('display.max_columns', None) 
from Feature_Engineering import FeatureEngineering
import warnings
warnings.filterwarnings('ignore')

dtype = {'userID' : 'int16',
         'answerCode' : 'int8',
         'KnowledgeTag' : 'int16'}   

DATA_PATH = '/data/ephemeral/data/'

train = pd.read_csv(DATA_PATH + 'train_data.csv', dtype = dtype, parse_dates = ['Timestamp'])
train = train.sort_values(by = ['userID', 'Timestamp']).reset_index(drop = True)

test = pd.read_csv(DATA_PATH + 'test_data.csv', dtype = dtype, parse_dates = ['Timestamp'])
test = test.sort_values(by = ['userID', 'Timestamp']).reset_index(drop = True)

data = pd.concat([train, test], axis = 0).reset_index(drop = True).sort_values(by = ['userID', 'Timestamp'])
fe = FeatureEngineering(data)
data.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [2]:
Feature_1 = pd.DataFrame({
    ### 시험지 관련 변수
    'userID' : fe.Feature_UserID(data),
    'assessmentItemID' : fe.Feature_ItemID(data),
    'answerCode' : fe.Feature_answerCode(data),
    'KnowledgeTag' : fe.Feature_Tag(data),
    'testID' : fe.Feature_testID(data),
    'testCode' : fe.Feature_testCode(data),
    'testNum' : fe.Feature_testNum(data),
    'problemID' : fe.Feature_problemID(data),
    'problemID_Norm' : fe.Feature_problemID_Norm(data),
    'total_problem' : fe.Feature_Total_Problem(data),
})
Feature_1

Unnamed: 0,userID,assessmentItemID,answerCode,KnowledgeTag,testID,testCode,testNum,problemID,problemID_Norm,total_problem
0,0,A060001001,1,7224,060001,6,1,1,0.000000,7
1,0,A060001002,1,7225,060001,6,1,2,0.083333,7
2,0,A060001003,1,7225,060001,6,1,3,0.166667,7
3,0,A060001004,1,7225,060001,6,1,4,0.250000,7
4,0,A060001005,1,7225,060001,6,1,5,0.333333,7
...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,0,438,030071,3,71,5,0.333333,5
2526696,7441,A040165001,1,8836,040165,4,165,1,0.000000,4
2526697,7441,A040165002,1,8836,040165,4,165,2,0.083333,4
2526698,7441,A040165003,1,8836,040165,4,165,3,0.166667,4


In [3]:
Feature_2 = pd.DataFrame({
    ### Timestamp 관련 변수
    'year' : fe.Feature_year(data),
    'quarter' : fe.Feature_quarter(data),
    'month' : fe.Feature_month(data),
    'day' : fe.Feature_day(data),
    'hour' : fe.Feature_hour(data),
    'dow' : fe.Feature_dow(data),
    'weekday' : fe.Feature_weekday(data),
    'LagTime' : fe.Feature_LagTime(data),
    'ElapsedTime' : fe.Feature_ElapsedTime(data),
    'ElapsedTime_Rolling2' : fe.Feature_ElapsedTime_Rolling_Average2(data),
    'ElapsedTime_Rolling3' : fe.Feature_ElapsedTime_Rolling_Average3(data),
    'ElapsedTime_Rolling4' : fe.Feature_ElapsedTime_Rolling_Average4(data),
    'ElapsedTime_Rolling5' : fe.Feature_ElapsedTime_Rolling_Average5(data),
    'user_ElaspedTime_avg' : fe.Feature_User_ElapsedTime_Average(data),
    'item_ElaspedTime_avg' : fe.Feature_Item_ElapsedTime_Average(data),
    'testID_ElaspedTime_avg' : fe.Feature_testID_ElapsedTime_Average(data),
    'testCode_ElaspedTime_avg' : fe.Feature_testCode_ElapsedTime_Average(data),
    'testNum_ElaspedTime_avg' : fe.Feature_testNum_ElapsedTime_Average(data),
    'problemID_ElaspedTime_avg' : fe.Feature_problemID_ElapsedTime_Average(data),
    'tag_ElaspedTime_avg' : fe.Feature_Tag_ElapsedTime_Average(data),
    'Real_Solved' : fe.Feature_Real_Solved(data), 
    'Correct_User_ElapsedTime' : fe.Feature_Correct_ElapsedTime_Average(data),
    'Wrong_User_ElapsedTime' : fe.Feature_Wrong_ElapsedTime_Average(data),
})
Feature_2

Unnamed: 0,year,quarter,month,day,hour,dow,weekday,LagTime,ElapsedTime,ElapsedTime_Rolling2,ElapsedTime_Rolling3,ElapsedTime_Rolling4,ElapsedTime_Rolling5,user_ElaspedTime_avg,item_ElaspedTime_avg,testID_ElaspedTime_avg,testCode_ElaspedTime_avg,testNum_ElaspedTime_avg,problemID_ElaspedTime_avg,tag_ElaspedTime_avg,Real_Solved,Correct_User_ElapsedTime,Wrong_User_ElapsedTime
0,2020,1,3,24,0,1,1,0.000000,0.0,0.0,0.000000,0.00,0.0,38.262057,0.079794,18.804632,66.971222,37.934893,3.989678,13.583961,0,0.081091,0.000000
1,2020,1,3,24,0,1,1,0.000000,3.0,1.5,0.000000,0.00,0.0,38.262057,13.660000,18.804632,66.971222,37.934893,73.232491,28.919573,0,11.363636,83.125000
2,2020,1,3,24,0,1,1,0.000000,8.0,5.5,3.666667,0.00,0.0,38.262057,26.112000,18.804632,66.971222,37.934893,73.933021,28.919573,0,24.637555,42.190476
3,2020,1,3,24,0,1,1,0.000000,7.0,7.5,6.000000,4.50,0.0,38.262057,19.180000,18.804632,66.971222,37.934893,70.596414,28.919573,0,19.572016,5.571429
4,2020,1,3,24,0,1,1,0.000000,7.0,7.0,7.333333,6.25,5.0,38.262057,18.076000,18.804632,66.971222,37.934893,69.928460,28.919573,0,18.569620,9.076923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,2020,2,6,5,6,4,1,0.000000,24.0,29.0,32.666667,55.00,44.0,38.888889,85.426428,54.101852,60.348466,50.247548,69.928460,54.958278,1,110.552239,65.144147
2526696,2020,3,8,21,1,4,1,110.539963,0.0,12.0,19.333333,24.50,44.0,38.888889,0.706391,48.476262,60.234290,59.901487,3.989678,46.705633,0,0.144703,1.719530
2526697,2020,3,8,21,1,4,1,110.539963,11.0,5.5,11.666667,17.25,21.8,38.888889,46.220000,48.476262,60.234290,59.901487,73.232491,46.705633,1,43.744792,50.620370
2526698,2020,3,8,21,1,4,1,110.539963,46.0,28.5,19.000000,20.25,23.0,38.888889,34.720000,48.476262,60.234290,59.901487,73.933021,46.705633,1,35.144068,33.156250


In [4]:
Feature_3 = pd.DataFrame({
    ### 유저 별로 변수 별 정답 수 / 문제 풀이 수 / 정답률 계산 (시간 순으로 누적)
    'user_sum' : fe.Feature_User_Sum(data),
    'user_cnt' : fe.Feature_User_Count(data),
    'user_acc' : fe.Feature_User_Acc(data),
    'user_itemID_sum' : fe.Feature_User_Item_Sum(data),
    'user_itemID_cnt' : fe.Feature_User_Item_Count(data),
    'user_itemID_acc' : fe.Feature_User_Item_Acc(data),
    'user_testID_sum' : fe.Feature_User_testID_Sum(data),
    'user_testID_cnt' : fe.Feature_User_testID_Count(data),
    'user_testID_acc' : fe.Feature_User_testID_Acc(data),
    'user_testCode_sum' : fe.Feature_User_testCode_Sum(data),
    'user_testCode_cnt' : fe.Feature_User_testCode_Count(data),
    'user_testCode_acc' : fe.Feature_User_testCode_Acc(data),
    'user_testNum_sum' : fe.Feature_User_testNum_Sum(data),
    'user_testNum_cnt' : fe.Feature_User_testNum_Count(data),
    'user_testNum_acc' : fe.Feature_User_testNum_Acc(data),
    'user_problemID_sum' : fe.Feature_User_problemID_Sum(data),
    'user_problemID_cnt' : fe.Feature_User_problemID_Count(data),
    'user_problemID_acc' : fe.Feature_User_problemID_Acc(data),
    'user_tag_sum' : fe.Feature_User_Tag_Sum(data),
    'user_tag_cnt' : fe.Feature_User_Tag_Count(data),
    'user_tag_acc' : fe.Feature_User_Tag_Acc(data),
})
Feature_3

Unnamed: 0,user_sum,user_cnt,user_acc,user_itemID_sum,user_itemID_cnt,user_itemID_acc,user_testID_sum,user_testID_cnt,user_testID_acc,user_testCode_sum,user_testCode_cnt,user_testCode_acc,user_testNum_sum,user_testNum_cnt,user_testNum_acc,user_problemID_sum,user_problemID_cnt,user_problemID_acc,user_tag_sum,user_tag_cnt,user_tag_acc
0,0,0,0.000000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,0,0.0,0,0,0.00
1,1,1,1.000000,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,0,0.0,0,0,0.00
2,2,2,1.000000,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,0,0,0.0,1,1,1.00
3,3,3,1.000000,0,0,0.0,3,3,1.00,3,3,1.00,3,3,1.00,0,0,0.0,2,2,1.00
4,4,4,1.000000,0,0,0.0,4,4,1.00,4,4,1.00,4,4,1.00,0,0,0.0,3,3,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,1,4,0.250000,0,0,0.0,1,4,0.25,1,4,0.25,1,4,0.25,0,0,0.0,1,4,0.25
2526696,1,5,0.200000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,1,0.0,0,0,0.00
2526697,2,6,0.333333,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,1,0.0,1,1,1.00
2526698,3,7,0.428571,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,1,1,1.0,2,2,1.00


In [5]:
Feature_4 = pd.DataFrame({
    ### 전체 유저에 대해 여러 변수 별 정답 수 / 문제 풀이 수 / 정답률 계산
    'itemID_sum' : fe.Feature_Item_Sum(data),
    'itemID_cnt' : fe.Feature_Item_Count(data),
    'itemID_acc' : fe.Feature_Item_Acc(data),
    'testID_sum' : fe.Feature_testID_Sum(data),
    'testID_cnt' : fe.Feature_testID_Count(data),
    'testID_acc' : fe.Feature_testID_Acc(data),
    'testCode_sum' : fe.Feature_testCode_Sum(data),
    'testCode_cnt' : fe.Feature_testCode_Count(data),
    'testCode_acc' : fe.Feature_testCode_Acc(data),
    'testNum_sum' : fe.Feature_testNum_Sum(data),
    'testNum_cnt' : fe.Feature_testNum_Count(data),
    'testNum_acc' : fe.Feature_testNum_Acc(data), 
    'problemID_sum' : fe.Feature_problemID_Sum(data),
    'problemID_cnt' : fe.Feature_problemID_Count(data),
    'problemID_acc' : fe.Feature_problemID_Acc(data),
    'tag_sum' : fe.Feature_Tag_Sum(data),
    'tag_cnt' : fe.Feature_Tag_Count(data),
    'tag_acc' : fe.Feature_Tag_Acc(data),
    'itemID_high_freq' : fe.Feature_Item_High_Freq(data),
    'testID_high_freq' : fe.Feature_testID_High_Freq(data),
    'testCode_high_freq' : fe.Feature_testCode_High_Freq(data),
    'testNum_high_freq' : fe.Feature_testNum_High_Freq(data),
    'problemID_high_freq' : fe.Feature_problemID_High_Freq(data),
    'tag_high_freq' : fe.Feature_Tag_High_Freq(data),
})
Feature_4

Unnamed: 0,itemID_sum,itemID_cnt,itemID_acc,testID_sum,testID_cnt,testID_acc,testCode_sum,testCode_cnt,testCode_acc,testNum_sum,testNum_cnt,testNum_acc,problemID_sum,problemID_cnt,problemID_acc,tag_sum,tag_cnt,tag_acc,itemID_high_freq,testID_high_freq,testCode_high_freq,testNum_high_freq,problemID_high_freq,tag_high_freq
0,246,250,0.984000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,310634,414350,0.749690,718,750,0.957333,0,0,1,1,1,0
1,242,250,0.968000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,297842,413500,0.720295,3439,3750,0.917067,0,0,1,1,1,1
2,229,250,0.916000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,285143,414250,0.688336,3439,3750,0.917067,0,0,1,1,1,1
3,243,250,0.972000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,273382,412050,0.663468,3439,3750,0.917067,0,0,1,1,1,1
4,237,250,0.948000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,240452,401900,0.598288,3439,3750,0.917067,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,134,300,0.446667,999,1500,0.666000,212941,303450,0.701733,9783,15250,0.641508,240452,401900,0.598288,3127,4500,0.694889,1,0,1,1,1,1
2526696,193,300,0.643333,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,310634,414350,0.749690,2410,3450,0.698551,1,0,1,0,1,1
2526697,192,300,0.640000,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,297842,413500,0.720295,2410,3450,0.698551,1,0,1,0,1,1
2526698,236,300,0.786667,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,285143,414250,0.688336,2410,3450,0.698551,1,0,1,0,1,1


In [6]:
Feature_5 = pd.DataFrame({
    ### 과거 정보 활용 변수
    'user_past_solved' : fe.Feature_User_Past_Solved(data),
    'relative_correct_rate' : fe.Feature_Relative_Correct_Rate(data),
    'is_correct_before1' : fe.Feature_Is_Correct_Before1(data),
    'correct_rate_before1' : fe.Feature_Correct_Rate_Before1(data),
    'relative_correct_rate_before1' : fe.Feature_Relative_Correct_Rate_Before1(data),
    'is_correct_before2' : fe.Feature_Is_Correct_Before2(data),
    'correct_rate_before2' : fe.Feature_Correct_Rate_Before2(data),
    'relative_correct_rate_before2' : fe.Feature_Relative_Correct_Rate_Before2(data),
    'is_correct_before3' : fe.Feature_Is_Correct_Before3(data),
    'correct_rate_before3' : fe.Feature_Correct_Rate_Before3(data),
    'relative_correct_rate_before3' : fe.Feature_Relative_Correct_Rate_Before3(data),
    'is_correct_before4' : fe.Feature_Is_Correct_Before4(data),
    'correct_rate_before4' : fe.Feature_Correct_Rate_Before4(data),
    'relative_correct_rate_before4' : fe.Feature_Relative_Correct_Rate_Before4(data),
    'is_correct_before5' : fe.Feature_Is_Correct_Before5(data),
    'correct_rate_before5' : fe.Feature_Correct_Rate_Before5(data),
    'relative_correct_rate_before5' : fe.Feature_Relative_Correct_Rate_Before5(data),
})
Feature_5

Unnamed: 0,user_past_solved,relative_correct_rate,is_correct_before1,correct_rate_before1,relative_correct_rate_before1,is_correct_before2,correct_rate_before2,relative_correct_rate_before2,is_correct_before3,correct_rate_before3,relative_correct_rate_before3,is_correct_before4,correct_rate_before4,relative_correct_rate_before4,is_correct_before5,correct_rate_before5,relative_correct_rate_before5
0,0,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0
1,0,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0
2,0,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0
3,0,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.0,1.0
4,0,0.052000,1,0.972000,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,0,-0.446667,0,0.593333,-0.593333,1,0.843333,0.156667,0,0.870000,-0.870000,0,0.576667,-0.576667,1,1.0,1.0
2526696,0,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0
2526697,0,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0
2526698,0,0.213333,1,0.640000,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0


In [7]:
Feature_6 = pd.DataFrame({
    ### ELO Rating 변수
    'theta' : fe.Feature_ELO_Theta(data),
    'beta'  : fe.Feature_ELO_Beta(data),
})
Feature_6

Successfully Read User (theta) / Item (beta) parameter file.
Successfully Read User (theta) / Item (beta) parameter file.


Unnamed: 0,theta,beta
0,0.578202,-3.342176
1,0.578202,-3.003042
2,0.578202,-2.298332
3,0.578202,-3.064188
4,0.578202,-2.647697
...,...,...
2266581,0.053649,0.829114
2266582,0.053649,-0.214177
2266583,0.053649,-0.322165
2266584,0.053649,-0.822669


In [8]:
final_data = pd.concat([eval(f'Feature_{i+1}') for i in range(6)], axis = 1)
final_data

Unnamed: 0,userID,assessmentItemID,answerCode,KnowledgeTag,testID,testCode,testNum,problemID,problemID_Norm,total_problem,year,quarter,month,day,hour,dow,weekday,LagTime,ElapsedTime,ElapsedTime_Rolling2,ElapsedTime_Rolling3,ElapsedTime_Rolling4,ElapsedTime_Rolling5,user_ElaspedTime_avg,item_ElaspedTime_avg,testID_ElaspedTime_avg,testCode_ElaspedTime_avg,testNum_ElaspedTime_avg,problemID_ElaspedTime_avg,tag_ElaspedTime_avg,Real_Solved,Correct_User_ElapsedTime,Wrong_User_ElapsedTime,user_sum,user_cnt,user_acc,user_itemID_sum,user_itemID_cnt,user_itemID_acc,user_testID_sum,user_testID_cnt,user_testID_acc,user_testCode_sum,user_testCode_cnt,user_testCode_acc,user_testNum_sum,user_testNum_cnt,user_testNum_acc,user_problemID_sum,user_problemID_cnt,user_problemID_acc,user_tag_sum,user_tag_cnt,user_tag_acc,itemID_sum,itemID_cnt,itemID_acc,testID_sum,testID_cnt,testID_acc,testCode_sum,testCode_cnt,testCode_acc,testNum_sum,testNum_cnt,testNum_acc,problemID_sum,problemID_cnt,problemID_acc,tag_sum,tag_cnt,tag_acc,itemID_high_freq,testID_high_freq,testCode_high_freq,testNum_high_freq,problemID_high_freq,tag_high_freq,user_past_solved,relative_correct_rate,is_correct_before1,correct_rate_before1,relative_correct_rate_before1,is_correct_before2,correct_rate_before2,relative_correct_rate_before2,is_correct_before3,correct_rate_before3,relative_correct_rate_before3,is_correct_before4,correct_rate_before4,relative_correct_rate_before4,is_correct_before5,correct_rate_before5,relative_correct_rate_before5,theta,beta
0,0,A060001001,1,7224,060001,6,1,1,0.000000,7,2020,1,3,24,0,1,1,0.000000,0.0,0.0,0.000000,0.00,0.0,38.262057,0.079794,18.804632,66.971222,37.934893,3.989678,13.583961,0,0.081091,0.000000,0,0,0.000000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,0,0.0,0,0,0.00,246,250,0.984000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,310634,414350,0.749690,718,750,0.957333,0,0,1,1,1,0,0,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.342176
1,0,A060001002,1,7225,060001,6,1,2,0.083333,7,2020,1,3,24,0,1,1,0.000000,3.0,1.5,0.000000,0.00,0.0,38.262057,13.660000,18.804632,66.971222,37.934893,73.232491,28.919573,0,11.363636,83.125000,1,1,1.000000,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,0,0.0,0,0,0.00,242,250,0.968000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,297842,413500,0.720295,3439,3750,0.917067,0,0,1,1,1,1,0,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.003042
2,0,A060001003,1,7225,060001,6,1,3,0.166667,7,2020,1,3,24,0,1,1,0.000000,8.0,5.5,3.666667,0.00,0.0,38.262057,26.112000,18.804632,66.971222,37.934893,73.933021,28.919573,0,24.637555,42.190476,2,2,1.000000,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,0,0,0.0,1,1,1.00,229,250,0.916000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,285143,414250,0.688336,3439,3750,0.917067,0,0,1,1,1,1,0,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-2.298332
3,0,A060001004,1,7225,060001,6,1,4,0.250000,7,2020,1,3,24,0,1,1,0.000000,7.0,7.5,6.000000,4.50,0.0,38.262057,19.180000,18.804632,66.971222,37.934893,70.596414,28.919573,0,19.572016,5.571429,3,3,1.000000,0,0,0.0,3,3,1.00,3,3,1.00,3,3,1.00,0,0,0.0,2,2,1.00,243,250,0.972000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,273382,412050,0.663468,3439,3750,0.917067,0,0,1,1,1,1,0,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.064188
4,0,A060001005,1,7225,060001,6,1,5,0.333333,7,2020,1,3,24,0,1,1,0.000000,7.0,7.0,7.333333,6.25,5.0,38.262057,18.076000,18.804632,66.971222,37.934893,69.928460,28.919573,0,18.569620,9.076923,4,4,1.000000,0,0,0.0,4,4,1.00,4,4,1.00,4,4,1.00,0,0,0.0,3,3,1.00,237,250,0.948000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,240452,401900,0.598288,3439,3750,0.917067,0,0,1,1,1,1,0,0.052000,1,0.972000,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.0,1.0,0.578202,-2.647697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,0,438,030071,3,71,5,0.333333,5,2020,2,6,5,6,4,1,0.000000,24.0,29.0,32.666667,55.00,44.0,38.888889,85.426428,54.101852,60.348466,50.247548,69.928460,54.958278,1,110.552239,65.144147,1,4,0.250000,0,0,0.0,1,4,0.25,1,4,0.25,1,4,0.25,0,0,0.0,1,4,0.25,134,300,0.446667,999,1500,0.666000,212941,303450,0.701733,9783,15250,0.641508,240452,401900,0.598288,3127,4500,0.694889,1,0,1,1,1,1,0,-0.446667,0,0.593333,-0.593333,1,0.843333,0.156667,0,0.870000,-0.870000,0,0.576667,-0.576667,1,1.0,1.0,0.152849,1.028286
2526696,7441,A040165001,1,8836,040165,4,165,1,0.000000,4,2020,3,8,21,1,4,1,110.539963,0.0,12.0,19.333333,24.50,44.0,38.888889,0.706391,48.476262,60.234290,59.901487,3.989678,46.705633,0,0.144703,1.719530,1,5,0.200000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,1,0.0,0,0,0.00,193,300,0.643333,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,310634,414350,0.749690,2410,3450,0.698551,1,0,1,0,1,1,0,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.152849,0.411999
2526697,7441,A040165002,1,8836,040165,4,165,2,0.083333,4,2020,3,8,21,1,4,1,110.539963,11.0,5.5,11.666667,17.25,21.8,38.888889,46.220000,48.476262,60.234290,59.901487,73.232491,46.705633,1,43.744792,50.620370,2,6,0.333333,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,1,0.0,1,1,1.00,192,300,0.640000,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,297842,413500,0.720295,2410,3450,0.698551,1,0,1,0,1,1,0,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.152849,-1.192515
2526698,7441,A040165003,1,8836,040165,4,165,3,0.166667,4,2020,3,8,21,1,4,1,110.539963,46.0,28.5,19.000000,20.25,23.0,38.888889,34.720000,48.476262,60.234290,59.901487,73.933021,46.705633,1,35.144068,33.156250,3,7,0.428571,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,1,1,1.0,2,2,1.00,236,300,0.786667,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,285143,414250,0.688336,2410,3450,0.698551,1,0,1,0,1,1,0,0.213333,1,0.640000,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.152849,-1.313112


In [9]:
final_data.to_parquet('/data/ephemeral/data/Feature_Selection.parquet')