In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
plt.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

def concat_and_export(train_fe, test_fe):
    
    train_fe['kind']='train'
    test_fe['kind']='test'
    
    df = pd.concat([train_fe,test_fe])
    data_dir = '/opt/ml/input/data' # 경로
    write_path=f'{data_dir}/after_fe_train_test.pkl'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
def export(df):
    data_dir = '/opt/ml/input/data' # 경로
    write_path=f'{data_dir}/after_fe_train_test.pkl'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
path='../../data/'
train = pd.read_csv(f"{path}/train_data.csv")
test = pd.read_csv(f"{path}/test_data.csv")
sub = pd.read_csv(f"{path}/sample_submission.csv")

day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}

def feature_engineering(df):
    df2 = df.copy()
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df2.sort_values(by=['userID','Timestamp'], inplace=True)
    
    # 유저별 문제푼 시간, solvesec_3600, time_category
    df2['Timestamp2'] = pd.to_datetime(df2.Timestamp)
    df2['solvetime'] = df2.groupby('userID')['Timestamp2'].diff().fillna(pd.Timedelta(seconds=0))
    df2['solvesec'] = df2.solvetime.map(lambda x : x.total_seconds())
    df2['solvesec_3600'] = df2.solvesec
    df2.loc[df2.solvesec>=3600,'solvesec_3600']=3600

    df2['time_category'] = ''
    tc = [0,5,7,10,60,600,1200,2400,3600]
    df2.loc[(df2.solvesec==0), 'time_category'] = "0 - [0,0]"
    for i in range(len(tc)-1):
        s,e = tc[i],tc[i+1]
        df2.loc[(df2.solvesec>s) & (df2.solvesec<=e),'time_category']=f"{i+1} - ({s}, {e}]"
    df2.loc[(df2.solvesec>=tc[-1]),'time_category'] = f"{i+2} - ({e}, )"
    timecat2idx={k:v for v,k in enumerate(sorted(df2.time_category.unique()))}
    df2['time_category'] = df2.time_category.map(timecat2idx)
    
    
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
    df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']
    df2['month'] = pd.to_datetime(df2.Timestamp).dt.month
    df2['day'] = pd.to_datetime(df2.Timestamp).dt.day
    df2['hour'] = pd.to_datetime(df2.Timestamp).dt.hour
    df2['dayname'] = pd.to_datetime(df2.Timestamp).dt.day_name().map(day_dict)
    df2['big_category'] = df2.testId.map(lambda x:x[2]).astype(int)
    df2['problem_num'] = df2.assessmentItemID.map(lambda x: int(x[-3:]))
    df2['mid_category'] = df2.testId.map(lambda x: int(x[-3:]))

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df2.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")
    
    return df2

train_fe = feature_engineering(train)
test_fe = feature_engineering(test)
train_fe.shape, test_fe.shape

((2266586, 22), (260114, 22))

In [29]:
train_fe['kind']='train'
test_fe['kind']='test'
df = pd.concat([train_fe,test_fe])

In [42]:
df[df.userID==7331]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_sum,tag_mean,tag_std,tag_sum,kind,Timestamp2,solvetime,solvesec,solvesec_3600,time_category
2524767,7331,A060130001,A060000130,1,2020-10-05 11:33:34,714,,0,,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:33:34,0 days 00:00:00,0.0,0.0,0
2524768,7331,A060130002,A060000130,0,2020-10-05 11:35:06,714,1.0,1,1.0,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:35:06,0 days 00:01:32,92.0,92.0,5
2524769,7331,A060130003,A060000130,1,2020-10-05 11:38:51,714,1.0,2,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:38:51,0 days 00:03:45,225.0,225.0,5
2524770,7331,A060130004,A060000130,0,2020-10-05 11:42:05,714,2.0,3,0.666667,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:42:05,0 days 00:03:14,194.0,194.0,5
2524771,7331,A060130005,A060000130,0,2020-10-05 11:53:15,714,2.0,4,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:53:15,0 days 00:11:10,670.0,670.0,6
2524772,7331,A060130006,A060000130,1,2020-10-05 12:02:13,714,2.0,5,0.4,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:02:13,0 days 00:08:58,538.0,538.0,5
2524773,7331,A060130007,A060000130,1,2020-10-05 12:13:36,714,3.0,6,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:13:36,0 days 00:11:23,683.0,683.0,6
2524774,7331,A020118001,A020000118,1,2020-10-25 03:02:10,8014,4.0,7,0.571429,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:10,19 days 14:48:34,1694914.0,3600.0,9
2524775,7331,A020118002,A020000118,1,2020-10-25 03:02:18,8014,5.0,8,0.625,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:18,0 days 00:00:08,8.0,8.0,3
2524776,7331,A020118003,A020000118,0,2020-10-25 03:03:31,8014,6.0,9,0.666667,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:03:31,0 days 00:01:13,73.0,73.0,5


## 전체 데이터

### 문제푼시간 초단위 측정 및 내보내기

In [30]:
df = df.sort_values(['userID','Timestamp'])

df.reset_index(drop=True,inplace=True)

df['Timestamp2'] = pd.to_datetime(df.Timestamp)
df['solvetime'] = df.groupby('userID')['Timestamp2'].diff().fillna(pd.Timedelta(seconds=0))
df['solvesec'] = df.solvetime.map(lambda x : x.total_seconds())

df.solvesec.describe()

count    2.526700e+06
mean     5.263962e+04
std      4.053399e+05
min      0.000000e+00
25%      1.300000e+01
50%      3.700000e+01
75%      1.350000e+02
max      2.560230e+07
Name: solvesec, dtype: float64

In [40]:
df[df.userID==7331]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_sum,tag_mean,tag_std,tag_sum,kind,Timestamp2,solvetime,solvesec,solvesec_3600,time_category
2524767,7331,A060130001,A060000130,1,2020-10-05 11:33:34,714,,0,,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:33:34,0 days 00:00:00,0.0,0.0,0
2524768,7331,A060130002,A060000130,0,2020-10-05 11:35:06,714,1.0,1,1.0,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:35:06,0 days 00:01:32,92.0,92.0,5
2524769,7331,A060130003,A060000130,1,2020-10-05 11:38:51,714,1.0,2,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:38:51,0 days 00:03:45,225.0,225.0,5
2524770,7331,A060130004,A060000130,0,2020-10-05 11:42:05,714,2.0,3,0.666667,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:42:05,0 days 00:03:14,194.0,194.0,5
2524771,7331,A060130005,A060000130,0,2020-10-05 11:53:15,714,2.0,4,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:53:15,0 days 00:11:10,670.0,670.0,6
2524772,7331,A060130006,A060000130,1,2020-10-05 12:02:13,714,2.0,5,0.4,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:02:13,0 days 00:08:58,538.0,538.0,5
2524773,7331,A060130007,A060000130,1,2020-10-05 12:13:36,714,3.0,6,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:13:36,0 days 00:11:23,683.0,683.0,6
2524774,7331,A020118001,A020000118,1,2020-10-25 03:02:10,8014,4.0,7,0.571429,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:10,19 days 14:48:34,1694914.0,3600.0,9
2524775,7331,A020118002,A020000118,1,2020-10-25 03:02:18,8014,5.0,8,0.625,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:18,0 days 00:00:08,8.0,8.0,3
2524776,7331,A020118003,A020000118,0,2020-10-25 03:03:31,8014,6.0,9,0.666667,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:03:31,0 days 00:01:13,73.0,73.0,5


In [31]:
df['solvesec_3600'] = df.solvesec
df.loc[df.solvesec>=3600,'solvesec_3600']=3600

In [32]:
# 문제푼 시간 초단위 측정 내보내기
df2 = df.drop(['Timestamp2','solvetime'],axis=1)
export(df2)

Write: /opt/ml/input/data/after_fe_train_test.pkl


### 4800초 이상 변환

In [7]:
df[df.solvesec>=4800].shape

(357230, 26)

In [8]:
df.loc[df.solvesec>=4800,'solvesec']=4800

In [9]:
# 문제푼 시간 초단위 측정 내보내기
df2 = df.drop(['Timestamp2','solvetime'],axis=1)
export(df2)

Write: /opt/ml/input/data/after_fe_train_test.pkl


## 3600초 이상 변환

In [10]:
df.loc[df.solvesec>=3600,'solvesec']=3600

In [11]:
# 문제푼 시간 초단위 측정 내보내기
df2 = df.drop(['Timestamp2','solvetime'],axis=1)
export(df2)

Write: /opt/ml/input/data/after_fe_train_test.pkl


In [238]:
df.solvesec.describe()

count    2.526700e+06
mean     5.912752e+02
std      1.252786e+03
min      0.000000e+00
25%      1.300000e+01
50%      3.700000e+01
75%      1.350000e+02
max      3.600000e+03
Name: solvesec, dtype: float64

### time category 나누기

In [33]:
df['time_category'] = ''

tc = [0,5,7,10,60,600,1200,2400,3600]
df.loc[(df.solvesec==0), 'time_category'] = "0 - [0,0]"
print(0)
for i in range(len(tc)-1):
    s,e = tc[i],tc[i+1]
    df.loc[(df.solvesec>s) & (df.solvesec<=e),'time_category']=f"{i+1} - ({s}, {e}]"
    print(s,e)
df.loc[(df.solvesec>=tc[-1]),'time_category'] = f"{i+2} - ({e}, )"
print(e)

0
0 5
5 7
7 10
10 60
60 600
600 1200
1200 2400
2400 3600
3600


In [34]:
time_grp = df.groupby('time_category')['answerCode'].agg(['mean','count'])
time_grp['C'] = time_grp.index.str.extract(r'(\d+)').astype(int)[0].tolist()
time_grp.sort_values('C')[['mean','count']]

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - [0,0]",0.57639,15722
"1 - (0, 5]",0.246009,364747
"2 - (5, 7]",0.491269,69176
"3 - (7, 10]",0.664348,102204
"4 - (10, 60]",0.751549,1008903
"5 - (60, 600]",0.702995,552226
"6 - (600, 1200]",0.682187,27384
"7 - (1200, 2400]",0.69731,16763
"8 - (2400, 3600]",0.716402,7560
"9 - (3600, )",0.745632,362015


In [35]:
timecat2idx={k:v for v,k in enumerate(sorted(df.time_category.unique()))}
df['time_category'] = df.time_category.map(timecat2idx)
df.time_category.value_counts()

4    1008903
5     552226
1     364747
9     362015
3     102204
2      69176
6      27384
7      16763
0      15722
8       7560
Name: time_category, dtype: int64

In [39]:
df[:10]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_sum,tag_mean,tag_std,tag_sum,kind,Timestamp2,solvetime,solvesec,solvesec_3600,time_category
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,1268,0.955022,0.20741,637,train,2020-03-24 00:17:11,0 days 00:00:00,0.0,0.0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3,...,1268,0.913187,0.281603,3040,train,2020-03-24 00:17:14,0 days 00:00:03,3.0,3.0,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,3,...,1268,0.913187,0.281603,3040,train,2020-03-24 00:17:22,0 days 00:00:08,8.0,8.0,3
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,3,...,1268,0.913187,0.281603,3040,train,2020-03-24 00:17:29,0 days 00:00:07,7.0,7.0,2
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,3,...,1268,0.913187,0.281603,3040,train,2020-03-24 00:17:36,0 days 00:00:07,7.0,7.0,2
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,5.0,5,1.0,3,...,1268,0.913187,0.281603,3040,train,2020-03-24 00:17:47,0 days 00:00:11,11.0,11.0,4
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,6.0,6,1.0,3,...,1223,0.799552,0.40038,3570,train,2020-03-26 05:52:03,2 days 05:34:16,192856.0,3600.0,9
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226,6.0,7,0.857143,3,...,1223,0.799552,0.40038,3570,train,2020-03-26 05:52:10,0 days 00:00:07,7.0,7.0,2
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,7.0,8,0.875,3,...,1223,0.799552,0.40038,3570,train,2020-03-26 05:53:14,0 days 00:01:04,64.0,64.0,5
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226,8.0,9,0.888889,3,...,1223,0.799552,0.40038,3570,train,2020-03-26 05:53:29,0 days 00:00:15,15.0,15.0,4


In [37]:
df2 = df.drop(['Timestamp2','solvetime'],axis=1)

export(df2)

## 훈련 데이터

### 문제푼시간 초단위 측정

In [19]:
time_df = train.sort_values(['userID','Timestamp'])[['userID','Timestamp']]

time_df.reset_index(drop=True,inplace=True)

time_df['Timestamp'] = pd.to_datetime(time_df.Timestamp)

time_df['solvetime'] = time_df.groupby('userID')['Timestamp'].diff().fillna(pd.Timedelta(seconds=0))

time_df['solvetime_sec'] = time_df.solvetime.map(lambda x : x.total_seconds())

time_df.solvetime_sec.describe()

count    2.266586e+06
mean     5.268592e+04
std      4.046780e+05
min      0.000000e+00
25%      1.300000e+01
50%      3.700000e+01
75%      1.350000e+02
max      2.560230e+07
Name: solvetime_sec, dtype: float64

In [20]:
time_df[:3]

Unnamed: 0,userID,Timestamp,solvetime,solvetime_sec
0,0,2020-03-24 00:17:11,0 days 00:00:00,0.0
1,0,2020-03-24 00:17:14,0 days 00:00:03,3.0
2,0,2020-03-24 00:17:22,0 days 00:00:08,8.0


### 문제 푼 시간 time category 나누기

In [195]:
time_df['time_category'] = ''

tc = [0,3,5,7,10,30,60,300,600,1200,2400,3600,4800,9600]
time_df.loc[(time_df.solvetime_sec==0), 'time_category'] = "0 - [0,0]"
print(0)
for i in range(len(tc)-1):
    s,e = tc[i],tc[i+1]
    time_df.loc[(time_df.solvetime_sec>s) & (time_df.solvetime_sec<=e),'time_category']=f"{i+1} - ({s}, {e}]"
    print(s,e)
time_df.loc[(time_df.solvetime_sec>=tc[-1]),'time_category'] = f"{i+2} - ({e}, )"
print(e)

# time_df['time_category'] = time_df['time_category'].astype("category")

train.Timestamp = pd.to_datetime(train.Timestamp)
train_time = train.merge(time_df, on=['userID','Timestamp'])

0
0 3
3 5
5 7
7 10
10 30
30 60
60 300
300 600
600 1200
1200 2400
2400 3600
3600 4800
4800 9600
9600


In [23]:
df[df.answerCode==-1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_std,test_sum,tag_mean,tag_std,tag_sum,kind,Timestamp2,solvetime,solvesec,time_category
2989,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,717.0,1035,0.692754,10,...,0.490209,90,0.542662,0.505845,159,test,2020-10-26 13:13:57,0 days 00:00:46,46.0,4
3660,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,465.0,670,0.694030,12,...,0.539601,77,0.565693,0.552442,155,test,2020-12-27 02:47:54,0 days 00:00:23,23.0,4
10860,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,915.0,1316,0.695289,12,...,0.501291,117,0.446753,0.518307,172,test,2020-12-27 04:35:09,0 days 00:00:08,8.0,3
15278,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,10,...,0.530957,30,0.514286,0.531415,36,test,2020-10-30 05:48:37,0 days 00:01:15,75.0,5
23531,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,293.0,386,0.759067,10,...,0.479048,133,0.602767,0.493836,305,test,2020-10-23 11:44:18,0 days 00:00:17,17.0,4
26895,29,A020190005,A020000190,-1,2020-10-22 04:38:45,8097,723.0,853,0.847597,10,...,0.503939,41,0.642424,0.517399,106,test,2020-10-22 04:38:45,0 days 00:00:30,30.0,4
39887,45,A040136005,A040000136,-1,2020-10-23 08:24:19,2107,746.0,1083,0.688827,10,...,0.512637,76,0.647059,0.489739,352,test,2020-10-23 08:24:19,0 days 00:00:04,4.0,1
47628,53,A040140005,A040000140,-1,2020-10-26 09:13:20,2110,362.0,692,0.523121,10,...,0.537362,102,0.665049,0.502492,274,test,2020-10-26 09:13:20,0 days 00:00:20,20.0,4
51927,58,A070159007,A070000159,-1,2020-12-24 21:09:29,9122,295.0,810,0.364198,12,...,0.514896,143,0.343478,0.544385,79,test,2020-12-24 21:09:29,0 days 00:00:02,2.0,1
57352,64,A070146008,A070000146,-1,2020-12-29 04:30:22,9080,1058.0,1269,0.833727,12,...,0.539601,77,0.565693,0.552442,155,test,2020-12-29 04:30:22,0 days 00:00:02,2.0,1


In [196]:
time_grp = train_time.groupby('time_category')['answerCode'].agg(['mean','count'])
time_grp['C'] = time_grp.index.str.extract(r'(\d+)').astype(int)[0].tolist()
time_grp.sort_values('C')[['mean','count']]

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - [0,0]",0.53268,21833
"1 - (0, 3]",0.233128,241953
"2 - (3, 5]",0.282027,87839
"3 - (5, 7]",0.490113,62404
"4 - (7, 10]",0.663017,91619
"5 - (10, 30]",0.759592,531122
"6 - (30, 60]",0.740142,376013
"7 - (60, 300]",0.705638,444813
"8 - (300, 600]",0.680505,52223
"9 - (600, 1200]",0.681578,24769


In [193]:
time_grp = train_time.groupby('time_category')['answerCode'].agg(['mean','count'])
time_grp['C'] = time_grp.index.str.extract(r'(\d+)').astype(int)[0].tolist()
time_grp.sort_values('C')[['mean','count']]

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - 0, 3",0.269412,187341
"1 - 3, 5",0.237244,126916
"2 - 5, 7",0.378013,69331
"3 - 7, 10",0.610907,91058
"4 - 10, 30",0.756369,542242
"5 - 30, 60",0.741391,387770
"6 - 60, 300",0.705877,452573
"7 - 300, 600",0.680625,52512
"8 - 600, 1200",0.681533,24822
"9 - 1200, 2400",0.69751,15022


In [159]:
train_time.groupby('time_category')['answerCode'].agg(['mean','count'])

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - 0, 3",0.269412,187341
"1 - 3, 5",0.237244,126916
"10 - 2400, 3600",0.718965,6839
"11 - 3600, 4800",0.730653,4303
"12 - 4800, 9600",0.743859,10096
13 - 9600~,0.745662,310889
"2 - 5, 7",0.378013,69331
"3 - 7, 10",0.610907,91058
"4 - 10, 30",0.756369,542242
"5 - 30, 60",0.741391,387770


In [142]:
train_time.groupby('time_category')['answerCode'].agg(['mean','count'])

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - 0, 5",0.256421,314257
"1 - 5, 30",0.700184,702631
10 - 9600~,0.745662,310889
"2 - 30, 60",0.741391,387770
"3 - 60, 300",0.705877,452573
"4 - 300, 600",0.680625,52512
"5 - 600, 1200",0.681533,24822
"6 - 1200, 2400",0.69751,15022
"7 - 2400, 3600",0.718965,6839
"8 - 3600, 4800",0.730653,4303


In [145]:
train_time[train_time.solvetime_sec==0].answerCode.mean()

0.5326798882425686

In [140]:
train_time.groupby('time_category')['answerCode'].agg(['mean','count'])

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - 0, 5",0.256421,314257
"1 - 5, 30",0.700184,702631
"2 - 30, 60",0.741391,387770
"3 - 60, 300",0.705877,452573
"4 - 300, 600",0.680625,52512
"5 - 600, 1200",0.681533,24822
"6 - 1200, 2400",0.69751,15022
7 - 2400~,0.744863,332127
