# Imports

In [573]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read Data

In [574]:
df_train = pd.read_csv('X_train.csv', encoding='cp949')
df_test = pd.read_csv('X_test.csv', encoding='cp949')
y_train = pd.read_csv('y_train.csv').gender
IDtest = df_test.custid.unique()

In [575]:
df_train.head()

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0


#### 판매데이터 구조
- custid: 고객ID
- sales_date: 판매일자 (문자)
- str_nm: 판매시간 (4자리 숫자)
- goodcd: 상품코드
- brd_nm: 브랜드명
- corner_nm: 코너명
- pc_nm: PC명
- part_nm: 파트명
- team_nm: 판매팀명
- buyer_nm: Buying MD
- import_flg: 수입상품여부
- tot_amt: 판매금액
- dis_amt: 할인금액
- net_amt: 판매금액 - 할인금액
- inst_mon: 할부개월수

<p>*`백화점 영업조직`: 306개 corner < 76개 pc < 29개 part < 3개 team*</p>
<p>*`y_train.csv`에서 target값 0은 남자, 1은 여자를 나타냄*</p>

# Pivoting

## 1. 고객별 구매시간빈도

In [576]:
def g(x):
    if len(str(x))==4:
        return(str(x)[0:2])
    else:
        return(str(x)[0:1])

df_train['sales_hour'] = df_train['sales_time'].apply(g)
df_test['sales_hour'] = df_test['sales_time'].apply(g)

In [577]:
##

df_all = pd.concat([df_train, df_test])

In [578]:
level = 'sales_hour'
df_train[level].nunique()

19

In [579]:
df_train[level].unique()

array(['12', '18', '19', '14', '15', '16', '17', '10', '11', '20', '13',
       '9', '21', '22', '1', '23', '3', '2', '8'], dtype=object)

In [580]:
X_train1 = pd.pivot_table(df_all, index='custid', columns=level,values='tot_amt',
                         aggfunc=np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test1 = pd.pivot_table(df_all, index='custid', columns=level,values='tot_amt',
                         aggfunc=np.size , fill_value=0). \
                         reset_index(). \
                         query('custid  in @IDtest'). \
                         drop(columns=['custid']).values


In [581]:
X_train1 = pd.DataFrame(X_train1)
X_test1 = pd.DataFrame(X_test1)

# 2. 고객별 브랜드빈도

In [582]:
level = 'brd_nm'
df_train[level].nunique()

1882

In [583]:
X_train2 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test2 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train2 = pd.DataFrame(X_train2)
X_test2 = pd.DataFrame(X_test2)

# 3. 고객별  코너방문 빈도

In [584]:
level = 'corner_nm'
df_train[level].nunique()

309

In [585]:
X_train3 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test3 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train3 = pd.DataFrame(X_train3)
X_test3 = pd.DataFrame(X_test3)

# 4. 고객별 PC빈도

In [586]:
level = 'pc_nm'
df_train[level].nunique()

78

In [587]:
X_train4 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test4 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train4 = pd.DataFrame(X_train4)
X_test4 = pd.DataFrame(X_test4)

# 5. 고객별  파트빈도

In [588]:
level = 'part_nm'
df_train[level].nunique()

31

In [589]:
X_train5 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test5 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train5 = pd.DataFrame(X_train5)
X_test5 = pd.DataFrame(X_test5)

# 6. 고객별  판매팀명

In [590]:
level = 'team_nm'
df_train[level].nunique()

5

In [591]:
X_train6 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc=np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test6 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc=np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train6 = pd.DataFrame(X_train6)
X_test6 = pd.DataFrame(X_test6)

# 7. 고객별 Buying MD 빈도

In [592]:
level = 'buyer_nm'
df_train[level].nunique()

35

In [593]:
X_train7 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test7 = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc= np.size , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train7 = pd.DataFrame(X_train7)
X_test7 = pd.DataFrame(X_test7)

# 8. 고객별 총구매액

In [594]:
X_train8 = pd.pivot_table(df_all, index='custid', values='tot_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test8 = pd.pivot_table(df_all, index='custid', values='tot_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train8 = pd.DataFrame(X_train8)
X_test8 = pd.DataFrame(X_test8)

# 9. 고객별 평균 구매액

In [595]:
X_train9 = pd.pivot_table(df_all, index='custid', values='tot_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test9 = pd.pivot_table(df_all, index='custid', values='tot_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train9 = pd.DataFrame(X_train9)
X_test9 = pd.DataFrame(X_test9)

# 10. 고객별 총 할인금액

In [596]:
X_train10 = pd.pivot_table(df_all, index='custid', values='dis_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test10 = pd.pivot_table(df_all, index='custid', values='dis_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train10 = pd.DataFrame(X_train10)
X_test10 = pd.DataFrame(X_test10)

# 11. 고객별 평균 할인금액

In [597]:
X_train11 = pd.pivot_table(df_all, index='custid', values='dis_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test11 = pd.pivot_table(df_all, index='custid', values='dis_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train11 = pd.DataFrame(X_train11)
X_test11 = pd.DataFrame(X_test11)

# 12. 고객별 총 직불금액

In [598]:
X_train12 = pd.pivot_table(df_all, index='custid', values='net_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test12 = pd.pivot_table(df_all, index='custid', values='net_amt',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train12 = pd.DataFrame(X_train12)
X_test12 = pd.DataFrame(X_test12)

# 13. 고객별 평균 직불금액

In [599]:
X_train13 = pd.pivot_table(df_all, index='custid', values='net_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test13 = pd.pivot_table(df_all, index='custid', values='net_amt',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train13 = pd.DataFrame(X_train13)
X_test13 = pd.DataFrame(X_test13)

# 14. 고객별 총 할부개월 수

In [600]:
X_train14 = pd.pivot_table(df_all, index='custid', values='inst_mon',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test14 = pd.pivot_table(df_all, index='custid', values='inst_mon',
                         aggfunc=np.sum , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train14 = pd.DataFrame(X_train14)
X_test14 = pd.DataFrame(X_test14)

# 15. 고객별 평균 할부개월 수

In [601]:
X_train15 = pd.pivot_table(df_all, index='custid', values='inst_mon',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values

X_test15 = pd.pivot_table(df_all, index='custid', values='inst_mon',
                         aggfunc=np.mean , fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

X_train15 = pd.DataFrame(X_train15)
X_test15 = pd.DataFrame(X_test15)

# Merging

In [602]:
X_train = pd.concat([X_train1,X_train2],axis=1)
X_train = pd.concat([X_train,X_train3],axis=1)
X_train = pd.concat([X_train,X_train4],axis=1)
X_train = pd.concat([X_train,X_train5],axis=1)
X_train = pd.concat([X_train,X_train6],axis=1)
X_train = pd.concat([X_train,X_train7],axis=1)
X_train = pd.concat([X_train,X_train8],axis=1)
X_train = pd.concat([X_train,X_train9],axis=1)
X_train = pd.concat([X_train,X_train10],axis=1)
X_train = pd.concat([X_train,X_train11],axis=1)
X_train = pd.concat([X_train,X_train12],axis=1)
X_train = pd.concat([X_train,X_train13],axis=1)
X_train = pd.concat([X_train,X_train14],axis=1)
X_train = pd.concat([X_train,X_train15],axis=1)


X_test = pd.concat([X_test1,X_test2],axis=1)
X_test = pd.concat([X_test,X_test3],axis=1)
X_test = pd.concat([X_test,X_test4],axis=1)
X_test = pd.concat([X_test,X_test5],axis=1)
X_test = pd.concat([X_test,X_test6],axis=1)
X_test = pd.concat([X_test,X_test7],axis=1)
X_test = pd.concat([X_test,X_test8],axis=1)
X_test = pd.concat([X_test,X_test9],axis=1)
X_test = pd.concat([X_test,X_test10],axis=1)
X_test = pd.concat([X_test,X_test11],axis=1)
X_test = pd.concat([X_test,X_test12],axis=1)
X_test = pd.concat([X_test,X_test13],axis=1)
X_test = pd.concat([X_test,X_test14],axis=1)
X_test = pd.concat([X_test,X_test15],axis=1)

In [603]:
X_train.shape

(30000, 2392)

In [604]:
X_test.shape

(19995, 2392)

In [605]:
X_train.columns = list(range(0,2392))
X_test.columns = list(range(0,2392))

In [606]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2382,2383,2384,2385,2386,2387,2388,2389,2390,2391
0,0,0,0,0,2,0,0,0,0,0,...,0,8,1742000,158363.636364,174200,15836.363636,1567800,142527.272727,31,2.818182
1,0,0,1,3,3,0,1,4,6,3,...,0,3,2772100,106619.230769,56630,2178.076923,2715470,104441.153846,64,2.461538
2,0,0,2,1,0,1,1,0,1,1,...,0,1,3750850,340986.363636,255090,23190.0,3495760,317796.363636,38,3.454545
3,0,0,0,3,3,2,3,0,1,5,...,0,1,2300500,76683.333333,91660,3055.333333,2208840,73628.0,80,2.666667
4,0,0,0,0,0,0,4,0,0,0,...,0,1,1045000,261250.0,21800,5450.0,1023200,255800.0,18,4.5


In [607]:
X_train.shape

(30000, 2392)

# 0517_Merging-Yuna

In [608]:
features = []

## 1. 평균 구매시간

In [609]:
df_all['hour']=df_all['sales_time']//100

In [610]:
f = df_all.groupby('custid')['hour'].agg([('평균구매시간', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,평균구매시간
0,0,17.4
1,1,15.1
2,2,15.1
3,3,15.8
4,4,14.0
5,5,15.5
6,6,15.6
7,7,16.2
8,8,15.6
9,9,15.4


## 2. 퇴근 시간 관련 구매건수

In [611]:
a=pd.pivot_table(df_all, index='custid', columns='hour', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()

In [612]:
aa=a.iloc[:,[1,2,3,4,5,6,7,8,10,11,12,13]].sum(axis=1)
ab=a.iloc[:,[14,15,16,17,18]].sum(axis=1)
ac=a.iloc[:,[1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18]].sum(axis=1)
ap=pd.DataFrame({'custid':a['custid'],'퇴근 전 구매건수':aa,'퇴근 후 구매건수':ab,'총 구매건수':ac})
ap['근무시간 구매건수 비율']=ap['퇴근 전 구매건수']/ap['총 구매건수'].fillna(0)
ap=ap.fillna(0)
ap['퇴근 후 구매건수 비율']=ap['퇴근 후 구매건수']/ap['총 구매건수'].fillna(0)
ap=ap.fillna(0)
f=ap.iloc[:,[0,1,2,4,5]]
features.append(f); f

Unnamed: 0,custid,퇴근 전 구매건수,퇴근 후 구매건수,근무시간 구매건수 비율,퇴근 후 구매건수 비율
0,0,6,5,0.545455,0.454545
1,1,24,1,0.960000,0.040000
2,2,7,3,0.700000,0.300000
3,3,25,2,0.925926,0.074074
4,4,0,0,0.000000,0.000000
5,5,26,6,0.812500,0.187500
6,6,27,2,0.931034,0.068966
7,7,33,1,0.970588,0.029412
8,8,12,1,0.923077,0.076923
9,9,45,12,0.789474,0.210526


## 3. 충동구매 여부

In [613]:
f1 = df_all.groupby('custid')['hour'].agg([('평균구매시간', 'mean')]).reset_index()
f1.iloc[:,1] = f1.iloc[:,1].apply(round, args=(0,))
f2=df_all.groupby(['custid','hour'])['tot_amt'].agg([('구매건수','size')]).reset_index()
f3=pd.merge(f1,f2,on='custid')
f3['rate']=f3['평균구매시간']/f3['hour']

In [614]:
m1=f3.query('0<=rate<=1').groupby('custid')['구매건수'].agg([('평균 구매시간 내 구매건수','size')]).reset_index()
m2=f3.query('1<=rate').groupby('custid')['구매건수'].agg([('평균 구매시간 외 구매건수','size')]).reset_index()
m=pd.merge(m1,m2,on='custid')
m['평균 시간 구매비율']=m['평균 구매시간 내 구매건수']/m['평균 구매시간 외 구매건수']
m['충동구매 여부']=np.where(m['평균 시간 구매비율']<=1,'no','yes')
f = pd.get_dummies(m, columns=['충동구매 여부'])  # This method performs One-hot-encoding
features.append(f); f

Unnamed: 0,custid,평균 구매시간 내 구매건수,평균 구매시간 외 구매건수,평균 시간 구매비율,충동구매 여부_no,충동구매 여부_yes
0,0,2,1,2.000000,0,1
1,1,5,5,1.000000,1,0
2,2,4,4,1.000000,1,0
3,3,4,5,0.800000,1,0
4,4,1,1,1.000000,1,0
5,5,4,5,0.800000,1,0
6,6,4,5,0.800000,1,0
7,7,4,4,1.000000,1,0
8,8,4,4,1.000000,1,0
9,9,5,5,1.000000,1,0


## 4. 충동구매 건수

In [615]:
f=f3.query('1<=rate').groupby('custid')['구매건수'].agg([('충동구매 건수','sum')]).reset_index()
f

Unnamed: 0,custid,충동구매 건수
0,0,2
1,1,12
2,2,5
3,3,12
4,4,4
5,5,15
6,6,19
7,7,17
8,8,12
9,9,29


## 5. 여성용,남성용 제품 중 여성용품 구매비율

In [616]:
a=pd.pivot_table(df_all, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
women_brand = ['여성캐주얼','여성의류파트','여성정장','여성캐쥬얼']
man_bran = ['남성정장스포츠','남성의류']
f['여성용품 구매 비율']=a.loc[:,women_brand].sum(axis=1)/len(women_brand+man_bran)
f=f.iloc[:,[0,2]]
features.append(f); f

Unnamed: 0,custid,여성용품 구매 비율
0,0,0.000000
1,1,0.666667
2,2,0.000000
3,3,0.000000
4,4,0.500000
5,5,0.500000
6,6,0.666667
7,7,0.500000
8,8,1.500000
9,9,1.166667


## 6. 여성용,남성용 제품 중 남성용품 구매비율

In [617]:
a=pd.pivot_table(df_all, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
women_brand = ['여성캐주얼','여성의류파트','여성정장','여성캐쥬얼']
man_bran = ['남성정장스포츠','남성의류']
f['남성용품 구매 비율']=a.loc[:,man_bran].sum(axis=1)/len(women_brand+man_bran)
f=f.iloc[:,[0,2]]
features.append(f); f

Unnamed: 0,custid,남성용품 구매 비율
0,0,0.166667
1,1,0.166667
2,2,0.166667
3,3,0.500000
4,4,0.000000
5,5,1.333333
6,6,0.333333
7,7,0.000000
8,8,0.000000
9,9,0.000000


## 7. 총구매건수

In [618]:
f = df_all.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features.append(f); f

Unnamed: 0,custid,구매건수
0,0,11
1,1,26
2,2,11
3,3,30
4,4,4
5,5,32
6,6,31
7,7,35
8,8,18
9,9,59


## 8. 구매다양성

In [619]:
n = df_all.corner_nm.nunique()
f = df_all.groupby('custid')['brd_nm'].agg([('구매상품다양성', lambda x: len(x.unique()) / n)]).reset_index()
features.append(f); f

Unnamed: 0,custid,구매상품다양성
0,0,0.022654
1,1,0.061489
2,2,0.022654
3,3,0.067961
4,4,0.012945
5,5,0.067961
6,6,0.074434
7,7,0.064725
8,8,0.042071
9,9,0.113269


## 9. 내점일수

In [620]:
df_all['sdate'] = df_all.sales_date.str[:10]
f = df_all.groupby(by = 'custid')['sdate'].agg([('내점일수','nunique')]).reset_index()
features.append(f); f

Unnamed: 0,custid,내점일수
0,0,7
1,1,16
2,2,7
3,3,13
4,4,2
5,5,21
6,6,11
7,7,23
8,8,10
9,9,34


## 10.수입상품 구매비율

In [621]:
x = df_all[df_all['import_flg'] == 1].groupby('custid').size() / df_all.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,수입상품_구매비율
0,0,63.6
1,1,42.3
2,2,9.1
3,3,0.0
4,4,25.0
5,5,18.8
6,6,9.7
7,7,8.6
8,8,11.1
9,9,10.2


## 11. 주말방문 비율

In [622]:
def fw(x):
    k = x.dayofweek
    if k <= 4 :
        return('주중_방문')
    else :
        return('주말_방문')    
    
df = df_all.copy()
df = df.drop_duplicates(['custid','sales_date'])

df['week'] = pd.to_datetime(df.sales_date).apply(fw)
df = pd.pivot_table(df, index='custid', columns='week', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
df['주말방문비율'] = ((df.iloc[:,1] / (df.iloc[:,1]+df.iloc[:,2]))*100).apply(round, args=(1,))
f = df.copy().iloc[:,[0,-1]]
features.append(f); f

week,custid,주말방문비율
0,0,42.9
1,1,50.0
2,2,28.6
3,3,38.5
4,4,50.0
5,5,23.8
6,6,36.4
7,7,26.1
8,8,70.0
9,9,14.7


## 12.계절별 구매건수

In [623]:
def f1(x):
    k = x.month
    if 3 <= k <= 5 :
        return('봄-구매건수')
    elif 6 <= k <= 8 :
        return('여름-구매건수')
    elif 9 <= k <= 11 :    
        return('가을-구매건수')
    else :
        return('겨울-구매건수')    
    
df_all['season'] = pd.to_datetime(df_all.sales_date).apply(f1)
f = pd.pivot_table(df_all, index='custid', columns='season', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

season,custid,가을-구매건수,겨울-구매건수,봄-구매건수,여름-구매건수
0,0,3,3,1,4
1,1,6,1,10,9
2,2,5,3,0,3
3,3,3,6,9,12
4,4,0,0,0,4
5,5,12,1,6,13
6,6,14,2,9,6
7,7,7,12,1,15
8,8,4,2,2,10
9,9,18,13,11,17


## 13. 시간대별 구매건수

In [624]:
def f2(x):
    if 9 <= x <= 12 :
        return('아침_구매건수')
    elif 13 <= x <= 17 :
        return('점심_구매건수')
    else :
        return('저녁_구매건수')  # datatime 필드가 시간 형식에 맞지 않은 값을 갖는 경우 저녁시간으로 처리

df_all['timeslot'] = (df_all.sales_time // 100).apply(f2)
f = pd.pivot_table(df_all, index='custid', columns='timeslot', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

timeslot,custid,아침_구매건수,저녁_구매건수,점심_구매건수
0,0,2,9,0
1,1,7,5,14
2,2,3,4,4
3,3,6,13,11
4,4,0,0,4
5,5,9,12,11
6,6,4,8,19
7,7,0,6,29
8,8,1,3,14
9,9,19,24,16


## 14. 방문 빈도성

In [625]:
df_all['month']=pd.to_datetime(df_all.sales_date).dt.month
a=df_all.groupby('custid')['month'].agg([('가장 최근 구매 달','max')]).reset_index()
b=df_all.groupby('custid')['month'].agg([('가장 예전 구매 달','min')]).reset_index()
c=df_all.groupby('custid')['month'].agg([('구매를 한 달의 수','nunique')]).reset_index()
a['방문빈도성']=round((a['가장 최근 구매 달']-b['가장 예전 구매 달'])/c['구매를 한 달의 수'],1)
f=a.iloc[:,[0,2]]
features.append(f); f

Unnamed: 0,custid,방문빈도성
0,0,1.6
1,1,1.4
2,2,1.8
3,3,1.2
4,4,0.0
5,5,1.0
6,6,1.6
7,7,1.1
8,8,1.0
9,9,1.0


## 15. 구매상품 다양성

In [626]:
f4=df_all.groupby('custid')['brd_nm'].agg([('구매상품 다양성','nunique')]).reset_index()
f4['구매상품 다양성']=f4['구매상품 다양성']/1191
f=round(f4,2)
features.append(f); f

Unnamed: 0,custid,구매상품 다양성
0,0,0.01
1,1,0.02
2,2,0.01
3,3,0.02
4,4,0.00
5,5,0.02
6,6,0.02
7,7,0.02
8,8,0.01
9,9,0.03


## 16. 최대 할부개월 수 

In [627]:
f=df_all.groupby('custid')['inst_mon'].agg([('최대할부개월수', 'max')]).reset_index()
features.append(f); f

Unnamed: 0,custid,최대할부개월수
0,0,3
1,1,3
2,2,12
3,3,5
4,4,10
5,5,3
6,6,3
7,7,3
8,8,3
9,9,1


## 17.브랜드 편중도 = 해당 고객이 구매한 브랜드 수 / 전체 브랜드 수

In [628]:
a['브랜드 편중도']=df_all.groupby('custid')['brd_nm'].agg([('구매 브랜드 수','nunique')]).reset_index().iloc[:,1]/df_all['brd_nm'].nunique()
f=a.iloc[:,[0,3]]
features.append(f); f

Unnamed: 0,custid,브랜드 편중도
0,0,0.003673
1,1,0.009969
2,2,0.003673
3,3,0.011018
4,4,0.002099
5,5,0.011018
6,6,0.012067
7,7,0.010493
8,8,0.006821
9,9,0.018363


## 18. 내점당 평균 구매건수

In [629]:
f = df_all.groupby('custid')['sales_date'].agg([('하','value_counts')]).reset_index()
f= f.groupby('custid')['하'].agg([('내점당 평균 구매건수','mean')]).reset_index()
f['내점당 평균 구매건수'] = round(f['내점당 평균 구매건수'],2)
features.append(f); f

Unnamed: 0,custid,내점당 평균 구매건수
0,0,1.57
1,1,1.62
2,2,1.57
3,3,2.31
4,4,2.00
5,5,1.52
6,6,2.82
7,7,1.52
8,8,1.80
9,9,1.74


## 19. 새로운 시간대별 구매건수

In [630]:
def f2(x):
    if 901 <= x < 1200 :
        return('12시 이전_구매건수')
    elif 1200 <= x < 1400 :
        return('12~2시_구매건수')
    elif 1400 <= x < 1600 :
        return('2~4시_구매건수')
    elif 1600 <= x < 1800 :
        return('4~6시_구매건수')
    else :
        return('6시이후_구매건수')  

df_all['timeslot2'] = df_all.sales_time.apply(f2)

f = pd.pivot_table(df_all, index='custid', columns='timeslot2', values='tot_amt',
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

timeslot2,custid,12~2시_구매건수,12시 이전_구매건수,2~4시_구매건수,4~6시_구매건수,6시이후_구매건수
0,0,2,0,0,0,9
1,1,3,4,5,9,5
2,2,1,3,1,2,4
3,3,5,3,3,6,13
4,4,0,0,4,0,0
5,5,3,8,1,8,12
6,6,8,0,5,10,8
7,7,3,0,9,17,6
8,8,1,0,9,5,3
9,9,21,2,6,6,24


## 20. 총 방문기간

In [631]:
df_all['sdate'] = df_all.sales_date.str[:10]
def visit_period(x) :
    max_date = pd.to_datetime(x.max()).to_julian_date()
    min_date = pd.to_datetime(x.min()).to_julian_date()
    return max_date - min_date

f = df_all.groupby('custid')['sdate'].agg([('총방문기간',visit_period)]).reset_index()
features.append(f); f

Unnamed: 0,custid,총방문기간
0,0,288.0
1,1,350.0
2,2,129.0
3,3,331.0
4,4,3.0
5,5,322.0
6,6,247.0
7,7,324.0
8,8,299.0
9,9,327.0


## 21. 평균 방문주기

In [632]:
#평균 방문주기
df_all['sdate'] = df_all.sales_date.str[:10]
def visit_cycle(x) :
    max_date = pd.to_datetime(x.max()).to_julian_date()
    min_date = pd.to_datetime(x.min()).to_julian_date()
    visit_count = x.count()
    return (max_date - min_date)/visit_count

f = df_all.groupby('custid')['sdate'].agg([('평균방문주기',visit_cycle)]).reset_index()
features.append(f); f

Unnamed: 0,custid,평균방문주기
0,0,26.181818
1,1,13.461538
2,2,11.727273
3,3,11.033333
4,4,0.750000
5,5,10.062500
6,6,7.967742
7,7,9.257143
8,8,16.611111
9,9,5.542373


# 22 평균 구매가격

In [633]:
f = df_all.groupby('custid')['tot_amt'].agg([('평균구매가격', 'mean')]).reset_index()
features.append(f); f

Unnamed: 0,custid,평균구매가격
0,0,158363.636364
1,1,106619.230769
2,2,340986.363636
3,3,76683.333333
4,4,261250.000000
5,5,157929.968750
6,6,122097.709677
7,7,34948.057143
8,8,70416.666667
9,9,84010.508475


# 23 평균 할부개월수

In [634]:
f = df_all.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,평균할부개월수
0,0,2.8
1,1,2.5
2,2,3.5
3,3,2.7
4,4,4.5
5,5,1.9
6,6,1.8
7,7,1.4
8,8,2.1
9,9,1.0


# 24 구매상품 다양성 : 구매한 서로다른 브랜드 수

In [635]:
n = df_all.part_nm.nunique()
f = df_all.groupby('custid')['brd_nm'].agg([('구매상품다양성', lambda x: len(x.unique()) / n)]).reset_index()
features.append(f); f

Unnamed: 0,custid,구매상품다양성
0,0,0.225806
1,1,0.612903
2,2,0.225806
3,3,0.677419
4,4,0.129032
5,5,0.677419
6,6,0.741935
7,7,0.645161
8,8,0.419355
9,9,1.129032


# 25 이용지점 다양성 : 

In [636]:
n = 4
f = df_all.groupby('custid')['str_nm'].agg([('매장이용다양성', lambda x: len(x.unique()) / n)]).reset_index()
features.append(f); f

Unnamed: 0,custid,매장이용다양성
0,0,0.50
1,1,0.50
2,2,0.50
3,3,0.50
4,4,0.25
5,5,1.00
6,6,0.50
7,7,0.50
8,8,0.25
9,9,0.25


# 26 요일별 구매건수 - 요일을 새로운 기준으로 구분해봄

In [637]:
def f2(x):
    k = x.dayofweek
    if k <= 2 :
        return('월화수_구매건수')
    elif 3 <= k < 5 :
        return('목금_구매건수')
    elif 5 <= k < 6 :
        return('토_구매건수')
    else :
        return('일_구매건수')    
    
df_all['요일2'] = pd.to_datetime(df_all.sales_date).apply(f2)
f = pd.pivot_table(df_all, index='custid', columns='요일2', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

요일2,custid,목금_구매건수,월화수_구매건수,일_구매건수,토_구매건수
0,0,2,2,5,2
1,1,8,5,4,9
2,2,2,5,4,0
3,3,13,6,6,5
4,4,3,0,1,0
5,5,23,4,3,2
6,6,6,10,2,13
7,7,7,18,4,6
8,8,1,3,10,4
9,9,20,19,9,11


# 27 계절별 구매건수: 2,3,4월/ 5,6,7월 / 8,9,10월 / 11,12,1월 - 일반적인 분기보다 1개월 당김

In [638]:
def f1(x):
    k = x.month
    if 2 <= k <= 4 :
        return('234월_구매건수')
    elif 5 <= k <= 7 :
        return('567월_구매건수')
    elif 8 <= k <= 10 :
        return('8910월_구매건수')
    else :
        return('11121월_구매건수')    
    
df_all['season2'] = pd.to_datetime(df_all.sales_date).apply(f1)
f = pd.pivot_table(df_all, index='custid', columns='season2', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

season2,custid,11121월_구매건수,234월_구매건수,567월_구매건수,8910월_구매건수
0,0,3,1,2,5
1,1,5,5,14,2
2,2,4,0,0,7
3,3,6,7,10,7
4,4,0,0,4,0
5,5,3,4,11,14
6,6,12,0,9,10
7,7,11,3,8,13
8,8,3,2,9,4
9,9,14,11,12,22


# 28 평균할인금액

In [639]:
f = df_all.groupby('custid')['dis_amt'].agg([('평균할인금액', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,평균할인금액
0,0,15836.4
1,1,2178.1
2,2,23190.0
3,3,3055.3
4,4,5450.0
5,5,11289.4
6,6,10164.5
7,7,626.6
8,8,1390.0
9,9,3624.6


# 29 실제 구매금액

In [640]:
f = df_all.groupby('custid')['net_amt'].agg([('실제구매금액', 'sum')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,실제구매금액
0,0,1567800
1,1,2715470
2,2,3495760
3,3,2208840
4,4,1023200
5,5,4692499
6,6,3469929
7,7,1201252
8,8,1242480
9,9,4742770


# 30  실제구매금액 평균

In [641]:
f = df_all.groupby('custid')['net_amt'].agg([('실제구매금액평균', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f

Unnamed: 0,custid,실제구매금액평균
0,0,142527.3
1,1,104441.2
2,2,317796.4
3,3,73628.0
4,4,255800.0
5,5,146640.6
6,6,111933.2
7,7,34321.5
8,8,69026.7
9,9,80385.9


# 31 구입 지점 빈도값 도출

In [642]:
f = pd.pivot_table(df_all, index='custid', columns='str_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

str_nm,custid,무역점,본점,신촌점,천호점
0,0,5,6,0,0
1,1,4,22,0,0
2,2,8,0,0,3
3,3,4,0,0,26
4,4,4,0,0,0
5,5,3,24,4,1
6,6,0,26,5,0
7,7,0,7,28,0
8,8,18,0,0,0
9,9,59,0,0,0


# 32 총 구매액

In [643]:
f = df_all.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features.append(f); f

Unnamed: 0,custid,총구매액
0,0,1742000
1,1,2772100
2,2,3750850
3,3,2300500
4,4,1045000
5,5,5053759
6,6,3785029
7,7,1223182
8,8,1267500
9,9,4956620


# 33 구매 파트 변수의 각 파트의 빈도값 도출

In [644]:
f = pd.pivot_table(df_all, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,6,0,0
1,1,1,1,0,0,0,0,1,6,0,...,0,0,0,0,0,0,0,5,8,0
2,2,4,0,3,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,3,2,0,0,3,0,3,0,0,0,...,0,7,3,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,0,0,0,0,2,0,8,1,0,...,0,0,0,0,0,0,0,4,6,0
6,6,0,1,0,1,5,0,2,3,0,...,0,0,0,0,0,0,0,3,8,0
7,7,1,1,0,5,1,0,0,0,0,...,0,0,0,0,4,0,0,1,1,4
8,8,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,2,0,21,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


# 34 구매제품 변수의 각 파트의 빈도값 도출

In [645]:
f = pd.pivot_table(df_all, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
1,1,0,1,0,3,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,3
2,2,0,3,0,0,0,0,0,0,0,...,3,1,0,0,0,0,0,0,0,1
3,3,0,1,0,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,5,0,0,0,0,0,1,0,0,0,...,4,0,0,0,0,0,0,0,0,4
6,6,0,0,0,2,0,0,4,0,2,...,0,0,0,0,0,0,0,0,0,1
7,7,0,1,0,2,0,0,4,0,1,...,1,1,0,0,0,0,0,0,0,2
8,8,0,0,0,4,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,4
9,9,0,1,0,2,0,0,1,0,2,...,6,4,0,0,0,0,0,0,0,11


In [646]:
X_train3 = pd.DataFrame({'custid': df_train.custid.unique()})
for f in features :
    X_train3 = pd.merge(X_train3, f, how='left')
display(X_train3)

Unnamed: 0,custid,평균구매시간,퇴근 전 구매건수,퇴근 후 구매건수,근무시간 구매건수 비율,퇴근 후 구매건수 비율,평균 구매시간 내 구매건수,평균 구매시간 외 구매건수,평균 시간 구매비율,충동구매 여부_no,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,17.4,6,5,0.545455,0.454545,2,1,2.000000,0,...,1,0,0,0,0,0,0,0,0,8
1,1,15.1,24,1,0.960000,0.040000,5,5,1.000000,1,...,0,0,0,0,0,0,0,0,0,3
2,2,15.1,7,3,0.700000,0.300000,4,4,1.000000,1,...,3,1,0,0,0,0,0,0,0,1
3,3,15.8,25,2,0.925926,0.074074,4,5,0.800000,1,...,1,1,0,0,0,0,0,0,0,1
4,4,14.0,0,0,0.000000,0.000000,1,1,1.000000,1,...,0,0,0,0,0,0,0,0,0,1
5,5,15.5,26,6,0.812500,0.187500,4,5,0.800000,1,...,4,0,0,0,0,0,0,0,0,4
6,6,15.6,27,2,0.931034,0.068966,4,5,0.800000,1,...,0,0,0,0,0,0,0,0,0,1
7,7,16.2,33,1,0.970588,0.029412,4,4,1.000000,1,...,1,1,0,0,0,0,0,0,0,2
8,8,15.6,12,1,0.923077,0.076923,4,4,1.000000,1,...,1,1,1,0,0,0,0,0,0,4
9,9,15.4,45,12,0.789474,0.210526,5,5,1.000000,1,...,6,4,0,0,0,0,0,0,0,11


In [647]:
X_test3 = pd.DataFrame({'custid': df_test.custid.unique()})
for f in features :
    X_test3 = pd.merge(X_test3, f, how='left')
display(X_test3)

Unnamed: 0,custid,평균구매시간,퇴근 전 구매건수,퇴근 후 구매건수,근무시간 구매건수 비율,퇴근 후 구매건수 비율,평균 구매시간 내 구매건수,평균 구매시간 외 구매건수,평균 시간 구매비율,충동구매 여부_no,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,30000,17.0,20,7,0.740741,0.259259,3,4,0.750000,1,...,2,1,0,0,0,0,0,0,0,1
1,30001,17.2,26,1,0.962963,0.037037,3,2,1.500000,0,...,0,5,2,0,0,0,0,0,0,1
2,30002,14.4,86,5,0.945055,0.054945,6,5,1.200000,0,...,2,4,0,0,0,0,0,0,0,23
3,30003,16.0,26,6,0.812500,0.187500,4,5,0.800000,1,...,0,2,0,0,0,0,0,0,0,7
4,30004,16.3,36,13,0.734694,0.265306,4,7,0.571429,1,...,0,0,0,0,0,0,0,0,0,1
5,30005,19.0,0,1,0.000000,1.000000,1,1,1.000000,1,...,0,0,0,0,0,0,0,0,0,1
6,30006,14.7,7,0,1.000000,0.000000,2,3,0.666667,1,...,0,0,1,0,0,0,0,0,0,1
7,30007,17.5,4,2,0.666667,0.333333,2,3,0.666667,1,...,0,0,0,0,0,0,0,0,0,3
8,30008,15.9,9,0,1.000000,0.000000,3,3,1.000000,1,...,0,0,3,0,0,0,0,0,0,1
9,30009,17.0,3,0,1.000000,0.000000,1,1,1.000000,1,...,0,0,0,0,0,0,0,0,0,3


train_features = []
test_features = []

for f in features:
    train_features.append(f.iloc[0:30000])
    test_features.append(f.iloc[30000:])

for f in train_features :
    X_train =pd.concat([X_train ,f],axis=1)
    X_train = X_train.drop('custid',1)

In [648]:
#평균할인율 산출
X_train3["평균할인율"] = (X_train3["평균할인금액"] / X_train3["평균구매가격"])*100
X_test3["평균할인율"] = (X_test3["평균할인금액"] / X_test3["평균구매가격"])*100

In [649]:
#주말 요일별 구매비율
X_train3["토_비율"] = (X_train3["토_구매건수"] / X_train3["구매건수"])*100
X_train3["일_비율"] = (X_train3["일_구매건수"] / X_train3["구매건수"])*100
X_test3["토_비율"] = (X_test3["토_구매건수"] / X_test3["구매건수"])*100
X_test3["일_비율"] = (X_test3["일_구매건수"] / X_test3["구매건수"])*100

In [650]:
X_train3.columns

Index(['custid', '평균구매시간', '퇴근 전 구매건수', '퇴근 후 구매건수', '근무시간 구매건수 비율',
       '퇴근 후 구매건수 비율', '평균 구매시간 내 구매건수', '평균 구매시간 외 구매건수', '평균 시간 구매비율',
       '충동구매 여부_no',
       ...
       '행사장(남성)', '행사장(아동스포츠)', '행사장(여성정장)', '행사장(여성캐주얼)', '행사장(여성캐쥬)',
       '행사장(잡화)', '화장품', '평균할인율', '토_비율', '일_비율'],
      dtype='object', length=125)

In [651]:
# 치우친 데이터 로그 치환하기
f = X_train3.총구매액.where(X_train3.총구매액>=0, other=0)
f = np.log(f+1)
X_train3.총구매액 = f

f = X_test3.총구매액.where(X_test3.총구매액>=0, other=0)
f = np.log(f+1)
X_test3.총구매액 = f

f = X_train3.평균구매가격.where(X_train3.평균구매가격>=0, other=0)
f = np.log(f+1)
X_train3.평균구매가격 = f

f = X_test3.평균구매가격.where(X_test3.평균구매가격>=0, other=0)
f = np.log(f+1)
X_test3.평균구매가격 = f

f = X_train3.구매건수.where(X_train3.구매건수>=0, other=0)
f = np.log(f+1)
X_train3.구매건수 = f

f = X_test3.구매건수.where(X_test3.구매건수>=0, other=0)
f = np.log(f+1)
X_test3.구매건수 = f

f = X_train3.내점일수.where(X_train3.내점일수>=0, other=0)
f = np.log(f+1)
X_train3.내점일수 = f

f = X_test3.내점일수.where(X_test3.내점일수>=0, other=0)
f = np.log(f+1)
X_test3.내점일수 = f


f = X_train3.실제구매금액평균.where(X_train3.실제구매금액평균>=0, other=0)
f = np.log(f+1)
X_train3.실제구매금액평균 = f

f = X_test3.실제구매금액평균.where(X_test3.실제구매금액평균>=0, other=0)
f = np.log(f+1)
X_test3.실제구매금액평균 = f

f = X_train3.실제구매금액.where(X_train3.실제구매금액>=0, other=0)
f = np.log(f+1)
X_train3.실제구매금액 = f

f = X_test3.실제구매금액.where(X_test3.실제구매금액>=0, other=0)
f = np.log(f+1)
X_test3.실제구매금액 = f

In [652]:
IDtest = X_test3.custid;
X_train3.drop(['custid'], axis=1, inplace=True)
X_test3.drop(['custid'], axis=1, inplace=True)
y_train = pd.read_csv('y_train.csv').gender

for f in test_features :
    X_test3 =pd.concat([X_test ,f.reset_index()],axis=1)
    X_test3 = X_test3.drop('custid',1)
    X_test3 = X_test3.drop('index',1)

In [653]:
print( X_train3.shape , X_test3.shape)

(30000, 124) (19995, 124)


In [654]:
X_train3.head()

Unnamed: 0,평균구매시간,퇴근 전 구매건수,퇴근 후 구매건수,근무시간 구매건수 비율,퇴근 후 구매건수 비율,평균 구매시간 내 구매건수,평균 구매시간 외 구매건수,평균 시간 구매비율,충동구매 여부_no,충동구매 여부_yes,...,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품,평균할인율,토_비율,일_비율
0,17.4,6,5,0.545455,0.454545,2,1,2.0,0,1,...,0,0,0,0,0,0,8,10.000023,18.181818,45.454545
1,15.1,24,1,0.96,0.04,5,5,1.0,1,0,...,0,0,0,0,0,0,3,2.042877,34.615385,15.384615
2,15.1,7,3,0.7,0.3,4,4,1.0,1,0,...,0,0,0,0,0,0,1,6.800858,0.0,36.363636
3,15.8,25,2,0.925926,0.074074,4,5,0.8,1,0,...,0,0,0,0,0,0,1,3.984308,16.666667,20.0
4,14.0,0,0,0.0,0.0,1,1,1.0,1,0,...,0,0,0,0,0,0,1,2.086124,0.0,25.0


In [655]:
X_test3.head()

Unnamed: 0,평균구매시간,퇴근 전 구매건수,퇴근 후 구매건수,근무시간 구매건수 비율,퇴근 후 구매건수 비율,평균 구매시간 내 구매건수,평균 구매시간 외 구매건수,평균 시간 구매비율,충동구매 여부_no,충동구매 여부_yes,...,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품,평균할인율,토_비율,일_비율
0,17.0,20,7,0.740741,0.259259,3,4,0.75,1,0,...,0,0,0,0,0,0,1,3.742537,18.518519,37.037037
1,17.2,26,1,0.962963,0.037037,3,2,1.5,0,1,...,0,0,0,0,0,0,1,2.839392,18.518519,55.555556
2,14.4,86,5,0.945055,0.054945,6,5,1.2,0,1,...,0,0,0,0,0,0,23,3.934556,26.0,4.0
3,16.0,26,6,0.8125,0.1875,4,5,0.8,1,0,...,0,0,0,0,0,0,7,1.395798,18.604651,0.0
4,16.3,36,13,0.734694,0.265306,4,7,0.571429,1,0,...,0,0,0,0,0,0,1,0.234616,7.272727,18.181818


In [656]:
X_train.shape

(30000, 2392)

In [657]:
X_test.shape

(19995, 2392)

X_train.columns = list(range(0,2433))
X_test.columns = list(range(0,2433))

In [658]:
X_train=pd.concat([X_train,X_train3],axis=1)
X_test=pd.concat([X_test,X_test3],axis=1)

# Build Models

In [659]:
# Learn XGB
from xgboost import XGBClassifier
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")

model = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.03, n_jobs=-1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

# Make Submissions

In [660]:
pred = model.predict_proba(X_test)[:,1]
fname = 'submissions_OHE_0517(2).csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(pred, name="gender")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submissions_OHE_0517(2).csv' is ready to submit.


# ensemble

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=2)
score = cross_val_score(gbc, X_train, y_train, cv=5, scoring='roc_auc')
print('{}\nmean = {:.5f}\nstd = {:.5f}'.format(score, score.mean(), score.std()))

[0.69649217 0.6978555  0.69199624 0.70537761 0.69574501]
mean = 0.69749
std = 0.00439


In [None]:
from xgboost import XGBClassifier
parameters = {'xgb__max_depth': 14, 'xgb__subsample': 0.4}
clf = XGBClassifier(**parameters, random_state=0, n_jobs=-1)
score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
print('{}\nmean = {:.5f}\nstd = {:.5f}'.format(score, score.mean(), score.std()))

[0.69954762 0.69858035 0.69422074 0.70543113 0.69591301]
mean = 0.69874
std = 0.00384


In [None]:
from sklearn.ensemble import VotingClassifier
votingC = VotingClassifier(estimators=[('gbc', gbc),('clf', clf)], voting='soft')
score = cross_val_score(votingC, X_train, y_train, cv=5, scoring='roc_auc')
print('{}\nmean = {:.5f}\nstd = {:.5f}'.format(score, score.mean(), score.std()))

In [None]:
pred = model.predict_proba(X_test)[:,1]
fname = 'submissions_OHE_0517(3).csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(pred, name="gender")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

# End