# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read Data

In [2]:
df_train = pd.read_csv('X_train.csv', encoding='cp949')
df_test = pd.read_csv('X_test.csv', encoding='cp949')
y_train = pd.read_csv('y_train.csv').gender
IDtest = df_test.custid.unique()

df_train.head()

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0


#### 판매데이터 구조
- custid: 고객ID
- sales_date: 판매일자 (문자)
- str_nm: 판매시간 (4자리 숫자)
- goodcd: 상품코드
- brd_nm: 브랜드명
- corner_nm: 코너명
- pc_nm: PC명
- part_nm: 파트명
- team_nm: 판매팀명
- buyer_nm: Buying MD
- import_flg: 수입상품여부
- tot_amt: 판매금액
- dis_amt: 할인금액
- net_amt: 판매금액 - 할인금액
- inst_mon: 할부개월수

<p>*`백화점 영업조직`: 306개 corner < 76개 pc < 29개 part < 3개 team*</p>
<p>*`y_train.csv`에서 target값 0은 남자, 1은 여자를 나타냄*</p>

# Make Features

In [183]:
features = []

In [184]:
#총 구매액
f=df_train.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features.append(f); features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [185]:
# 총 구매건수
f=df_train.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features.append(f) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [5]:
# 평균구매가격
f=df_train.groupby('custid')['tot_amt'].agg([('평균구매가격', 'mean')]).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee,총구매액,구매건수,평균구매가격
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0,1742000,11,158363.636364
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0,1742000,11,158363.636364
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0,1742000,11,158363.636364
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0,1742000,11,158363.636364
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0,1742000,11,158363.636364


In [6]:
#평균 할부개월수
f=df_train.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee,총구매액,구매건수,평균구매가격,평균할부개월수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,1,90000,9000,81000,3,0,1742000,11,158363.636364,2.818182
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,1,39000,3900,35100,1,0,1742000,11,158363.636364,2.818182
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,1,175000,17500,157500,3,0,1742000,11,158363.636364,2.818182
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,1,455000,45500,409500,3,0,1742000,11,158363.636364,2.818182
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,100000,10000,90000,3,0,1742000,11,158363.636364,2.818182


In [7]:
#구매브랜드 종류
f=df_train.groupby('custid')['brd_nm'].agg([('구매브랜드종류', lambda x: x.nunique())]).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,tot_amt,dis_amt,net_amt,inst_mon,inst_fee,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,90000,9000,81000,3,0,1742000,11,158363.636364,2.818182,7
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,39000,3900,35100,1,0,1742000,11,158363.636364,2.818182,7
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,175000,17500,157500,3,0,1742000,11,158363.636364,2.818182,7
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,455000,45500,409500,3,0,1742000,11,158363.636364,2.818182,7
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,100000,10000,90000,3,0,1742000,11,158363.636364,2.818182,7


In [8]:
#내점일수
f=df_train.groupby('custid')['sales_date'].agg([('내점일수','nunique')]).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,dis_amt,net_amt,inst_mon,inst_fee,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,9000,81000,3,0,1742000,11,158363.636364,2.818182,7,7
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,3900,35100,1,0,1742000,11,158363.636364,2.818182,7,7
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,17500,157500,3,0,1742000,11,158363.636364,2.818182,7,7
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,45500,409500,3,0,1742000,11,158363.636364,2.818182,7,7
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,10000,90000,3,0,1742000,11,158363.636364,2.818182,7,7


In [9]:
#수입상품 구매비율
x = df_train[df_train['import_flg'] == 1].groupby('custid').size() / df_train.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,net_amt,inst_mon,inst_fee,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,81000,3,0,1742000,11,158363.636364,2.818182,7,7,63.6
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,35100,1,0,1742000,11,158363.636364,2.818182,7,7,63.6
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,157500,3,0,1742000,11,158363.636364,2.818182,7,7,63.6
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,409500,3,0,1742000,11,158363.636364,2.818182,7,7,63.6
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,90000,3,0,1742000,11,158363.636364,2.818182,7,7,63.6


In [10]:
#주말방문 비율
def fw(x):
    k = x.dayofweek
    if k <= 4 :
        return('주중_방문')
    else :
        return('주말_방문')    
    
df = df_train.copy()
df = df.drop_duplicates(['custid','sales_date'])

df['week'] = pd.to_datetime(df.sales_date).apply(fw)
df = pd.pivot_table(df, index='custid', columns='week', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
df['주말방문비율'] = ((df.iloc[:,1] / (df.iloc[:,1]+df.iloc[:,2]))*100).apply(round, args=(1,))
f = df.copy().iloc[:,[0,-1]]
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,inst_mon,inst_fee,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,3,0,1742000,11,158363.636364,2.818182,7,7,63.6,42.9
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,1,0,1742000,11,158363.636364,2.818182,7,7,63.6,42.9
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,3,0,1742000,11,158363.636364,2.818182,7,7,63.6,42.9
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,3,0,1742000,11,158363.636364,2.818182,7,7,63.6,42.9
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,3,0,1742000,11,158363.636364,2.818182,7,7,63.6,42.9


In [11]:
#계절별 구매건수
def f1(x):
    k = x.month
    if 3 <= k <= 5 :
        return('봄_구매건수')
    elif 6 <= k <= 8 :
        return('여름_구매건수')
    elif 9 <= k <= 11 :    
        return('가을_구매건수')
    else :
        return('겨울_구매건수')    
    
df_train['season'] = pd.to_datetime(df_train.sales_date).apply(f1)
f = pd.pivot_table(df_train, index='custid', columns='season', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,season,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,2.818182,7,7,63.6,42.9,여름_구매건수,3,3,1,4
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,2.818182,7,7,63.6,42.9,여름_구매건수,3,3,1,4
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,2.818182,7,7,63.6,42.9,여름_구매건수,3,3,1,4
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,2.818182,7,7,63.6,42.9,여름_구매건수,3,3,1,4
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,2.818182,7,7,63.6,42.9,가을_구매건수,3,3,1,4


In [12]:
#시간대별 구매건수
def f2(x):
    if 9 <= x <= 12 :
        return('아침_구매건수')
    elif 13 <= x <= 17 :
        return('점심_구매건수')
    else :
        return('저녁_구매건수')  # datatime 필드가 시간 형식에 맞지 않은 값을 갖는 경우 저녁시간으로 처리

df_train['timeslot'] = (df_train.sales_time // 100).apply(f2)
f = pd.pivot_table(df_train, index='custid', columns='timeslot', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주말방문비율,season,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,timeslot,아침_구매건수,저녁_구매건수,점심_구매건수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,42.9,여름_구매건수,3,3,1,4,아침_구매건수,2,9,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,42.9,여름_구매건수,3,3,1,4,아침_구매건수,2,9,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,42.9,여름_구매건수,3,3,1,4,저녁_구매건수,2,9,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,42.9,여름_구매건수,3,3,1,4,저녁_구매건수,2,9,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,42.9,가을_구매건수,3,3,1,4,저녁_구매건수,2,9,0


In [129]:
#주구매코너
f = df_train.groupby('custid')['corner_nm'].agg([('주구매코너', lambda x: x.value_counts().reset_index().sort_values(by=['corner_nm','index'], ascending=False).iloc[0,0])]).reset_index()
f = pd.get_dummies(f, columns=['주구매코너']) 
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매코너_행사슈즈,주구매코너_행사핸드백,주구매코너_향수,주구매코너_헤어ACC,주구매코너_헤어악세사리,주구매코너_헤어액세사리,주구매코너_홈데코,주구매코너_화장잡화,주구매코너_화장품,주구매코너_훼미닌부틱
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,0,0,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,0,0,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,0,0,0,0,0,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,0,0,0,0,0,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#방문빈도성
df_train['month']=pd.to_datetime(df_train.sales_date).dt.month
a=df_train.groupby('custid')['month'].agg([('가장 최근 구매 달','max')]).reset_index()
b=df_train.groupby('custid')['month'].agg([('가장 예전 구매 달','min')]).reset_index()
c=df_train.groupby('custid')['month'].agg([('구매를 한 달의 수','nunique')]).reset_index()
a['방문빈도성']=round((a['가장 최근 구매 달']-b['가장 예전 구매 달'])/c['구매를 한 달의 수'],1)
a
f=a[['custid','방문빈도성']]
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매코너_영라이브,주구매코너_영어덜트캐쥬얼,주구매코너_영캐릭터,주구매코너_영플라자,주구매코너_잡화,주구매코너_잡화파트,"주구매코너_케주얼,구두,아동",주구매코너_패션잡화,month,방문빈도성
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,6,1.6
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,6,1.6
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,0,1,0,0,8,1.6
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,0,1,0,0,8,1.6
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,9,1.6


In [15]:
# 구매상품 다양성
f4=df_train.groupby('custid')['brd_nm'].agg([('구매상품 다양성','nunique')]).reset_index()
f4['구매상품 다양성']=f4['구매상품 다양성']/1191
f4=round(f4,2)
features.append(f4) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매코너_영어덜트캐쥬얼,주구매코너_영캐릭터,주구매코너_영플라자,주구매코너_잡화,주구매코너_잡화파트,"주구매코너_케주얼,구두,아동",주구매코너_패션잡화,month,방문빈도성,구매상품 다양성
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,1,0,0,6,1.6,0.01
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,1,0,0,6,1.6,0.01
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,1,0,0,8,1.6,0.01
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,1,0,0,8,1.6,0.01
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,1,0,0,9,1.6,0.01


In [189]:
# 고객 등급 변수
def g(x):
    if 118000.0<= x:
        return "VIP"
    elif 109197.6<= x<118000.0:
        return "GOLD"
    elif 53000.00<= x<109197.6:
        return "SILVER"
    else :
        return "customer"
df_train['고객 등급']=df_train['평균구매가격'].apply(g)
f=df_train['고객 등급']
features.append(f) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [190]:
# 주 구매 상품 가격대
c2=df_train['평균구매가격']/df_train['구매건수']
def g(x):
    if x> 118000:
        return "초고가제품 주구매 고객"
    elif x> 109197.6:
        return "고가제품 주구매 고객"
    else :
        return "중저가제품 다량 주구매 고객"
df_train['구매 제품 성향 변수']=c2.apply(g)
f=df_train['구매 제품 성향 변수']
features.append(f) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [191]:
# 분기별 구매금액
def g(x):
    if x> 6:
        return "하반기"
    else :
        return "상반기"
df_train['분기별'] = df_train['month'].apply(g)  
c3=pd.pivot_table(df_train,values='tot_amt',index='custid',columns='분기별',aggfunc='size').reset_index()
c3=c3.rename(columns={'상반기':'상반기 구매건수','하반기':'하반기 구매건수'})
c3
features.append(c3) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [192]:
# 상반기, 하반기 구매 건수의 변화 (상반기->하반기 구매건수가 줄어들면, 이제 더이상 구매를 잘 하지 않는 것으로 간주)
f=df_train['상반기 구매건수']/df_train['하반기 구매건수']
f.head()
def g(x):
    if x>1 :
        return "소비 감소"
    elif x<1:
        return "소비 증가"
    else :
        return "소비 유지"
df_train['소비추세(구매건수)'] = f.apply(g)
f=df_train['소비추세(구매건수)']
features.append(f) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [193]:
#주방문요일
df_train['day']=pd.to_datetime(df_train.sales_date).dt.day_name()
a=df_train.groupby(['custid','day'])['sales_time'].agg([('요일별 방문일수','size')]).reset_index()
a
b = a.groupby('custid')['요일별 방문일수'].max().reset_index()
c=pd.merge(a,b,on='custid')
c['요일'] = c['요일별 방문일수_x']/c['요일별 방문일수_y']
c=c.query("요일 ==1").iloc[:,[0,1,2]]
c0=c.groupby('custid')['day'].agg([('개수','size')]).reset_index().query('개수==1')
c0=pd.merge(c,c0,on='custid')
c0=c0.iloc[:,[0,1]]
c0

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [194]:
c1=c.groupby('custid')['day'].agg([('개수','size')]).reset_index().query('개수!=1')
c1=pd.merge(c,c1,on='custid')
c1

Unnamed: 0,custid,개수
9,9,2
12,12,2
17,17,2
28,28,2
29,29,2
31,31,2
43,43,2
44,44,3
47,47,3
54,54,2


In [94]:
d=df_train.groupby(['custid','day'])['tot_amt'].agg([('요일별 구매금액','sum')]).reset_index()
c3=pd.merge(d,c1,on=['custid','day'])
c4=c3.groupby('custid')['요일별 구매금액'].max().reset_index()
c5=pd.merge(c3,c4,on=['custid','요일별 구매금액'])
c5=c5.iloc[:,[0,1]]
c=pd.concat([c0,c5],axis=0)
c=c.rename(columns ={'day':'주방문요일'})

Unnamed: 0,custid,day,요일별 구매금액
0,0,Friday,262000
1,0,Monday,294000
2,0,Saturday,630000
3,0,Sunday,460000
4,0,Wednesday,96000
5,1,Friday,79100
6,1,Saturday,919000
7,1,Sunday,1076000
8,1,Thursday,390400
9,1,Wednesday,307600


In [126]:
features.append(c) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,구매상품 다양성,고객 등급,구매 제품 성향 변수,분기별,상반기 구매건수,하반기 구매건수,소비추세(구매건수),dday,day,주방문요일
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0.01,VIP,중저가제품 다량 주구매 고객,상반기,3.0,8.0,소비 증가,<bound method PandasDelegate._add_delegate_acc...,Sunday,Sunday
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0.01,VIP,중저가제품 다량 주구매 고객,상반기,3.0,8.0,소비 증가,<bound method PandasDelegate._add_delegate_acc...,Sunday,Sunday
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0.01,VIP,중저가제품 다량 주구매 고객,하반기,3.0,8.0,소비 증가,<bound method PandasDelegate._add_delegate_acc...,Saturday,Sunday
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0.01,VIP,중저가제품 다량 주구매 고객,하반기,3.0,8.0,소비 증가,<bound method PandasDelegate._add_delegate_acc...,Saturday,Sunday
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0.01,VIP,중저가제품 다량 주구매 고객,하반기,3.0,8.0,소비 증가,<bound method PandasDelegate._add_delegate_acc...,Sunday,Sunday


In [130]:
#주구매브랜드
f = df_train.groupby('custid')['part_nm'].agg([('주구매브랜드', lambda x: x.value_counts().reset_index().sort_values(by=['part_nm','index'], ascending=False).iloc[0,0])]).reset_index()
f = pd.get_dummies(f, columns=['주구매브랜드']) 
features.append(f) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매브랜드_여성캐주얼,주구매브랜드_여성캐쥬얼,주구매브랜드_영라이브,주구매브랜드_영어덜트캐쥬얼,주구매브랜드_영캐릭터,주구매브랜드_영플라자,주구매브랜드_잡화,주구매브랜드_잡화파트,"주구매브랜드_케주얼,구두,아동",주구매브랜드_패션잡화
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,1,0,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,1,0,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,0,0,0,1,0,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,0,0,0,1,0,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,0,1,0,0


In [136]:
#최대 할부개월수
a=df_train.groupby('custid')['inst_mon'].agg([('최대할부개월수', 'max')]).reset_index()
a
features.append(a) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매브랜드_여성캐쥬얼,주구매브랜드_영라이브,주구매브랜드_영어덜트캐쥬얼,주구매브랜드_영캐릭터,주구매브랜드_영플라자,주구매브랜드_잡화,주구매브랜드_잡화파트,"주구매브랜드_케주얼,구두,아동",주구매브랜드_패션잡화,최대할부개월수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,1,0,0,3
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,1,0,0,3
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,0,0,1,0,0,3
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,0,0,1,0,0,3
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,0,1,0,0,3


In [138]:
# 브랜드 구매수
a=df_train.groupby('custid')['brd_nm'].agg([('구매 브랜드 수','nunique')]).reset_index()
features.append(a) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매브랜드_영라이브,주구매브랜드_영어덜트캐쥬얼,주구매브랜드_영캐릭터,주구매브랜드_영플라자,주구매브랜드_잡화,주구매브랜드_잡화파트,"주구매브랜드_케주얼,구두,아동",주구매브랜드_패션잡화,최대할부개월수,구매 브랜드 수
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,3,7
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,3,7
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,0,0,0,1,0,0,3,7
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,0,0,0,1,0,0,3,7
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,0,0,0,1,0,0,3,7


In [155]:
#주구매 시간
df_train['hour']=df_train['sales_time'].astype(str).str[0:2] 
a=df_train.groupby(['custid','hour'])['sales_date'].agg([('건수','size')]).reset_index()
a1=a.groupby('custid')['건수'].max().reset_index()
a2=pd.merge(a,a1,on=['custid','건수'])
a2=a2.iloc[:,[0,1]].rename(columns={'hour':'주구매시간'})
features.append(a2) ; features

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,...,주구매브랜드_영플라자,주구매브랜드_잡화,주구매브랜드_잡화파트,"주구매브랜드_케주얼,구두,아동",주구매브랜드_패션잡화,최대할부개월수,구매 브랜드 수,hour,주구매시간_x,주구매시간_y
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,1,0,0,3,7,12,19,19
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,1,0,0,3,7,12,19,19
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,...,0,0,1,0,0,3,7,18,19,19
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,...,0,0,1,0,0,3,7,18,19,19
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,...,0,0,1,0,0,3,7,18,19,19


In [160]:
#브랜드 편중도 = 해당 고객이 구매한 브랜드 수 / 전체 브랜드 수
df_train['브랜드 편중도']=df_train['구매 브랜드 수']/df_train['brd_nm'].nunique()
f=df_train['브랜드 편중도']
features.append(f) ; features

In [195]:
# 내점당 평균 구매건수 

f = df_train.groupby('custid')['sales_date'].agg([('하','value_counts')]).reset_index()
f= f.groupby('custid')['하'].agg([('내점당 평균 구매건수','mean')]).reset_index()
f['내점당 평균 구매건수'] = round(f['내점당 평균 구매건수'],2)
features.append(f) ; features

[       custid      총구매액
 0           0   1742000
 1           1   2772100
 2           2   3750850
 3           3   2300500
 4           4   1045000
 5           5   5053759
 6           6  15140116
 7           7   1223182
 8           8   1267500
 9           9   4956620
 10         10   1347970
 11         11   7173999
 12         12   2595477
 13         13   8789931
 14         14   1300720
 15         15  11780260
 16         16   5431891
 17         17   4593588
 18         18   9302600
 19         19   1078340
 20         20  11422000
 21         21   1387995
 22         22   4649311
 23         23   4723684
 24         24   2005200
 25         25   1032700
 26         26  16330700
 27         27  14917012
 28         28   1639035
 29         29    390480
 ...       ...       ...
 29970   29970  15103800
 29971   29971   1205500
 29972   29972   9684560
 29973   29973   8794750
 29974   29974   7474568
 29975   29975   9045600
 29976   29976    454000
 29977   29977    660100


In [None]:
# 회원별 구매 집중도

f = df_train.groupby('custid')['brd_nm'].agg([('Diver', 'unique')]).reset_index()
f['다른브랜드']=0

for i in range(0,500):
    f.iloc[i,2] = f.iloc[i,1].size

g = df_train.groupby('custid')['brd_nm'].agg( [('브랜드별구매수','value_counts')]).reset_index()
h = g.groupby('custid')['브랜드별구매수'].agg( [('브랜드별최다구매수','max')]).reset_index()
g_h = pd.merge(g,h,on='custid');g_h
g_h.loc[g_h['브랜드별구매수'] == g_h['브랜드별최다구매수'],'최다구매브랜드'] = 'ㅋㅋ' 
g_h = g_h.dropna(how='any') 
g_h = g_h.drop_duplicates(['custid']);g_h

ff = df_train.groupby('custid')['tot_amt'].agg([('총구매횟수','size')]).reset_index()
ff_g_h = pd.merge(ff,g_h,on='custid');ff_g_h 
ff_g_h['구매집중도'] = round((ff_g_h['브랜드별최다구매수']/ff_g_h['총구매횟수'])*100,2)
ff_g_h
ff_g_h = ff_g_h.drop(['브랜드별구매수','총구매횟수','brd_nm','브랜드별최다구매수','최다구매브랜드'],1)
ff_g_h
features.append(ff_g_h) ; features

# Test Set Make Features

In [None]:
test_features=[]

In [None]:
# 총 구매액
a=df_test.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
test_features.append(a)

# 총 구매건수
a=df_test.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
test_features.append(a)

# 평균구매가격
a=df_test.groupby('custid')['tot_amt'].agg([('평균구매가격', 'mean')]).reset_index()
test_features.append(a)

#평균 할부개월수
a=df_test.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
test_features.append(a)

#구매브랜드 종류
a=df_test.groupby('custid')['brd_nm'].agg([('구매브랜드종류', lambda x: x.nunique())]).reset_index()
test_features.append(a)

#내점일수
a=df_test.groupby('custid')['sales_date'].agg([('내점일수','nunique')]).reset_index()
test_features.append(a)

#수입상품 구매비율
x = df_test[df_test['import_flg'] == 1].groupby('custid').size() / df_test.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
test_features.append(f)

#주말방문 비율
def fw(x):
    k = x.dayofweek
    if k <= 4 :
        return('주중_방문')
    else :
        return('주말_방문')    
    
df = df_test.copy()
df = df.drop_duplicates(['custid','sales_date'])

df['week'] = pd.to_datetime(df.sales_date).apply(fw)
df = pd.pivot_table(df, index='custid', columns='week', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
df['주말방문비율'] = ((df.iloc[:,1] / (df.iloc[:,1]+df.iloc[:,2]))*100).apply(round, args=(1,))
f = df.copy().iloc[:,[0,-1]]
test_features.append(f)

#계절별 구매건수
def f1(x):
    k = x.month
    if 3 <= k <= 5 :
        return('봄_구매건수')
    elif 6 <= k <= 8 :
        return('여름_구매건수')
    elif 9 <= k <= 11 :    
        return('가을_구매건수')
    else :
        return('겨울_구매건수')    
    
df_test['season'] = pd.to_datetime(df_test.sales_date).apply(f1)
f = pd.pivot_table(df_test, index='custid', columns='season', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
test_features.append(f)

#시간대별 구매건수
def f2(x):
    if 9 <= x <= 12 :
        return('아침_구매건수')
    elif 13 <= x <= 17 :
        return('점심_구매건수')
    else :
        return('저녁_구매건수')  # datatime 필드가 시간 형식에 맞지 않은 값을 갖는 경우 저녁시간으로 처리

df_test['timeslot'] = (df_test.sales_time // 100).apply(f2)
f = pd.pivot_table(df_test, index='custid', columns='timeslot', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
test_features.append(f)

#주구매코너
f = df_test.groupby('custid')['part_nm'].agg([('주구매코너', lambda x: x.value_counts().reset_index().sort_values(by=['part_nm','index'], ascending=False).iloc[0,0])]).reset_index()
f = pd.get_dummies(f, columns=['주구매코너']) 
test_features.append(f)

#방문빈도성
df_test['month']=pd.to_datetime(df_test.sales_date).dt.month
a=df_test.groupby('custid')['month'].agg([('가장 최근 구매 달','max')]).reset_index()
b=df_test.groupby('custid')['month'].agg([('가장 예전 구매 달','min')]).reset_index()
c=df_test.groupby('custid')['month'].agg([('구매를 한 달의 수','nunique')]).reset_index()
a['방문빈도성']=round((a['가장 최근 구매 달']-b['가장 예전 구매 달'])/c['구매를 한 달의 수'],1)
a
f=a[['custid','방문빈도성']]
test_features.append(f)

# 구매상품 다양성
f4=df_test.groupby('custid')['brd_nm'].agg([('구매상품 다양성','nunique')]).reset_index()
f4['구매상품 다양성']=f4['구매상품 다양성']/1191
f4=round(f4,2)
test_features.append(f4)


# 고객 등급 변수
def g(x):
    if 118000.0<= x:
        return "VIP"
    elif 109197.6<= x<118000.0:
        return "GOLD"
    elif 53000.00<= x<109197.6:
        return "SILVER"
    else :
        return "customer"
df_test['고객 등급']=df_test['평균구매가격'].apply(g)
f=df_test['고객 등급']
test_features.append(f)

In [None]:
# 주 구매 상품 가격대
c2=df_test['평균구매가격']/df_test['구매건수']
def g(x):
    if x> 118000:
        return "초고가제품 주구매 고객"
    elif x> 109197.6:
        return "고가제품 주구매 고객"
    else :
        return "중저가제품 다량 주구매 고객"
df_test['구매 제품 성향 변수']=c2.apply(g)
f=df_test['구매 제품 성향 변수']
test_features.append(f) 


# 분기별 구매금액
def g(x):
    if x> 6:
        return "하반기"
    else :
        return "상반기"
df_test['분기별'] = df_test['month'].apply(g)  
c3=pd.pivot_table(df_test,values='tot_amt',index='custid',columns='분기별',aggfunc='size').reset_index()
c3=c3.rename(columns={'상반기':'상반기 구매건수','하반기':'하반기 구매건수'})
c3
test_features.append(c3)


# 상반기, 하반기 구매 건수의 변화 (상반기->하반기 구매건수가 줄어들면, 이제 더이상 구매를 잘 하지 않는 것으로 간주)
f=df_test['상반기 구매건수']/df_test['하반기 구매건수']
def g(x):
    if x>1 :
        return "소비 감소"
    elif x<1:
        return "소비 증가"
    else :
        return "소비 유지"
df_test['소비추세(구매건수)'] = f.apply(g)
f=df_train['소비추세(구매건수)']
test_features.append(f) 

#주방문요일
df_test['day']=pd.to_datetime(df_test.sales_date).dt.day_name()
a=df_test.groupby(['custid','day'])['sales_time'].agg([('요일별 방문일수','size')]).reset_index()
a
b = a.groupby('custid')['요일별 방문일수'].max().reset_index()
c=pd.merge(a,b,on='custid')
c['요일'] = c['요일별 방문일수_x']/c['요일별 방문일수_y']
c=c.query("요일 ==1").iloc[:,[0,1,2]]
c0=c.groupby('custid')['day'].agg([('개수','size')]).reset_index().query('개수==1')
c0=pd.merge(c,c0,on='custid')
c0=c0.iloc[:,[0,1]]
c1=c.groupby('custid')['day'].agg([('개수','size')]).reset_index().query('개수!=1')
c1=pd.merge(c,c1,on='custid')
d=df_test.groupby(['custid','day'])['tot_amt'].agg([('요일별 구매금액','sum')]).reset_index()
c3=pd.merge(d,c1,on=['custid','day'])
c4=c3.groupby('custid')['요일별 구매금액'].max().reset_index()
c5=pd.merge(c3,c4,on=['custid','요일별 구매금액'])
c5=c5.iloc[:,[0,1]]
c=pd.concat([c0,c5],axis=0)
c=c.rename(columns ={'day':'주방문요일'})
test_features.append(c)

#주구매브랜드
f = df_test.groupby('custid')['part_nm'].agg([('주구매브랜드', lambda x: x.value_counts().reset_index().sort_values(by=['part_nm','index'], ascending=False).iloc[0,0])]).reset_index()
f = pd.get_dummies(f, columns=['주구매브랜드']) 
test_features.append(f) 

#최대 할부개월수
a=df_test.groupby('custid')['inst_mon'].agg([('최대할부개월수', 'max')]).reset_index()
a
test_features.append(a) 

# 브랜드 구매수
a=df_test.groupby('custid')['brd_nm'].agg([('구매 브랜드 수','nunique')]).reset_index()
test_features.append(a) 

#주구매 시간
df_test['hour']=df_test['sales_time'].astype(str).str[0:2] 
a=df_test.groupby(['custid','hour'])['sales_date'].agg([('건수','size')]).reset_index()
a1=a.groupby('custid')['건수'].max().reset_index()
a2=pd.merge(a,a1,on=['custid','건수'])
a2=a2.iloc[:,[0,1]].rename(columns={'hour':'주구매시간'})
test_features.append(a2) 

#브랜드 편중도 = 해당 고객이 구매한 브랜드 수 / 전체 브랜드 수
df_test['브랜드 편중도']=df_test['구매 브랜드 수']/df_test['brd_nm'].nunique()
f=df_test['브랜드 편중도']
test_features.append(f)  

# 내점당 평균 구매건수 

f = df_test.groupby('custid')['sales_date'].agg([('하','value_counts')]).reset_index()
f= f.groupby('custid')['하'].agg([('내점당 평균 구매건수','mean')]).reset_index()
f['내점당 평균 구매건수'] = round(f['내점당 평균 구매건수'],2)
test_features.append(f)  

# 회원별 구매 집중도
f = df_test.groupby('custid')['brd_nm'].agg([('Diver', 'unique')]).reset_index()
f['다른브랜드']=0

for i in range(0,500):
    f.iloc[i,2] = f.iloc[i,1].size

g = df_test.groupby('custid')['brd_nm'].agg( [('브랜드별구매수','value_counts')]).reset_index()
h = g.groupby('custid')['브랜드별구매수'].agg( [('브랜드별최다구매수','max')]).reset_index()
g_h = pd.merge(g,h,on='custid');g_h
g_h.loc[g_h['브랜드별구매수'] == g_h['브랜드별최다구매수'],'최다구매브랜드'] = 'ㅋㅋ' 
g_h = g_h.dropna(how='any') 
g_h = g_h.drop_duplicates(['custid']);g_h

ff = df_test.groupby('custid')['tot_amt'].agg([('총구매횟수','size')]).reset_index()
ff_g_h = pd.merge(ff,g_h,on='custid');ff_g_h 
ff_g_h['구매집중도'] = round((ff_g_h['브랜드별최다구매수']/ff_g_h['총구매횟수'])*100,2)
ff_g_h
ff_g_h = ff_g_h.drop(['브랜드별구매수','총구매횟수','brd_nm','브랜드별최다구매수','최다구매브랜드'],1)
ff_g_h
test_features.append(ff_g_h) ; test_features

# Transform Data with One-hot Encoding

In [None]:
df_train = df_train.drop(['sales_date','str_nm','brd_nm','corner_nm','pc_nm','part_nm','team_nm','buyer_nm','season','timeslot','주구매코너_가정용품','주구매코너_가정용품파트','주구매코너_골프/유니캐쥬얼','주구매코너_공산품','주구매코너_공산품파트','주구매코너_남성의류' , '주구매코너_남성정장스포츠','주구매코너_로얄부띠끄','주구매코너_로얄부틱','주구매코너_명품잡화','주구매코너_생식품','주구매코너_생식품파트','주구매코너_스포츠캐주얼','주구매코너_스포츠캐쥬얼','주구매코너_아동','주구매코너_아동,스포츠' , '주구매코너_아동문화' , '주구매코너_여성의류파트' , '주구매코너_여성정장' , '주구매코너_여성캐주얼' , '주구매코너_여성캐쥬얼' , '주구매코너_영라이브' ,'주구매코너_영어덜트캐쥬얼' , '주구매코너_영캐릭터','주구매코너_영플라자' , '주구매코너_잡화' , '주구매코너_잡화파트' , '주구매코너_케주얼,구두,아동' , '주구매코너_패션잡화' , '고객 등급' , '구매 제품 성향 변수' , '분기별' , '소비추세(구매건수)' ],1)

df_test = df_test.drop(['sales_date','str_nm','brd_nm','corner_nm','pc_nm','part_nm','team_nm','buyer_nm','season','timeslot','주구매코너_가정용품','주구매코너_가정용품파트','주구매코너_골프/유니캐쥬얼','주구매코너_공산품','주구매코너_공산품파트','주구매코너_남성의류' , '주구매코너_남성정장스포츠','주구매코너_로얄부띠끄','주구매코너_로얄부틱','주구매코너_명품잡화','주구매코너_생식품','주구매코너_생식품파트','주구매코너_스포츠캐주얼','주구매코너_스포츠캐쥬얼','주구매코너_아동','주구매코너_아동,스포츠' , '주구매코너_아동문화' , '주구매코너_여성의류파트' , '주구매코너_여성정장' , '주구매코너_여성캐주얼' , '주구매코너_여성캐쥬얼' , '주구매코너_영라이브' ,'주구매코너_영어덜트캐쥬얼' , '주구매코너_영캐릭터','주구매코너_영플라자' , '주구매코너_잡화' , '주구매코너_잡화파트' , '주구매코너_케주얼,구두,아동' , '주구매코너_패션잡화' , '고객 등급' , '구매 제품 성향 변수' , '분기별' , '소비추세(구매건수)' ],1)

df_all = pd.concat([df_train, df_test])
X_train = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc=lambda x: np.where(len(x) >=1, 1, 0), fill_value=0). \
                         reset_index(). \
                         query('custid not in @IDtest'). \
                         drop(columns=['custid']).values
X_test = pd.pivot_table(df_all, index='custid', columns=level, values='tot_amt',
                         aggfunc=lambda x: np.where(len(x) >=1, 1, 0), fill_value=0). \
                         reset_index(). \
                         query('custid in @IDtest'). \
                         drop(columns=['custid']).values

max_features = X_train.shape[1]

# Build Models

In [22]:
# Learn XGB
from xgboost import XGBClassifier
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")

model = XGBClassifier(random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

# Make Submissions

In [23]:
pred = model.predict_proba(X_test)[:,1]
fname = 'submissions_OHE.csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(pred, name="gender")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submissions_OHE.csv' is ready to submit.


# End