## <font color="#CC3D3D">Automated Feature Engineering with Featuretools</font>

### Imports

In [1]:
import pandas as pd
import featuretools as ft

### Read Data

In [2]:
# Load the train and test transaction logs with identical options:
# cp949 encoding, and `goodcd` read with the `category` dtype.
df_train, df_test = (
    pd.read_csv(path, encoding='cp949', dtype={'goodcd': 'category'})
    for path in ('X_train_hds.csv', 'X_test_hds.csv')
)

# Stack train on top of test, renumber the rows 0..n-1, and expose that row
# number as a `transid` column so every transaction has a unique ID.
df = (
    pd.concat([df_train, df_test], ignore_index=True)
    .rename_axis('transid')
    .reset_index()
)
df.head()

Unnamed: 0,transid,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0
1,1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0
2,2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0
3,3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0
4,4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0


### Prepare Data

In [3]:
# Parent table with one row per distinct customer (in order of first
# appearance); the feature matrix is built with one row per entry here.
cu = df[['custid']].drop_duplicates().reset_index(drop=True)

# Entity name -> (dataframe, name of its index column)
entities = dict(
    cust=(cu, "custid"),
    trans=(df, "transid"),
)

# One relationship linking cust.custid to trans.custid
# (tuple order as in the legacy Featuretools API: parent entity, parent key,
# child entity, child key).
relationships = [("cust", "custid", "trans", "custid")]

### Run Deep Feature Synthesis

In [4]:
%%time
derived_features, _ = ft.dfs(entities=entities, relationships=relationships, target_entity="cust")
derived_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49995 entries, 0 to 49994
Data columns (total 67 columns):
SUM(trans.sales_time)                    49995 non-null int64
SUM(trans.import_flg)                    49995 non-null int64
SUM(trans.tot_amt)                       49995 non-null int64
SUM(trans.dis_amt)                       49995 non-null int64
SUM(trans.net_amt)                       49995 non-null int64
SUM(trans.inst_mon)                      49995 non-null int64
SUM(trans.inst_fee)                      49995 non-null int64
STD(trans.sales_time)                    48850 non-null float64
STD(trans.import_flg)                    48850 non-null float64
STD(trans.tot_amt)                       48850 non-null float64
STD(trans.dis_amt)                       48850 non-null float64
STD(trans.net_amt)                       48850 non-null float64
STD(trans.inst_mon)                      48850 non-null float64
STD(trans.inst_fee)                      48850 non-null float64
MAX(trans

In [5]:
# Preview the first rows of the generated feature matrix (indexed by custid).
derived_features.head()

Unnamed: 0_level_0,SUM(trans.sales_time),SUM(trans.import_flg),SUM(trans.tot_amt),SUM(trans.dis_amt),SUM(trans.net_amt),SUM(trans.inst_mon),SUM(trans.inst_fee),STD(trans.sales_time),STD(trans.import_flg),STD(trans.tot_amt),...,MODE(trans.team_nm),MODE(trans.buyer_nm),NUM_UNIQUE(trans.DAY(sales_date)),NUM_UNIQUE(trans.YEAR(sales_date)),NUM_UNIQUE(trans.MONTH(sales_date)),NUM_UNIQUE(trans.WEEKDAY(sales_date)),MODE(trans.DAY(sales_date)),MODE(trans.YEAR(sales_date)),MODE(trans.MONTH(sales_date)),MODE(trans.WEEKDAY(sales_date))
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,19328,7,1742000,174200,1567800,31,0,266.443786,0.504525,118540.518581,...,잡화가용팀,화장품,7,2,5,5,3,2000,9,6
1,39810,11,2772100,56630,2715470,64,0,263.38302,0.503831,109270.336393,...,의류패션팀,수입명품,12,2,7,5,13,2000,6,5
2,16908,1,3750850,255090,3495760,38,3,352.567002,0.301511,791625.02932,...,잡화가용팀,가전,7,2,6,3,27,2000,8,2
3,48122,0,2300500,91660,2208840,80,3,274.230199,0.0,191949.716985,...,잡화가용팀,스포츠,11,2,9,5,20,2000,7,3
4,5736,1,1045000,21800,1023200,18,2,21.087121,0.5,250313.636598,...,의류패션팀,캐릭터캐주얼,2,1,1,2,27,2000,7,3


In [6]:
# Encode the feature matrix for modelling:
#   1. drop the MODE(trans.goodcd) column before one-hot encoding,
#   2. one-hot encode the remaining categorical columns,
#   3. turn the custid index into a regular column so it can be merged on,
#   4. replace missing values with 0.
derived_features = (
    pd.get_dummies(derived_features.drop(columns=['MODE(trans.goodcd)']))
    .reset_index()
    .fillna(0)
)

# Split back into train / test: one row per customer, in the order customers
# first appear in each original file, joined to their features on custid.
X_train = pd.DataFrame({'custid': df_train.custid.unique()}).merge(
    derived_features, on='custid', how='left'
)
X_test = pd.DataFrame({'custid': df_test.custid.unique()}).merge(
    derived_features, on='custid', how='left'
)

# custid is an identifier, not a feature: remove it from both matrices, keeping
# the test IDs aside for the submission file.
IDtest = X_test.pop('custid')
X_train = X_train.drop(columns='custid')

# Training labels.
# NOTE(review): assumes the rows of y_train_hds.csv are in the same customer
# order as df_train.custid.unique() — TODO confirm.
y_train = pd.read_csv('y_train_hds.csv')['gender']

### Build Models

In [7]:
# Learn XGB
import sys
import warnings

from xgboost import XGBClassifier

# Silence all warnings, unless warning options were supplied explicitly
# (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Library-default hyperparameters apart from a fixed seed and n_jobs=-1.
model = XGBClassifier(random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

### Make Submissions

In [9]:
# Second column of predict_proba for every row of X_test; written out as the
# `gender` score.
pred = model.predict_proba(X_test)[:,1]
fname = 'submission_DFS.csv'

# Pair IDs and predictions by position. pd.concat(axis=1) would align them by
# index label instead, which silently produces NaN / misaligned rows if IDtest
# ever carries a non-default index.
assert len(IDtest) == len(pred), "number of test IDs and predictions differ"
submissions = pd.DataFrame(
    {IDtest.name: IDtest.values, "gender": pred},
    columns=[IDtest.name, "gender"],
)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submission_DFS.csv' is ready to submit.


## <font color="#CC3D3D">End</font>