In [None]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Load the four raw tables; dtype=str keeps every column as text so that
# codes/ids are not reinterpreted as numbers on read.
customer, online, product, transaction = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        '../data/customer.csv',
        '../data/online_behavior.csv',
        '../data/product.csv',
        '../data/transaction.csv',
    )
)

# 1. Customer

In [None]:
# Number of customer rows for each distinct clnt_gender value.
customer.groupby('clnt_gender').size()

In [None]:
# Bar chart of customer counts per gender.
# Tick positions and labels are derived from the counts themselves so the cell
# runs on a fresh kernel (they were previously undefined names).
gender_counts = customer.groupby('clnt_gender').size()
index = np.arange(len(gender_counts))
label = gender_counts.index
plt.bar(index, gender_counts)
plt.xticks(index, label, fontsize=15)
plt.show()

In [None]:
# Treat the literal string 'unknown' as a missing value.
# A single .loc[row_mask, column] assignment writes into `customer` itself;
# the chained form df[col].loc[mask] = ... can end up modifying a temporary.
customer.loc[customer['clnt_gender']=='unknown', 'clnt_gender'] = np.nan
customer.loc[customer['clnt_age']=='unknown', 'clnt_age'] = np.nan
# Gender x age contingency table (rows/columns with NaN keys are left out by crosstab).
pd.crosstab(customer['clnt_gender'],customer['clnt_age'])

In [None]:
# Write the current customer frame to a pickle file.
# NOTE(review): assumes the ../new_data/data_set/ directory already exists.
customer.to_pickle("../new_data/data_set/customer.pkl")

# 2. Online 

In [None]:
# Number of online-behaviour rows for each biz_unit value.
online.groupby('biz_unit').size()

In [None]:
# Keep only the rows that have a tot_pag_view_ct value.
online = online.loc[online['tot_pag_view_ct'].notnull()]

In [None]:
# Keep only the rows that have a tot_sess_hr_v value.
online = online.loc[online['tot_sess_hr_v'].notnull()]

In [None]:
# Write the filtered online frame to a pickle file.
online.to_pickle("../new_data/data_set/online.pkl")

# 3. Product 

In [None]:
# Rows whose third-level category name is missing (inspection only: this value
# is not rendered because it is not the last expression of the cell).
product.loc[product['clac_nm3'].isnull()]
# Fill clac_nm3 for product codes 196 and 524.
# .loc[row_mask, column] assigns into `product` directly; the chained
# df[col].loc[mask] = ... form may write to a temporary instead.
product.loc[product['pd_c']=='196', 'clac_nm3'] = 'Packaged ETC'
product.loc[product['pd_c']=='524', 'clac_nm3'] = 'Frozen ETC'
# Remaining null count per column.
product.isnull().sum()

In [None]:
# Write the current product frame to a pickle file.
product.to_pickle("../new_data/data_set/product.pkl")

# 4. Transaction 

In [None]:
# transform biz unit: collapse every code that starts with 'A' (or 'B') to the
# bare letter. str.startswith(..., na=False) tolerates missing/empty values,
# and .loc[row_mask, column] writes into `transaction` itself rather than into
# a possibly temporary column object.
transaction.loc[transaction['biz_unit'].str.startswith('A', na=False), 'biz_unit'] = 'A'
transaction.loc[transaction['biz_unit'].str.startswith('B', na=False), 'biz_unit'] = 'B'

In [None]:
# transform de_dt, de_tm
def make_hit_time(date, time):
    """Parse a date string and a time string into one ``datetime``.

    The two strings are concatenated and parsed with the format
    ``'%Y%m%d%H:%M'``, e.g. ``'20190701'`` and ``'10:56'``.
    """
    return datetime.strptime(date + time, '%Y%m%d%H:%M')

# Kept so that any cell calling make_hit_time on whole columns keeps working.
make_hit_time = np.vectorize(make_hit_time)
# Build the timestamp column with one vectorised pandas parse (same
# '%Y%m%d%H:%M' format) instead of a Python-level strptime call per row.
transaction['date_time'] = pd.to_datetime(
    transaction['de_dt'] + transaction['de_tm'], format='%Y%m%d%H:%M'
)
# The separate date/time text columns are no longer needed.
transaction = transaction.drop(columns=['de_dt','de_tm'])

In [None]:
# delete pd_c na: mark 'unknown' product codes as missing, then drop them.
# .loc[row_mask, column] assigns into `transaction` directly (the chained
# df[col].loc[mask] = ... form may modify a temporary).
transaction.loc[transaction['pd_c']=='unknown', 'pd_c'] = np.nan
# NOTE(review): dropna without `subset` removes rows with a missing value in
# ANY column, not only pd_c — confirm that is intended.
transaction = transaction.dropna(axis=0)

In [None]:
# Convert the amount and count columns to floats, then keep only rows where
# both are strictly positive.
numeric_cols = ['buy_am','buy_ct']
transaction[numeric_cols] = transaction[numeric_cols].astype(float)
both_positive = transaction[numeric_cols].gt(0).all(axis=1)
transaction = transaction.loc[both_positive]

In [None]:
# outlier: list the ten largest purchase amounts.
buy_am = sorted(transaction['buy_am'].astype(float), reverse=True)
buy_am[:10]

In [None]:
# Look up the extreme purchase amount and product code 382 (inspection only:
# neither value is rendered because they are not the cell's last expression).
transaction.loc[transaction['buy_am']==100000016899]
product.loc[product['pd_c']=='382']
# Remove the row with index label 19237. `index=` is passed by keyword because
# pandas 2.0 no longer accepts the axis as a second positional argument.
# NOTE(review): presumably 19237 is the row found by the lookup above — confirm.
transaction = transaction.drop(index=19237)

In [None]:
# (rows, columns) of the transaction frame at this point.
transaction.shape

In [None]:
# Write the current transaction frame to a pickle file.
transaction.to_pickle("../new_data/data_set/transaction.pkl")

In [None]:
# Preview the first rows of the transaction frame.
transaction.head()

# Clac data

## 1) online 

In [None]:
# Customer x clac_nm1 purchase-count matrix for biz_unit 'A'.
# Row/column selection and the int cast are done in one expression so that no
# column is assigned on a frame derived from another frame.
tran = transaction.loc[transaction['biz_unit']=='A', ['clnt_id','pd_c']].astype({'pd_c': int})
clac = product[['pd_c','clac_nm1']].astype({'pd_c': int})
# Attach the category name to every transaction row.
clac = pd.merge(tran,clac,on='pd_c',how='left')
# Each transaction row counts once; the pivot sums these per customer/category.
clac['count'] = 1
clac = pd.pivot_table(clac,index='clnt_id',columns='clac_nm1',values='count',aggfunc='sum')
# Customer/category pairs with no purchases become 0.
clac = clac.fillna(0)

In [None]:
# Write the customer x category count matrix to a pickle file.
clac.to_pickle("../new_data/data_set/online_clac.pkl")

In [None]:
# Heatmap of the pairwise correlations between category columns.
plt.figure(figsize=(10, 10))
sns.heatmap(clac.corr(), cmap='Blues', linewidths=0.5)

In [None]:
# Group the category columns into n clusters, using each column's row of the
# correlation matrix as its feature vector.
# AgglomerativeClustering is imported with the other imports at the top of the
# notebook rather than inside this cell.
n = 4
model = AgglomerativeClustering(n_clusters=n)
y_predict = model.fit_predict(clac.corr())

In [None]:
# Reorder the columns of `clac` so that columns with the same cluster label
# are adjacent (cluster 0 first, then 1, ...).
ordered_cols = [col for i in range(n) for col in clac.columns[y_predict==i]]
clac_new = clac[ordered_cols]

In [None]:
# Category columns that received cluster label 3.
clac.columns[y_predict==3]

In [None]:
# Correlation heatmap again, now with the columns ordered by cluster.
plt.figure(figsize=(10, 10))
sns.heatmap(clac_new.corr(), cmap='Blues', linewidths=0.5)

In [None]:
# Lookup table mapping every category column of `clac` (clac_nm1) to its
# 1-based cluster number. Built directly from y_predict so it follows whatever
# number of clusters was fitted, instead of one copy-pasted stanza per cluster.
# Rows are ordered by group, then by category name.
group = (
    pd.DataFrame({'clac_nm1': clac.columns, 'group': y_predict + 1})
    .sort_values(['group', 'clac_nm1'])
    .reset_index(drop=True)
)

In [None]:
# Attach the cluster number to every product row via its clac_nm1; products
# whose category has no cluster keep NaN in `group`.
group = product.merge(group, on='clac_nm1', how='left')

In [None]:
# Write the product-to-cluster table to a pickle file.
group.to_pickle("../new_data/data_set/online_group.pkl")

## 2) offline

In [None]:
# Customer x clac_nm1 purchase-count matrix for biz_unit 'B'.
# Row/column selection and the int cast are done in one expression so that no
# column is assigned on a frame derived from another frame.
tran = transaction.loc[transaction['biz_unit']=='B', ['clnt_id','pd_c']].astype({'pd_c': int})
clac = product[['pd_c','clac_nm1']].astype({'pd_c': int})
# Attach the category name to every transaction row.
clac = pd.merge(tran,clac,on='pd_c',how='left')
# Each transaction row counts once; the pivot sums these per customer/category.
clac['count'] = 1
clac = pd.pivot_table(clac,index='clnt_id',columns='clac_nm1',values='count',aggfunc='sum')
# Customer/category pairs with no purchases become 0.
clac = clac.fillna(0)

In [None]:
# Write the customer x category count matrix to a pickle file.
clac.to_pickle("../new_data/data_set/offline_clac.pkl")

In [None]:
# Heatmap of the pairwise correlations between category columns.
plt.figure(figsize=(10, 10))
sns.heatmap(clac.corr(), cmap='Blues', linewidths=0.5)

In [None]:
# Group the category columns into n clusters, using each column's row of the
# correlation matrix as its feature vector.
# AgglomerativeClustering is already imported earlier in the notebook, so the
# duplicate in-cell import is not repeated here.
n = 4
model = AgglomerativeClustering(n_clusters=n)
y_predict = model.fit_predict(clac.corr())

In [None]:
# Reorder the columns of `clac` so that columns with the same cluster label
# are adjacent (cluster 0 first, then 1, ...).
ordered_cols = [col for i in range(n) for col in clac.columns[y_predict==i]]
clac_new = clac[ordered_cols]

In [None]:
# Category columns that received cluster label 3.
clac.columns[y_predict==3]

In [None]:
# Correlation heatmap again, now with the columns ordered by cluster.
plt.figure(figsize=(10, 10))
sns.heatmap(clac_new.corr(), cmap='Blues', linewidths=0.5)

In [None]:
# Lookup table mapping every category column of `clac` (clac_nm1) to its
# 1-based cluster number. Built directly from y_predict so it follows whatever
# number of clusters was fitted, instead of one copy-pasted stanza per cluster.
# Rows are ordered by group, then by category name.
group = (
    pd.DataFrame({'clac_nm1': clac.columns, 'group': y_predict + 1})
    .sort_values(['group', 'clac_nm1'])
    .reset_index(drop=True)
)

In [None]:
# Attach the cluster number to every product row via its clac_nm1; products
# whose category has no cluster keep NaN in `group`.
group = product.merge(group, on='clac_nm1', how='left')

In [None]:
# Write the product-to-cluster table to a pickle file.
group.to_pickle("../new_data/data_set/offline_group.pkl")

In [None]:
# Number of product rows per cluster group (rows with a NaN group are not counted).
group.groupby('group').size()