In [267]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
%pylab inline
color = sns.color_palette()
#https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-v3-0

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [265]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
  
    cols = [x for x in list(df.columns) ]
    
    for col in tqdm(cols):
        col_type = df[col].dtype

        if col_type not in [object]:

            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [268]:
# 데이터 불러와서 메모리 최적화 시킨 후 객체 저장하기

df= pd.read_csv('train_ver2.csv')
df = reduce_mem_usage(df)
df["fecha_dato"] = pd.to_datetime(df["fecha_dato"],format="%Y-%m-%d")
df["fecha_alta"] = pd.to_datetime(df["fecha_alta"],format="%Y-%m-%d")
df.to_pickle('train_pkl.pkl')

tst= pd.read_csv('test_ver2.csv')
tst= reduce_mem_usage(tst)
tst["fecha_dato"] = pd.to_datetime(tst["fecha_dato"],format="%Y-%m-%d")
tst["fecha_alta"] = pd.to_datetime(tst["fecha_alta"],format="%Y-%m-%d")
tst.to_pickle('test_pkl.pkl')



  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [258]:
# df = pd.read_pickle('train_pkl.pkl')
# tst= pd.read_pickle('test_pkl.pkl')

In [269]:
df.columns[24:]
for col in df.columns[24:]:
    tst[col] = 0
df = pd.concat([df,tst],axis = 0)

In [270]:
# 24개 금융 제품에 대한 '신규 구매 데이터 생성하기'
prods = df.columns[24:].tolist()

# 날짜를 숫자로 변환  : 
def date_to_int(str_date):
    Y,M,D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y)- 2015)* 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].astype('str').map(date_to_int).astype(np.int8)

# int_date를 기반으로 lag를 생성한다. 데이터를 복사해서
df_lag = df.copy()
df_lag['int_date'] +=1
df_lag.columns = [col+'_prev' if col not in ['ncodpers','int_date'] else col for col in df.columns] ## 컬럼 이름 바꿔주기

# 원본데이터와 lag 데이터 합치기 : ncodpers, int_date 기준으로
df = pd.merge(df,df_lag, on= ['ncodpers','int_date'], how = 'left')

del df_lag
gc.collect()
#저번달의 제품 정보가 없으면 0으로 대체
for prod in prods:
    prev = prod+'_prev'
    df[prev].fillna(0, inplace = True)


#     tst[prev].fillna(0, inplace =True)
    
# #신규 구매 변수 padd
# for prod in prods : 
#     padd = prod + '_add'
#     prev = prod + '_prev'
#     df[padd] = ((df[prod] ==1)& (df[prev]==0)).astype(np.int8) # 이전에는 0이었는데 이번에 1이된 상품에 1
    
# # 신규 구매 변수만 추출 
# add_cols = [prod + '_add' for prod in prods]
# labels = df[add_cols].copy()
# labels.columns = prods



In [None]:
# df[prev].fillna(-999, inplace =True)

In [None]:
# 중복 제거가 아니라 신규 구매한 row만 가지고 학습 
X, Y = [],[]
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = df[(df[prod]==1)& (df[prev]==0)] # 이전 0인데 이번에 신규로 구매한 row들 찾기
    prY = np.zeros(prX.shape[0], dtype = np.int8) +i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y


    

In [None]:
XY.shape

In [224]:
df = XY.copy()

In [225]:
# Target Columns 이 총 24개 이다. 

# http://alanpryorjr.com/2016-12-19-Kaggle-Competition-Santander-Solution/ 
print(df.shape)
print(tst.shape)

(1778875, 97)
(929615, 48)


In [7]:
# # Visualization
# # Month
# df['month'] = pd.DatetimeIndex(df['fecha_dato']).month
# df['year'] = pd.DatetimeIndex(df['fecha_dato']).year


## 데이터 전처리 및 모델링

In [226]:
# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# from xgboost import XGBClassifier
import xgboost as xgb
# import lightgbm as lgb

In [9]:
# 전처리
# null 값이 많은 것만 제거하기
# df.isnull().sum()
# df = df[~df['ind_empleado'].isna()]


In [227]:
# target_cols = ['ind_cco_fin_ult1', 'ind_cder_fin_ult1',
#                              'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
#                              'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
#                              'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1',
#                              'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
#                              'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
#                              'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
#                              'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
#                              'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
#                              'ind_viv_fin_ult1', 'ind_nomina_ult1',
#                              'ind_nom_pens_ult1', 'ind_recibo_ult1',
#                              'ind_ahor_fin_ult1', 'ind_aval_fin_ult1' ]
# cols = [col for col in XY.columns if col[-4:] =='prev'] 
# target_cols = target_cols + cols  +['y']
# x_train = XY.drop(['y'], axis =1)
# y_train = XY['y']
# # y_train = y_train.drop(['ind_ahor_fin_ult1', 'ind_aval_fin_ult1'],axis =1)
# x_test = tst.copy()

x_train = XY[XY['fecha_dato']!='2016-05-28']
y_train = x_train['y']
x_train = x_train.drop(['y'],axis =1)
x_test = XY[XY['fecha_dato']=='2016-05-28']

In [228]:
x_train['age'] = x_train['age'].astype('str').map(str.strip).replace(['NA'],value=-999).astype(float)
x_test['age'] = x_test['age'].astype('str').map(str.strip).replace(['NA'],value=-999).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [229]:
# antiguedad 전처리

x_train['antiguedad'] = x_train['antiguedad'].astype('str').map(str.strip)
x_train['antiguedad'] = x_train['antiguedad'].replace(['NA'],value=-999).astype('float')
x_test['antiguedad'] = x_test['antiguedad'].astype('str').map(str.strip)
x_test['antiguedad'] = x_test['antiguedad'].replace(['NA'],value=-999).astype('float')
    
# x_train['antiguedad'].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [230]:
# indrel_1mes 전처리
x_train['indrel_1mes'] = x_train['indrel_1mes'].astype('str').map(str.strip)
x_train['indrel_1mes'] = x_train['indrel_1mes'].replace(['P'],value=999)
x_train['indrel_1mes'] = x_train['indrel_1mes'].replace(['NA'],value=-999).astype('float')
x_test['indrel_1mes'] = x_test['indrel_1mes'].astype('str').map(str.strip)
x_test['indrel_1mes'] = x_test['indrel_1mes'].replace(['P'],value=999)
x_test['indrel_1mes'] = x_test['indrel_1mes'].replace(['NA'],value=-999).astype('float')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [231]:
# test renta value 처리

x_train['renta']= x_train['renta'].astype('str').map(str.strip).replace(['NA'],value = np.nan).astype(float64)
x_test['renta']= x_test['renta'].astype('str').map(str.strip).replace(['NA'],value = np.nan).astype(float64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [232]:
# 고객 별 방문 건수 
# x_train['n_counts'] = x_train['ncodpers'].map(lambda x : x_train['ncodpers'].value_counts())
# x_test['n_counts'] = x_train['ncodpers'].map(lambda x : x_train['ncodpers'].value_counts())
tmp = x_train['ncodpers'].value_counts().reset_index().rename(columns = {'ncodpers':'n_counts'})
x_train = pd.merge(x_train,tmp, how = 'left', left_on =['ncodpers'], right_on =['index'] )
x_test = pd.merge(x_test,tmp, how = 'left', left_on =['ncodpers'], right_on =['index'] )


In [233]:
y_train.shape

(1740978,)

In [234]:
# # 중복 row 제거
# distinct = list(pd.concat([x_train.drop(['fecha_dato','age'],axis = 1),y_train], axis =1 ).drop_duplicates().index)
# x_train = x_train.loc[distinct]
# y_train = y_train.loc[distinct]
# print(x_train.shape)

(array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 Index([        -999, '2015-07-02', '2015-07-23', '2015-07-06', '2015-07-30',
        '2015-07-20', '2015-07-08', '2015-07-22', '2015-07-17', '2015-07-09',
        ...
        '2016-02-22', '2016-03-30', '2016-03-07', '2016-04-05', '2016-04-18',
        '2016-04-04', '2016-04-26', '2016-04-20', '2016-04-22', '2016-04-15'],
       dtype='object', length=113))

In [236]:
# 
x_train['month'] = pd.DatetimeIndex(x_train['fecha_dato']).month
x_test['month'] = pd.DatetimeIndex(x_test['fecha_dato']).month

x_train['year'] = pd.DatetimeIndex(x_train['fecha_dato']).year
x_test['year'] = pd.DatetimeIndex(x_test['fecha_dato']).year


In [237]:
# fecha_ alta 로 월 주 
# 
x_train['start_month'] = pd.DatetimeIndex(x_train['fecha_alta']).month
x_test['start_month'] = pd.DatetimeIndex(x_test['fecha_alta']).month

x_train['start_year'] = pd.DatetimeIndex(x_train['fecha_alta']).year
x_test['start_year'] = pd.DatetimeIndex(x_test['fecha_alta']).year

#추가하기 


In [238]:
#tipodom /  cod_prov /fecha_dato
x_train = x_train.drop(['fecha_dato','fecha_alta','tipodom','cod_prov'],axis =1)
x_test = x_test.drop(['fecha_dato','fecha_alta','tipodom','cod_prov'],axis =1) 

In [239]:
# x_train[x_train['indfall']==2]#.value_counts(dropna = False)
# x_test[x_test['ncodpers']==1054155].T

In [255]:
x_train = x_train.fillna(-999)
x_test = x_test.fillna(-999)

In [256]:
# Label encoding , oneho   ->> factorize로 변경해보자
cat_cols= x_train.select_dtypes(include=['category']).columns #+ x_train.select_dtypes(include=['object']).columns
for col in tqdm(cat_cols) :
#     print(col)
    try :         
        x_train[col] = x_train[col].fillna(-999)
        x_test[col] = x_test[col].fillna(-999)
    except :
#         print(1)
        x_train[col] = x_train[col].cat.add_categories(-999).fillna(-999)
        x_test[col] = x_test[col].cat.add_categories(-999).fillna(-999)
#     print(df[col].isnull().sum())
    le = preprocessing.LabelEncoder()
    le.fit(list(x_train[col].values) + list(x_test[col].values))
    
    x_train[col] = le.transform(list(x_train[col].values))
    x_test[col] = le.transform(list(x_test[col].values))
    
#     print(le.transform(df[col]))

0it [00:00, ?it/s]


In [290]:
cols = x_train.dtypes[x_train.dtypes =='object'].index.tolist() 
for col in cols:
    x_train[col],_ = x_train[col].factorize()
    x_test[col],_ = x_test[col].factorize()

In [304]:
x_test =x_test.drop(['y'],axis =1)

In [None]:
%%time
n_fold = 3
folds = KFold(n_splits=n_fold, shuffle =True)
rounds =  5000
# seed = 99
params = {'booster' : 'gbtree',
          'n_estimators' : 500,
          'max_depth':8,
          'objective' : 'multi:softprob',
          'learning_rate':0.01,
          'subsample':0.85,
          'colsample_bytree':0.85,
          'missing':-999,
          'eval_metric' : 'mlogloss',
        'tree_method':'gpu_hist',  # THE MAGICAL PARAMETER
          'reg_alpha':0.15,
          'reg_lamdba':0.85,
          'n_jobs' :12 ,
          'seed' : 99,
          'num_class' :24,
          'verbosity' : 1
         }
preds_list = []
for fold, (train_idx,valid_idx) in enumerate(folds.split(x_train)):
#     print(fold)
#     clf= xgb.XGBClassifier(params)
    x_train_,x_valid_ = x_train.iloc[train_idx], x_train.iloc[valid_idx]
    y_train_,y_valid_ = y_train.iloc[train_idx], y_train.iloc[valid_idx]
#     print(y_train_)
    xgtrain = xgb.DMatrix(x_train_, label=y_train_.values)
    xgvalid = xgb.DMatrix(x_valid_, label=y_valid_.values)
    model= xgb.train(params, xgtrain, rounds,evals = [(xgtrain,'train'),(xgvalid,'valid')]
                     , verbose_eval = 30
                     , early_stopping_rounds = 50)#x_train_,y_train_.values)
#     print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
    del xgtrain, xgvalid
    xgtest = xgb.DMatrix(x_test)
    preds = model.predict(xgtest)
    model.__del__() 
#     del model , xgtest
    gc.collect()
    preds_list.append(preds)
#     clf.predict(x_valid)
    

[0]	train-mlogloss:3.10646	valid-mlogloss:3.10651
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[30]	train-mlogloss:2.11291	valid-mlogloss:2.11411
[60]	train-mlogloss:1.68238	valid-mlogloss:1.68445
[90]	train-mlogloss:1.41656	valid-mlogloss:1.41933
[120]	train-mlogloss:1.23321	valid-mlogloss:1.2366
[150]	train-mlogloss:1.09971	valid-mlogloss:1.10367
[180]	train-mlogloss:1.00041	valid-mlogloss:1.00488
[210]	train-mlogloss:0.926161	valid-mlogloss:0.931096
[240]	train-mlogloss:0.868993	valid-mlogloss:0.874377
[270]	train-mlogloss:0.824353	valid-mlogloss:0.830187
[300]	train-mlogloss:0.789463	valid-mlogloss:0.795724
[330]	train-mlogloss:0.762112	valid-mlogloss:0.768811
[360]	train-mlogloss:0.740278	valid-mlogloss:0.747408
[390]	train-mlogloss:0.722754	valid-mlogloss:0.730337


In [198]:

# 원래 보유하던 상품 찾기
cols =  [ 'ncodpers']+[prod +'_prev' for prod in prods] 
# test에 붙이는 것
tmp = pd.merge(x_test['ncodpers'],df[cols].groupby(['ncodpers']).max(), how = 'left', on =['ncodpers'])
tmp = tmp.fillna(0)
tmp = tmp.as_matrix(columns = [prod +'_prev' for prod in prods] )
tmp

  


array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [199]:

from datetime import datetime

target_cols = np.array(target_cols)
prob_list = np.mean(preds_list, axis =0)
prob_list -= tmp  # 원래 보유하던거 제거 / 확률에서 빼기
preds = np.argsort(prob_list, axis =1)

preds = np.fliplr(preds)[:,:7] #좌우 방향 변경 , 상위 7개 선정
test_id = np.array(pd.read_csv("test_ver2.csv", usecols=['ncodpers'])['ncodpers'])
f_preds =  [" ".join(list(target_cols[pred])) for pred in preds]
sub_df = pd.DataFrame({'ncodpers':test_id, 'added_products':f_preds})
sub_df.to_csv('submission_baseline_XGBOOST_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



In [None]:
import lightgbm as lgb
seed = 99
folds = 3
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class' :24,
    'metric': 'multi_logloss',
    
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
#     "num_threads" : 10,
    "max_bin" : 255,
    'seed' : seed
    
}

categorical_features = ["ncodpers","ind_empleado","pais_residencia","sexo","ind_nuevo","antiguedad","indrel","indrel_1mes"
                         ,"tiprel_1mes","indresi","indext","conyuemp","canal_entrada","indfall","nomprov","ind_actividad_cliente"
                        ,"segmento"]



In [None]:
%%time
preds_list = []
# shuffle = False

kf = KFold(n_splits=folds, shuffle=False, random_state=seed)
scores = [] 
models = []
for train_index, val_index in kf.split(x_train):
    train_X = x_train.iloc[train_index]
    val_X = x_train.iloc[val_index]
    train_y = y_train.iloc[train_index]
    val_y = y_train.iloc[val_index]
    print(train_X.shape , train_y.shape)
    print(val_X.shape , val_y.shape)
    lgb_train = lgb.Dataset(train_X, label = train_y.values,categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(val_X, label = val_y.values,categorical_feature=categorical_features)
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000, #300,
                    valid_sets=(lgb_train, lgb_eval),
#                     feval=rmsle,
                    early_stopping_rounds= 50,#100,
                    verbose_eval=30) #100)

    preds = model.predict(x_test)
    gc.collect()
    preds_list.append(preds)
    
    del gbm
    
    

In [None]:

from datetime import datetime
target_cols = np.array(target_cols)
preds = np.argsort(np.mean(preds_list, axis =0), axis =1)# 작은 순서대로 Index를 Return
preds = np.fliplr(preds)[:,:7] #좌우 방향 변경 , 상위 7개 선정
test_id = np.array(pd.read_csv("test_ver2.csv", usecols=['ncodpers'])['ncodpers'])
f_preds =  [" ".join(list(target_cols[pred])) for pred in preds]
sub_df = pd.DataFrame({'ncodpers':test_id, 'added_products':f_preds})
sub_df.to_csv('submission_baseline_lightgbm_' + str(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + '.csv', index=False)



In [None]:
sub_df