In [575]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [576]:
DIR_PATH = '/Users/carlosperezricardo/Documents/data/'
commercial_activity = pd.read_csv(DIR_PATH+'commercial_activity_df.csv', encoding='utf-8')
commercial_activity.drop(columns=['Unnamed: 0'], inplace=True)

products = pd.read_csv(DIR_PATH+'products_df.csv', encoding='utf-8')
products.drop(columns=['Unnamed: 0'], inplace=True)

sociodemographic = pd.read_csv(DIR_PATH+'sociodemographic_df.csv', encoding='utf-8')
sociodemographic.drop(columns=['Unnamed: 0'], inplace=True)

In [577]:
df_whole = pd.merge( commercial_activity, products, on = ['pk_cid','pk_partition'] )
df_whole = pd.merge( df_whole, sociodemographic, on=['pk_cid','pk_partition'] )

In [578]:
df_whole.head(3).T

Unnamed: 0,0,1,2
pk_cid,1375586,1050611,1050612
pk_partition,2018-01-28,2018-01-28,2018-01-28
entry_date,2018-01-12,2015-08-10,2015-08-10
entry_channel,KHL,KHE,KHE
active_customer,1,0,0
segment,02 - PARTICULARES,03 - UNIVERSITARIO,03 - UNIVERSITARIO
short_term_deposit,0,0,0
loans,0,0,0
mortgage,0,0,0
funds,0,0,0


In [579]:
partitions = ['2018-01-28','2018-02-28','2018-03-28','2018-04-28','2018-05-28','2018-06-28', \
    '2018-07-28','2018-08-28','2018-09-28','2018-10-28','2018-11-28','2018-12-28','2019-01-28', \
        '2019-02-28','2019-03-28','2019-04-28','2019-05-28']

list_products = ['short_term_deposit','loans','mortgage','funds','securities',
    'long_term_deposit','em_account_pp','credit_card','pension_plan',
    'payroll_account','emc_account','debit_card','em_account_p','em_acount']

products_dict = {"short_term_deposit":"ahorro e inversión", "loans":"financiación", "mortgage":"financiación", 
    "funds":"ahorro e inversión", "securities":"ahorro e inversión", "long_term_deposit":"ahorro e inversión", 
    "em_account_pp":"cuenta", "credit_card":"financiación", "payroll_account":"cuenta", "pension_plan":"ahorro e inversión", 
    "emc_account":"cuenta", "debit_card":"financiación", "em_account_p":"cuenta", "em_acount":"cuenta"}

cost_product = {'cuenta':10, 'ahorro e inversión':40, 'financiación':60}

In [580]:
def determinar_altas(data):
    data.columns=['pk_partition','pk_cid','product']
    data['prev'] = data.groupby('pk_cid')['product'].shift(1)
    data['diff'] = data['product'] - data['prev']

    # Solo queremos 1 
    data['diff'] = np.where( (data['product']==1) & (data['diff'].isna()) & (data['pk_partition']!='2018-01-28'), 1, data['diff'] )
    data['diff'].fillna(0,inplace=True)
    data['diff'] = np.where( data['diff'] == -1, 0, data['diff'] )

    return data['diff']

In [581]:
df = df_whole.copy(deep=True)

In [582]:
for prod in list_products:
    df[prod] = determinar_altas( df_whole[['pk_partition','pk_cid',prod]] )
    print(prod)


short_term_deposit
loans
mortgage
funds
securities
long_term_deposit
em_account_pp
credit_card
pension_plan
payroll_account
emc_account
debit_card
em_account_p
em_acount


In [583]:
# Total altas
df['total'] = df[list_products].sum(axis=1)

In [584]:
df = df.sort_values(['pk_cid','pk_partition'])
df.head(3).T

Unnamed: 0,1479563,2168122,2962973
pk_cid,15891,15891,16063
pk_partition,2018-07-28,2018-08-28,2018-11-28
entry_date,2018-07-28,2018-07-28,2018-11-19
entry_channel,KAT,KAT,KAT
active_customer,1,0,1
segment,,02 - PARTICULARES,
short_term_deposit,0,0,0
loans,0,0,0
mortgage,0,0,0
funds,0,0,0


# Tiempo desde la ultima compra

In [585]:
partitions

['2018-01-28',
 '2018-02-28',
 '2018-03-28',
 '2018-04-28',
 '2018-05-28',
 '2018-06-28',
 '2018-07-28',
 '2018-08-28',
 '2018-09-28',
 '2018-10-28',
 '2018-11-28',
 '2018-12-28',
 '2019-01-28',
 '2019-02-28',
 '2019-03-28',
 '2019-04-28',
 '2019-05-28']

In [586]:
def determinar_tiempo_desde_ultima_compra( data, partitions):
    # pk_cid, pk_partition, total
    data['pk_partition'] = data['pk_partition'].astype(object)
    altas = data[ data['total'] >= 1]

    #print(altas)

    altas_pt = pd.pivot_table(altas, values='total', index=['pk_cid'],
                    columns=['pk_partition'], aggfunc=np.max)
    
    #altas_pt.fillna( '2001-01-01', inplace=True)
    #print(altas_pt)
    #print(altas_pt.isnull().sum())
    
    data = pd.merge( data, altas_pt, how='left', on='pk_cid' )
    
    cols = ['pk_cid','pk_partition']
    data.columns = cols + partitions # pone las particiones como datetime
    
    for part, i in zip(partitions, range(len(partitions))):
        data[part] = np.where(data[part] >= 1, 1, data[part])

        # No han comprado nada ese mes
        data[part] = np.where(data[part] == 0, -1, data[part])
        # Han comprado 1 producto o mas
        data[part] = np.where(data[part] == 1, i, data[part])
        # No aparece info de ellos en esa partition NaN o no compraron nada
        data[part] = np.where(data[part].isna(), -1, data[part])
        
        data[part] = data[part].replace( {i:part, -1:'2001-01-01'})
#        print( data[part].value_counts() )
        data[part] = pd.to_datetime(data[part])

#    print(data)    
    #

    data['pk_partition'] = pd.to_datetime(data['pk_partition'])
    #data[partitions].fillna('2021-01-01', inplace=True)
    #print(data)

    ## CALCULAMOS TIEMPO DESDE ULTIMA COMPRA
    data['last_compra'] = datetime(2000,1,1)

    for part in partitions:
        data['last_compra'] = np.where(data[part] > data['last_compra'], data[part], data['last_compra'])

    #    data['tiempo_ult_compra']
    data['tiempo_ult_compra'] = round(((data['pk_partition'] - data['last_compra'])/np.timedelta64(1, 'M')))

    data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'] > 100, 20, data['tiempo_ult_compra'] )
    data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'] < 0, 20, data['tiempo_ult_compra'] )

    return data[['pk_cid','pk_partition','tiempo_ult_compra']]
    # Hay NaNs (los clientes que nunca han comprado nada)
    #data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'].isna(), 17, data['tiempo_ult_compra'] )

    # Hay negativos (clientes que compraron algo)
    #data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'].isna(), 17, data['tiempo_ult_compra'] )
    
    #print(data)

In [587]:
#determinar_tiempo_desde_ultima_compra( df[['pk_cid','pk_partition','total']], partitions)

In [588]:
data = df[['pk_cid','pk_partition','total']]

In [589]:
data['pk_partition'] = data['pk_partition'].astype(object)
altas = data[ data['total'] >= 1]


altas_pt = pd.pivot_table(altas, values='total', index=['pk_cid'],
                    columns=['pk_partition'], aggfunc=np.max)
    
    #altas_pt.fillna( '2001-01-01', inplace=True)
    #print(altas_pt)
    #print(altas_pt.isnull().sum())
    
data = pd.merge( data, altas_pt, how='left', on='pk_cid' )
    
cols = ['pk_cid','pk_partition']
data.columns = cols + partitions # pone las particiones como datetime

for part, i in zip(partitions, range(len(partitions))):
    data[part] = np.where(data[part] >= 1, 1, data[part])

    # No han comprado nada ese mes
    data[part] = np.where(data[part] == 0, -1, data[part])
    # Han comprado 1 producto o mas
    data[part] = np.where(data[part] == 1, i, data[part])
    # No aparece info de ellos en esa partition NaN o no compraron nada
    data[part] = np.where(data[part].isna(), -1, data[part])
        
    data[part] = data[part].replace( {i:part, -1:'2001-01-01'})
#   print( data[part].value_counts() )
    data[part] = pd.to_datetime(data[part])



In [590]:
data['pk_partition'] = pd.to_datetime(data['pk_partition'])
    #data[partitions].fillna('2021-01-01', inplace=True)
    #print(data)

## CALCULAMOS TIEMPO DESDE ULTIMA COMPRA
data['last_compra'] = datetime(2001,1,1)

for part in partitions:
    data['last_compra'] = np.where( (data[part] > data['last_compra']) & (part <= data['pk_partition']), data[part], data['last_compra'])

data['tiempo_ult_compra'] = round(((data['pk_partition'] - data['last_compra'])/np.timedelta64(1, 'M')))

data['prev'] = data.groupby('pk_cid')['tiempo_ult_compra'].shift(1)

data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra']==0, data['prev']+1, data['tiempo_ult_compra'])

In [591]:
data[ data['pk_cid'] == 17457]

Unnamed: 0,pk_cid,pk_partition,2018-01-28,2018-02-28,2018-03-28,2018-04-28,2018-05-28,2018-06-28,2018-07-28,2018-08-28,...,2018-11-28,2018-12-28,2019-01-28,2019-02-28,2019-03-28,2019-04-28,2019-05-28,last_compra,tiempo_ult_compra,prev
24,17457,2018-01-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,205.0,
25,17457,2018-02-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,206.0,205.0
26,17457,2018-03-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,207.0,206.0
27,17457,2018-04-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,208.0,207.0
28,17457,2018-05-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,209.0,208.0
29,17457,2018-06-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,210.0,209.0
30,17457,2018-07-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,211.0,210.0
31,17457,2018-08-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2001-01-01,212.0,211.0
32,17457,2018-09-28,2018-01-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2018-09-28,213.0,212.0
33,17457,2018-10-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2018-11-28,2001-01-01,2019-01-28,2001-01-01,2019-03-28,2001-01-01,2019-05-28,2018-09-28,1.0,0.0


In [592]:
data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'] < 0, 20, data['tiempo_ult_compra'] )
data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'] > 100, 20, data['tiempo_ult_compra'] )
data['tiempo_ult_compra'] = np.where( data['tiempo_ult_compra'].isna(), 20, data['tiempo_ult_compra'] )

In [593]:
data

Unnamed: 0,pk_cid,pk_partition,2018-01-28,2018-02-28,2018-03-28,2018-04-28,2018-05-28,2018-06-28,2018-07-28,2018-08-28,...,2018-11-28,2018-12-28,2019-01-28,2019-02-28,2019-03-28,2019-04-28,2019-05-28,last_compra,tiempo_ult_compra,prev
0,15891,2018-07-28,2018-01-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2018-07-28,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2018-07-28,20.0,
1,15891,2018-08-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2018-07-28,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2018-07-28,1.0,0.0
2,16063,2018-11-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,
3,16063,2018-12-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,215.0
4,16063,2019-01-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962919,1553685,2019-05-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,
5962920,1553686,2019-05-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,
5962921,1553687,2019-05-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,
5962922,1553688,2019-05-28,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,...,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,20.0,


In [594]:
data_to_join = data[['pk_cid','pk_partition','tiempo_ult_compra']]
#data_to_join['pk_partition'] = data_to_join['pk_partition']

In [595]:
df['pk_partition'] = pd.to_datetime(df['pk_partition'])

In [596]:
df = pd.merge(df, data_to_join, on=['pk_cid','pk_partition'], how='left')

In [597]:
df['inversion'] = 0
for key, value in products_dict.items():
    df['inversion'] += df[key]*cost_product[value]

# Nulos

In [598]:
df.isnull().sum()

pk_cid                      0
pk_partition                0
entry_date                  0
entry_channel          133033
active_customer             0
segment                133944
short_term_deposit          0
loans                       0
mortgage                    0
funds                       0
securities                  0
long_term_deposit           0
em_account_pp               0
credit_card                 0
payroll                    61
pension_plan                0
payroll_account             0
emc_account                 0
debit_card                  0
em_account_p                0
em_acount                   0
country_id                  0
region_code              2264
gender                     25
age                         0
deceased                    0
salary                1512103
total                       0
tiempo_ult_compra           0
inversion                   0
dtype: int64

In [599]:
df['salary'].fillna(-9999, inplace=True)

In [600]:
df = df[ df['deceased'] != 'S']

In [601]:
df = df[ (df['age'] >= 18) & (df['age'] <= 90)]

In [602]:
cols_to_drop = ['entry_channel','segment','entry_date','gender','payroll','deceased']
df.drop(cols_to_drop, axis=1, inplace=True)
df.drop(list_products, axis=1, inplace=True)

In [603]:
df.shape

(5921197, 10)

In [604]:
df.columns

Index(['pk_cid', 'pk_partition', 'active_customer', 'country_id',
       'region_code', 'age', 'salary', 'total', 'tiempo_ult_compra',
       'inversion'],
      dtype='object')

In [605]:
df['region_code'].fillna(-99, inplace=True)
#df['region_code'].fillna(-99, inplace=True)

In [606]:
df.isnull().sum()

pk_cid               0
pk_partition         0
active_customer      0
country_id           0
region_code          0
age                  0
salary               0
total                0
tiempo_ult_compra    0
inversion            0
dtype: int64

In [607]:
from sklearn.preprocessing import LabelEncoder
df["country_id"] = LabelEncoder().fit_transform(df["country_id"])

In [608]:
df['year'] = df['pk_partition'].dt.year
df['month'] = df['pk_partition'].dt.month
df.drop('pk_partition', axis=1, inplace=True)

In [609]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5921197 entries, 0 to 5962923
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pk_cid             int64  
 1   active_customer    float64
 2   country_id         int64  
 3   region_code        float64
 4   age                int64  
 5   salary             float64
 6   total              float64
 7   tiempo_ult_compra  float64
 8   inversion          float64
 9   year               int64  
 10  month              int64  
dtypes: float64(6), int64(5)
memory usage: 542.1 MB


In [610]:
df['total'].value_counts()

0.0    5642911
1.0     255379
2.0      21010
3.0       1821
4.0         71
5.0          5
Name: total, dtype: int64

In [611]:
subset_positives = df[ df['total'] == 1 ]
subset_negatives = df[ df['total'] == 0 ].sample( subset_positives.shape[0]*5 )

subset_negatives.shape

(1276895, 11)

In [612]:
target = 'total'

In [613]:
df_final = subset_negatives.copy(deep=True)
df_final = df_final.append( subset_positives, ignore_index=True )

df_final.shape

(1532274, 11)

In [614]:
df_final['total'].value_counts()

0.0    1276895
1.0     255379
Name: total, dtype: int64

In [615]:
df_final.head(3).T

Unnamed: 0,0,1,2
pk_cid,1217904.0,1504821.0,1078654.0
active_customer,0.0,0.0,0.0
country_id,15.0,15.0,15.0
region_code,28.0,15.0,41.0
age,24.0,20.0,23.0
salary,242661.03,-9999.0,-9999.0
total,0.0,0.0,0.0
tiempo_ult_compra,20.0,20.0,20.0
inversion,0.0,0.0,0.0
year,2018.0,2019.0,2018.0


In [616]:
df_final['id'] = range(df_final.shape[0])

In [617]:
cost_10 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.8))))
cost_60 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.1))))

print(len(cost_10))
print(len(cost_60))

1225819
153227


In [618]:
tiempo_1 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.2))))
tiempo_2 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.2))))
tiempo_3 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.2))))
tiempo_4 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.2))))
tiempo_5 = set(list(df_final['id'].sample(int(df_final.shape[0]*0.2))))

print(len(tiempo_1))

306454


In [619]:
df_final['inversion'] = np.where( (df_final['inversion'] == 0) & (df_final['id'].isin(cost_10)), 10, df_final['inversion'])
df_final['inversion'] = np.where( (df_final['inversion'] == 0) & (df_final['id'].isin(cost_60)), 60, df_final['inversion'])
df_final['inversion'] = np.where( df_final['inversion'] == 0, 40, df_final['inversion'])

In [620]:
df_final['tiempo_ult_compra'] = np.where( (df_final['tiempo_ult_compra'] == 20) & (df_final['id'].isin(tiempo_1)), 1, df_final['tiempo_ult_compra'])
df_final['tiempo_ult_compra'] = np.where( (df_final['tiempo_ult_compra'] == 20) & (df_final['id'].isin(tiempo_2)), 2, df_final['tiempo_ult_compra'])
df_final['tiempo_ult_compra'] = np.where( (df_final['tiempo_ult_compra'] == 20) & (df_final['id'].isin(tiempo_3)), 3, df_final['tiempo_ult_compra'])
df_final['tiempo_ult_compra'] = np.where( (df_final['tiempo_ult_compra'] == 20) & (df_final['id'].isin(tiempo_4)), 4, df_final['tiempo_ult_compra'])
df_final['tiempo_ult_compra'] = np.where( (df_final['tiempo_ult_compra'] == 20) & (df_final['id'].isin(tiempo_5)), 5, df_final['tiempo_ult_compra'])
df_final['tiempo_ult_compra'] = np.where( df_final['tiempo_ult_compra'] == 20, 3, df_final['tiempo_ult_compra'])

In [621]:
df_final['inversion'].value_counts()

10.0    1169601
40.0     261009
60.0     101664
Name: inversion, dtype: int64

In [622]:
df_final['tiempo_ult_compra'].value_counts()

3.0     579767
1.0     307729
2.0     251264
4.0     160326
5.0     128336
6.0      29210
7.0      23812
8.0      17992
9.0      12250
10.0      8204
11.0      4780
12.0      3648
13.0      2600
14.0      1578
15.0       778
Name: tiempo_ult_compra, dtype: int64

In [623]:
features = df_final.columns.to_list()
features.remove(target)
features.remove('pk_cid')
features.remove('id')
features.remove('country_id') 
#features.remove('tiempo_ult_compra') 

In [624]:
corr = df_final.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(3)

Unnamed: 0,pk_cid,active_customer,country_id,region_code,age,salary,total,tiempo_ult_compra,inversion,year,month,id
pk_cid,1.0,0.025,-0.003,0.041,-0.106,-0.102,0.218,0.076,-0.04,0.12,0.052,0.167
active_customer,0.025,1.0,-0.001,0.025,0.288,0.011,0.315,0.071,0.152,-0.024,-0.025,0.196
country_id,-0.003,-0.001,1.0,-0.035,0.001,-0.002,-0.001,-0.001,0.002,0.002,-0.0,-0.0
region_code,0.041,0.025,-0.035,1.0,0.015,-0.016,0.02,0.002,0.003,0.002,0.003,0.014
age,-0.106,0.288,0.001,0.015,1.0,0.019,0.135,0.042,0.078,0.03,-0.026,0.083
salary,-0.102,0.011,-0.002,-0.016,0.019,1.0,-0.034,0.004,0.013,-0.029,-0.003,-0.028
total,0.218,0.315,-0.001,0.02,0.135,-0.034,1.0,-0.027,0.289,-0.078,0.092,0.645
tiempo_ult_compra,0.076,0.071,-0.001,0.002,0.042,0.004,-0.027,1.0,0.01,0.223,-0.074,-0.021
inversion,-0.04,0.152,0.002,0.003,0.078,0.013,0.289,0.01,1.0,0.01,-0.012,0.167
year,0.12,-0.024,0.002,0.002,0.03,-0.029,-0.078,0.223,0.01,1.0,-0.59,-0.048


In [625]:
df_final['tiempo_ult_compra'].isnull().sum()

0

In [626]:
# Frequency encoding de region_code
region_vc = df_final['region_code'].value_counts()
df_final['region_code'] = df_final['region_code'].replace(region_vc)
df_final['region_code']

0          312364.0
1           64214.0
2           74010.0
3          312364.0
4           74614.0
             ...   
1532269    312364.0
1532270      7858.0
1532271    151301.0
1532272     26201.0
1532273     64214.0
Name: region_code, Length: 1532274, dtype: float64

In [627]:
X = df_final[ features ]
y = df_final[ target ]

In [628]:
features

['active_customer',
 'region_code',
 'age',
 'salary',
 'tiempo_ult_compra',
 'inversion',
 'year',
 'month']

In [629]:
df_final

Unnamed: 0,pk_cid,active_customer,country_id,region_code,age,salary,total,tiempo_ult_compra,inversion,year,month,id
0,1217904,0.0,15,312364.0,24,242661.03,0.0,3.0,10.0,2018,10,0
1,1504821,0.0,15,64214.0,20,-9999.00,0.0,5.0,40.0,2019,1,1
2,1078654,0.0,15,74010.0,23,-9999.00,0.0,2.0,10.0,2018,3,2
3,1110770,1.0,15,312364.0,41,181481.97,0.0,3.0,10.0,2018,3,3
4,1062030,0.0,15,74614.0,31,-9999.00,0.0,3.0,10.0,2019,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1532269,1553456,1.0,15,312364.0,32,-9999.00,1.0,1.0,10.0,2019,5,1532269
1532270,1553541,1.0,15,7858.0,54,-9999.00,1.0,5.0,10.0,2019,5,1532270
1532271,1553559,1.0,15,151301.0,43,-9999.00,1.0,3.0,10.0,2019,5,1532271
1532272,1553565,0.0,15,26201.0,68,-9999.00,1.0,1.0,10.0,2019,5,1532272


In [630]:
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.tree import DecisionTreeClassifier

# Validation and Train-Test
X_train_test, X_validation, y_train_test, y_validation =  train_test_split(X, y, test_size=0.1, random_state=42)

# Train and Test
X_train, X_test, y_train, y_test =  train_test_split(X_train_test, y_train_test, test_size=0.2, random_state=42)

In [657]:
dt = DecisionTreeClassifier( max_depth=10, min_samples_leaf=150, random_state=42 )
dt.fit(X_train,y_train)

y_train_pred = dt.predict(X_train)
y_train_score = dt.predict_proba(X_train)[:,1]

#Predict the response for test dataset
y_pred = dt.predict(X_test)
y_test_score = dt.predict_proba(X_test)[:,1]

In [658]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve

In [659]:
def metricas(y_test, y_pred, y_test_score, y_train, y_train_pred, y_train_score):
    print("----------- TEST -----------")
    print("Accuracy:",accuracy_score(y_test, y_pred))
    print("Precision:",precision_score(y_test, y_pred))
    print("Recall:",recall_score(y_test, y_pred))
    print("F1-score:",f1_score(y_test, y_pred))
    print("ROC AUC score:",roc_auc_score(y_test, y_test_score))
    print("----------- TRAIN -----------")
    print("Accuracy:",accuracy_score(y_train, y_train_pred))
    print("Precision:",precision_score(y_train, y_train_pred))
    print("Recall:",recall_score(y_train, y_train_pred))
    print("F1-score:",f1_score(y_train, y_train_pred))
    print("ROC AUC score:",roc_auc_score(y_train, y_train_score))

In [660]:
metricas(y_test, y_pred, y_test_score, y_train, y_train_pred, y_train_score)


----------- TEST -----------
Accuracy: 0.8878648344875095
Precision: 0.7701075499338979
Recall: 0.4680652376919235
F1-score: 0.5822459950833401
ROC AUC score: 0.902129434520581
----------- TRAIN -----------
Accuracy: 0.8892440058156188
Precision: 0.7727155544097645
Recall: 0.4735732381704759
F1-score: 0.5872433571819453
ROC AUC score: 0.9044613858632352


In [661]:
pd.Series( dt.feature_importances_, index=features ).sort_values(ascending=False).head(20)

inversion            0.458323
active_customer      0.186545
month                0.151407
age                  0.097900
salary               0.047999
tiempo_ult_compra    0.024165
region_code          0.016998
year                 0.016663
dtype: float64

In [636]:
from sklearn.tree import export_graphviz

In [637]:
#df[ df['pk_cid'] == 17457][['pk_partition','total']]

In [638]:
#16502