In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as preprocessing
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

to_show = True
to_save = False
is_sample = False
limit_rows = 1000

In [2]:
# This data cleaning part gets ideas from https://www.kaggle.com/apryor6/detailed-cleaning-visualization-python
# HAN YIKAI takes charge of feature 1-9
# official file
train_file = '../input/train_ver2.csv'
test_file = '../input/test_ver2.csv'

out_path = '../output/'


def save_df2file(data_frame: pd.DataFrame, name: str):
    data_frame.to_csv(f'{out_path}{name}.csv')

# sns.set(rc = {'figure.figsize':(20,18)})


In [3]:
# test_df = pd.read_csv(test_file,
#                       dtype={'sexo': str,
#                              'ind_nuevo': str,
#                              'ult_fec_cli_1t': str,
#                              'indext': str},
#                       # nrows=limit_rows,
#                       low_memory=False
#                       )

In [4]:
df = pd.read_csv(train_file,
                 dtype={'sexo': str,
                        'ind_nuevo': str,
                        'ult_fec_cli_1t': str,
                        'indext': str},
                 # nrows=limit_rows,
                 low_memory=False
                 )

# resample the data for saving memory and shuffle the dataset
if is_sample:
    df = df.sample(frac=0.5)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   fecha_dato             object 
 1   ncodpers               int64  
 2   ind_empleado           object 
 3   pais_residencia        object 
 4   sexo                   object 
 5   age                    object 
 6   fecha_alta             object 
 7   ind_nuevo              object 
 8   antiguedad             object 
 9   indrel                 float64
 10  ult_fec_cli_1t         object 
 11  indrel_1mes            object 
 12  tiprel_1mes            object 
 13  indresi                object 
 14  indext                 object 
 15  conyuemp               object 
 16  canal_entrada          object 
 17  indfall                object 
 18  tipodom                float64
 19  cod_prov               float64
 20  nomprov                object 
 21  ind_actividad_cliente  float64
 22  renta           

In [5]:
# 复用函数

# 统计缺失值的列
def cal_loss(data):
    fil = data.iloc[:, :24].isnull().any()
    loss_column = fil[fil == True].index
    res = data.loc[:, loss_column].isnull().sum()
    print(res)

#
# def renew_df(base_df: pd.DataFrame):
#     new_df = base_df
#     base_df = df.copy(deep=True)
#     return new_df

In [6]:
# 调整label的格式节省空间, 能节约2.1G 空间, 将近一半
df.iloc[:, 24:] = df.iloc[:, 24:].astype(bool)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   fecha_dato             object 
 1   ncodpers               int64  
 2   ind_empleado           object 
 3   pais_residencia        object 
 4   sexo                   object 
 5   age                    object 
 6   fecha_alta             object 
 7   ind_nuevo              object 
 8   antiguedad             object 
 9   indrel                 float64
 10  ult_fec_cli_1t         object 
 11  indrel_1mes            object 
 12  tiprel_1mes            object 
 13  indresi                object 
 14  indext                 object 
 15  conyuemp               object 
 16  canal_entrada          object 
 17  indfall                object 
 18  tipodom                float64
 19  cod_prov               float64
 20  nomprov                object 
 21  ind_actividad_cliente  float64
 22  renta           

In [8]:
# fecha_dato 日期
# ncodpers 客户代码
# ind_empleado 员工
# pais_residencia 国藉
# sexo 性别,
# age 年龄
# fecha_alta 注册日期,
# ind_nuevo 新的,
# antiguedad 古代,
# indrel 最后访问日期, 一个月indrel
# ult_fec_cli_1t  作为主要客户的日期
# indrel_1mes  1 (First/Primary customer), 2 (co-owner ),P (Potential),3 (former primary), 4(former co-owner)
# tiprel_1mes  月初客户关系类型 A (active), I (inactive), P (former customer 前客户),R (Potential 潜在客户)
# indresi 居住指数S (Yes) 银行和居住国相同 N (No) 不同国家
# indext  外国人指数, S 本国出生, N 非本国
# conyuemp 配偶指数, S 客户是员工配偶
# canal_entrada
# indfall
# tipodom
# cod_prov 省份编码
# nomprov 省份名称
# ind_actividad_cliente
# renta 家庭总收入
# segmento segmentation: 01 - VIP, 02 - Individuals 03 - college graduated

In [9]:
# cal_loss(df)

In [10]:
# 删除27734条空行, 所有全空
df.drop(df[df['ind_empleado'].isnull()].index, inplace=True)
# 删除全other的空行, 所有全空
df.drop(df[df['pais_residencia'] == 'others'].index, inplace=True)

In [11]:
# 缺失值很少的删除行
df.drop(df[df['sexo'].isnull()].index, inplace=True)
df.drop(df[df['tipodom'].isnull()].index, inplace=True)

In [12]:
# 缺失值很多的删除列
df.drop('ult_fec_cli_1t', axis=1, inplace=True)
df.drop('conyuemp', axis=1, inplace=True)

In [13]:
# 省份名称直接删除, 省份编码缺失值用-1填充
df.drop('nomprov', axis=1, inplace=True)
df.cod_prov.fillna(-1, inplace=True)

In [14]:
# df[df['indrel_1mes'].isna()].iloc[:, :df.shape[1] - 24]

In [15]:
# canal_entrada 用户加入渠道非常多162种 而且比较平衡, 还是用unknown填充, 考虑KNN填充
df.canal_entrada.value_counts()
df.canal_entrada.fillna('unknown', inplace=True)

KHE    4055270
KAT    3268209
KFC    3098360
KHQ     591039
KFA     409669
        ...   
KGN         17
KDL         11
025         11
KHS          5
KHR          1
Name: canal_entrada, Length: 162, dtype: int64

In [16]:
df.renta

0            87218.10
1            35548.74
2           122179.11
3           119775.54
4                 NaN
              ...    
13647304     43912.17
13647305     23334.99
13647306          NaN
13647307    199592.82
13647308          NaN
Name: renta, Length: 13619504, dtype: float64

In [17]:
# renta 数值型填充
# sns.scatterplot(df.renta, df.renta.value_counts())
# df.renta.plot.scatter(x='renta', y=df.renta.value_counts())

In [18]:
# TODO
df.renta.fillna(df.renta.mean(), inplace=True)

In [19]:
df.indrel_1mes.fillna('unknown', inplace=True)
df.tiprel_1mes.fillna('unknown', inplace=True)

In [20]:
# 23. segmento : 01 - VIP, 02 - 个人 03 - 大学毕业
df.segmento.fillna('00', inplace=True)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13619504 entries, 0 to 13647308
Data columns (total 45 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   fecha_dato             object 
 1   ncodpers               int64  
 2   ind_empleado           object 
 3   pais_residencia        object 
 4   sexo                   object 
 5   age                    object 
 6   fecha_alta             object 
 7   ind_nuevo              object 
 8   antiguedad             object 
 9   indrel                 float64
 10  indrel_1mes            object 
 11  tiprel_1mes            object 
 12  indresi                object 
 13  indext                 object 
 14  canal_entrada          object 
 15  indfall                object 
 16  tipodom                float64
 17  cod_prov               float64
 18  ind_actividad_cliente  float64
 19  renta                  float64
 20  segmento               object 
 21  ind_ahor_fin_ult1      bool   
 22  ind_aval_fin_ult

In [22]:
cal_loss(df)

Series([], dtype: float64)


# 编码

In [23]:
# 0. fecha_dato 日期
# df.fecha_dato.value_counts(dropna=False).plot(kind='bar')
# 1. ncodpers 客户代码, 弃用
# df.ncodpers.value_counts(dropna=False)

In [24]:
# 日期和客户代码删除
df.drop(['fecha_dato', 'ncodpers'], axis=1, inplace=True)

In [25]:
def label_enc(data):
    label_enc = preprocessing.LabelEncoder()  #获取一个LabelEncoder
    label_data = label_enc.fit_transform(data)
    return np.array(label_data).reshape(-1, 1)


# def concat_encoder(df):
#     data = pd.concat([data, pd.DataFrame(arrays, columns=names)],axis=1)
#     data = data.drop(['hour'],axis=1)
#     return df

# 对单列编码
def oh_enc(data):
    label_data = label_enc(data)
    # 2. 再用onehot
    oh_enc = preprocessing.OneHotEncoder()
    name_list = []
    for i in range(label_data.classes_.shape[0]):
        name = f'{data.name}_{i}'
        name_list.append(name)
    # 给fit_transform传递的X一定要是 2D的，即（samples，features），否则会报错`
    onehot_data = oh_enc.fit_transform(label_data)
    # 转成numpy array格式
    onehot_data = onehot_data.toarray()
    enc_df = pd.DataFrame(onehot_data, columns=name_list)
    return enc_df

In [26]:

# 2. 5 ind_empleado 员工指标,
# 树模型直接不处理
# A active 活跃, B ex employed 前雇员, F filial 子女, N not employee 非雇员, P pasive 被动, 目前没有寻找新工作, 但是对新工作持开放态度

# df.ind_empleado.value_counts(dropna=False)
# enc_array = label_enc(df.ind_empleado)
def get_ordinal_name(data) -> list:
    return [f'{data.name}_enc']


def get_onehot_name(data) -> list:
    length = data.unique().shape[0]
    l = []
    for i in range(length):
        l.append(f'{data.name}_oh_{i}')
    return l


def new_ordinal_df(data: pd.Series, array):
    name: list = get_ordinal_name(data)
    return pd.DataFrame(array, index=data.index, columns=name)


# TODO
def new_onehot_df(data: pd.Series, array):
    name: list = get_ordinal_name(data)
    return pd.DataFrame(array, index=data.index, columns=name)


# TODO
def show_value_counts(df, od_set: set, oh_set: set):
    df[get_ordinal_name(df.ind_empleado)].value_counts(dropna=False)


od_set = set()
oh_set = set()

In [27]:
l = get_onehot_name(df.ind_empleado)
e = get_ordinal_name(df.ind_empleado)
l
e

['ind_empleado_oh_0',
 'ind_empleado_oh_1',
 'ind_empleado_oh_2',
 'ind_empleado_oh_3',
 'ind_empleado_oh_4']

['ind_empleado_enc']

In [28]:
# df.drop(get_ordinal_name(df.ind_empleado), axis=1, inplace=True)

In [29]:
# df[get_ordinal_name(df.ind_empleado)].value_counts()

df.ind_empleado.value_counts(dropna=False)

N    13610906
B        3566
F        2523
A        2492
S          17
Name: ind_empleado, dtype: int64

In [30]:
# 单列编码
# df = pd.concat([df, new_ordinal_df(df.ind_empleado, label_enc(df.ind_empleado))], axis=1)
# df[get_ordinal_name(df.ind_empleado)].value_counts(dropna=False)

# 用list收集, 统一进行编码处理
od_set.add(df.ind_empleado.name)

In [31]:
# 3. 118 pais_residencia 客户国籍 ES占比巨大
# 发现others国家的人所有数据都是others
# 继续处理
# Nan unknown
# TODO 计算占比
# df.pais_residencia.value_counts(dropna=False)
# df.pais_residencia.unique()

# 用others替换其他
df.loc[(df['pais_residencia'] != 'ES'), 'pais_residencia'] = 'others'

# df[df['pais_residencia']=='ES'].sum()
df.pais_residencia.value_counts(dropna=False)

od_set.add(df.pais_residencia.name)

ES        13553656
others       65848
Name: pais_residencia, dtype: int64

In [32]:
# 4. sexo
# Nan = unknown
# 无序类别
df.sexo.value_counts(dropna=False)
od_set.add(df.sexo.name)

V    7424251
H    6195253
Name: sexo, dtype: int64

In [33]:
df.age = df.age.astype('int8')

In [34]:
# 5. age 年龄
# 清洗>100的异常值
# df.age.value_counts(dropna=False)
low_age = df.loc[df.age < 16, 'age']
low_age.mean()

# df['age'].unique()

11.080660089610635

In [35]:
df.fecha_alta.value_counts(dropna=False)

2014-07-28    57389
2014-10-03    54287
2014-08-04    45746
2013-10-14    40804
2013-08-03    33414
              ...  
2013-05-11       11
2015-05-31        9
2014-05-01        7
2010-07-04        4
2009-12-25        4
Name: fecha_alta, Length: 6756, dtype: int64

In [36]:
pd.DatetimeIndex(df["fecha_alta"]).month.value_counts()

10    2222408
9     1599044
11    1539133
7     1468729
8     1365917
12    1018032
1      847092
2      761082
3      758132
4      714992
5      666610
6      658333
Name: fecha_alta, dtype: int64

In [37]:
temp = df.fecha_alta.astype('datetime64[M]')
temp.value_counts()

2013-10-01    382473
2014-10-01    340613
2012-10-01    310295
2014-09-01    294752
2011-10-01    268133
               ...  
1995-08-01      3764
1995-04-01      2171
1995-03-01      1705
1995-01-01      1299
1995-02-01      1106
Name: fecha_alta, Length: 257, dtype: int64

In [38]:
df.antiguedad.value_counts(dropna=False)
od_set.add(df.antiguedad.name)

     12    243159
     21    214795
     10    206164
      9    177955
     23    177839
            ...  
    253       416
    254       261
    255       179
    256       102
-999999        38
Name: antiguedad, Length: 258, dtype: int64

In [39]:
df.fecha_alta.info()

<class 'pandas.core.series.Series'>
Int64Index: 13619504 entries, 0 to 13647308
Series name: fecha_alta
Non-Null Count     Dtype 
--------------     ----- 
13619504 non-null  object
dtypes: object(1)
memory usage: 207.8+ MB


In [40]:
x = pd.to_datetime(df.fecha_alta)
x.info()

<class 'pandas.core.series.Series'>
Int64Index: 13619504 entries, 0 to 13647308
Series name: fecha_alta
Non-Null Count     Dtype         
--------------     -----         
13619504 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 207.8 MB


In [41]:
# 6.fecha_alta 客户首次签订合同的日期, 可以用距今差值表示, 总结成月份, 然后判断和 antiguedad客户资历的关系
df.antiguedad.value_counts(dropna=False)
df.antiguedad.info()

# TODO 暂时先删除
df.drop('fecha_alta', inplace=True, axis=1)
# df.fecha_alta.isnull().sum()
# months_active = df.loc[df["ind_nuevo"].isnull(), :].groupby("ncodpers", sort=False).size()
# months_active.max()

     12    243159
     21    214795
     10    206164
      9    177955
     23    177839
            ...  
    253       416
    254       261
    255       179
    256       102
-999999        38
Name: antiguedad, Length: 258, dtype: int64

<class 'pandas.core.series.Series'>
Int64Index: 13619504 entries, 0 to 13647308
Series name: antiguedad
Non-Null Count     Dtype 
--------------     ----- 
13619504 non-null  object
dtypes: object(1)
memory usage: 207.8+ MB


In [42]:

# 7.ind_nuevo 新客户指数 2. 如果是6个月内注册的客户, 则为1
df.ind_nuevo = df.ind_nuevo.astype('int8')
df.ind_nuevo.value_counts(dropna=False)

od_set.add(df.ind_nuevo.name)

0    12808310
1      811194
Name: ind_nuevo, dtype: int64

In [43]:
# 8. antiguedad 客户资历, 单位是月
df.antiguedad.value_counts(dropna=False)
od_set.add(df.antiguedad.name)

     12    243159
     21    214795
     10    206164
      9    177955
     23    177839
            ...  
    253       416
    254       261
    255       179
    256       102
-999999        38
Name: antiguedad, Length: 258, dtype: int64

In [44]:
# 6,7,8 缺失值数量相等已经删除

In [45]:
# 9.indrel  2:  1表示primary, 99表示当月primary,但不是月末?
# Nan =unknown
# one-hot label编码?
df.indrel = df.indrel.astype('int8')
df.indrel.value_counts(dropna=False)
od_set.add(df.indrel.name)

1     13594711
99       24793
Name: indrel, dtype: int64

In [46]:
# 10. ult_fec_cli_1t 作为主要客户的日期
# 缺失值太多, 丢弃
# df.ult_fec_cli_1t.value_counts(dropna=False)
# df.ult_fec_cli_1t.isnull().sum()

In [47]:
# df.replace('unknown', -2, inplace=True)
# df.replace('P', -3, inplace=True)
# 必须先修改类型, 不然replace匹配不上
# df.indrel_1mes = df.indrel_1mes.astype('float')
df.indrel_1mes.replace({'1.0': '1', '2.0': '2', '3.0': '3', '4.0': '4'}, inplace=True)  # 不同key用不同value替换
df.indrel_1mes.value_counts(dropna=False)
od_set.add(df.indrel_1mes.name)

1          13490612
unknown      122047
3              4349
2              1317
P               873
4               306
Name: indrel_1mes, dtype: int64

In [48]:
# 12.tiprel_1mes 月初客户关系类型 A (active), I (inactive), P (former customer 前客户),R (Potential 潜在客户)
# Nan = unknown
# label 编码
df.tiprel_1mes.value_counts(dropna=False)
od_set.add(df.tiprel_1mes.name)

I          7304864
A          6187065
unknown     122047
P             4655
R              869
N                4
Name: tiprel_1mes, dtype: int64

In [49]:
# 13. indresi 居住指数S (Yes) 银行和居住国相同 N (No) 不同国家
# label编码, S=1, N=0
df.indresi.value_counts(dropna=False)
od_set.add(df.indresi.name)


S    13553657
N       65847
Name: indresi, dtype: int64

In [50]:
# 14. indext 外国人指数, S 本国出生, N 非本国
# label 编码
df.indext.value_counts(dropna=False)
od_set.add(df.indext.name)


N    12974768
S      644736
Name: indext, dtype: int64

In [51]:
# 15. conyuemp 配偶指数, S 客户是员工配偶
# 缺失太多, 删除列
# df.conyuemp.value_counts(dropna=False)

In [52]:
# 16. canal_entrada	客户渠道 (拉新渠道)
# Nan = unknown, 占比小于3%都是other
# 用占比, 占比低的作为others
df.canal_entrada.value_counts(dropna=False)
# df.canal_entrada.isnull().sum()
# print(100 * round(df.canal_entrada.value_counts(dropna=False) / df.shape[0], 8))
od_set.add(df.canal_entrada.name)

KHE    4055270
KAT    3268209
KFC    3098360
KHQ     591039
KFA     409669
        ...   
KDI         17
KDL         11
025         11
KHS          5
KHR          1
Name: canal_entrada, Length: 163, dtype: int64

In [53]:
# 17. indfall 已故索引 N/S
df.indfall.value_counts(dropna=False)
od_set.add(df.indfall.name)


N    13584742
S       34762
Name: indfall, dtype: int64

In [54]:
# 18. tipodom 地址类型. 1, primary address 弃用
# Nan dropna
# 没有区分度, 删除列
# label编码
df.tipodom.value_counts(dropna=False)
df.drop('tipodom', axis=1, inplace=True)

1.0    13619504
Name: tipodom, dtype: int64

In [55]:
# 19.cod_prov 省份编码
# 转成int
# Nan = 0?
# number
df.cod_prov = df.cod_prov.astype('int8')
df.cod_prov.value_counts(dropna=False)
od_set.add(df.cod_prov.name)


 28    4409547
 8     1275219
 46     682304
 41     605164
 15     429322
 30     396759
 29     367023
 50     342543
 3      313397
 11     294684
 36     280026
 33     265749
 47     238259
 35     235683
 6      192996
 48     185888
 45     183067
 18     178562
 37     164238
 39     155706
 14     144679
 10     129912
 7      124933
 21     122283
 13     119390
 2      114128
 12     104295
 43     100115
 9       97188
 17      90538
 31      88618
 26      85202
 27      84962
 32      84009
 24      83003
 25      79059
 20      71567
 38      70968
-1       65856
 19      64618
 23      64363
 4       60291
 16      57310
 49      50927
 34      49282
 40      42341
 22      40181
 5       38783
 1       37704
 44      22525
 42      17660
 52       9460
 51       7218
Name: cod_prov, dtype: int64

In [56]:
# 20. nomprov 省份名称 删除
# df.nomprov.value_counts(dropna=False)

In [57]:
# 21. ind_actividad_cliente	活跃指数 1, active customer; 0, inactive customer)
df.ind_actividad_cliente = df.ind_actividad_cliente.astype('int8')
df.ind_actividad_cliente.value_counts(dropna=False)
od_set.add(df.ind_actividad_cliente.name)


0    7384379
1    6235125
Name: ind_actividad_cliente, dtype: int64

In [58]:
df.renta.max()

28894395.51

In [59]:
df.renta.min()

1202.73

In [60]:
df.renta = round(df.renta)
df.renta = df.renta.astype('int32')

In [61]:
# 23. segmento : 01 - VIP, 02 - 个人 03 - 大学毕业
# Nan = unknown
# ordinal 编码
df.segmento.value_counts()
od_set.add(df.segmento.name)


02 - PARTICULARES     7960150
03 - UNIVERSITARIO    4935579
01 - TOP               562142
00                     161633
Name: segmento, dtype: int64

In [334]:
data = df.iloc[:, :df.shape[1] - 24]
target = df.iloc[:, df.shape[1] - 24:]

In [335]:
data.shape
target.shape

(13619504, 17)

(13619504, 24)

In [336]:
set(data.columns)

{'age',
 'antiguedad',
 'canal_entrada',
 'cod_prov',
 'ind_actividad_cliente',
 'ind_empleado',
 'ind_nuevo',
 'indext',
 'indfall',
 'indrel',
 'indrel_1mes',
 'indresi',
 'pais_residencia',
 'renta',
 'segmento',
 'sexo',
 'tiprel_1mes'}

In [337]:
set(data.columns) - od_set

{'age', 'renta'}

In [338]:
# 编码流程
len(od_set)


15

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13619504 entries, 0 to 13647308
Data columns (total 41 columns):
 #   Column                 Dtype 
---  ------                 ----- 
 0   ind_empleado           object
 1   pais_residencia        object
 2   sexo                   object
 3   age                    int8  
 4   ind_nuevo              int8  
 5   antiguedad             object
 6   indrel                 int8  
 7   indrel_1mes            object
 8   tiprel_1mes            object
 9   indresi                object
 10  indext                 object
 11  canal_entrada          object
 12  indfall                object
 13  cod_prov               int8  
 14  ind_actividad_cliente  int8  
 15  renta                  int32 
 16  segmento               object
 17  ind_ahor_fin_ult1      bool  
 18  ind_aval_fin_ult1      bool  
 19  ind_cco_fin_ult1       bool  
 20  ind_cder_fin_ult1      bool  
 21  ind_cno_fin_ult1       bool  
 22  ind_ctju_fin_ult1      bool  
 23  ind_c

In [71]:
target = df.iloc[:, df.shape[1] - 24:]
target_names = list(target.columns)
target_names

['ind_ahor_fin_ult1',
 'ind_aval_fin_ult1',
 'ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1',
 'ind_nomina_ult1',
 'ind_nom_pens_ult1',
 'ind_recibo_ult1']

In [69]:
% % time
finished = []
for name in od_set:
    print(name)
    df = pd.concat([df, new_ordinal_df(df[name], label_enc(df[name]))], axis=1)
    finished.append(name)
df.drop(finished, inplace=True, axis=1)


indfall
antiguedad
cod_prov
tiprel_1mes
ind_actividad_cliente
indrel
segmento
canal_entrada
ind_empleado
ind_nuevo
indrel_1mes
sexo
indresi
indext
pais_residencia
CPU times: user 45.6 s, sys: 30.4 s, total: 1min 15s
Wall time: 1min 15s


In [70]:
df.shape
df.head()

(13619504, 41)

Unnamed: 0,age,renta,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,...,indrel_enc,segmento_enc,canal_entrada_enc,ind_empleado_enc,ind_nuevo_enc,indrel_1mes_enc,sexo_enc,indresi_enc,indext_enc,pais_residencia_enc
0,35,87218,False,False,True,False,False,False,False,False,...,0,2,153,3,0,0,0,1,0,0
1,23,35549,False,False,True,False,False,False,False,False,...,0,3,150,3,0,0,1,1,1,0
2,23,122179,False,False,True,False,False,False,False,False,...,0,3,150,3,0,0,1,1,0,0
3,22,119776,False,False,False,False,False,False,False,False,...,0,3,149,3,0,0,0,1,0,0
4,23,134254,False,False,True,False,False,False,False,False,...,0,3,150,3,0,0,1,1,0,0


In [72]:
X = df.drop(target_names, axis=1)

In [77]:
X

Unnamed: 0,age,renta,indfall_enc,antiguedad_enc,cod_prov_enc,tiprel_1mes_enc,ind_actividad_cliente_enc,indrel_enc,segmento_enc,canal_entrada_enc,ind_empleado_enc,ind_nuevo_enc,indrel_1mes_enc,sexo_enc,indresi_enc,indext_enc,pais_residencia_enc
0,35,87218,0,6,29,0,1,0,2,153,3,0,0,0,1,0,0
1,23,35549,0,35,13,1,0,0,3,150,3,0,0,1,1,1,0
2,23,122179,0,35,13,1,0,0,3,150,3,0,0,1,1,0,0
3,22,119776,0,35,50,1,0,0,3,149,3,0,0,0,1,0,0
4,23,134254,0,35,50,0,1,0,3,150,3,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647304,22,43912,0,33,50,1,0,0,3,150,3,0,0,1,1,0,0
13647305,23,23335,0,33,26,1,0,0,3,150,3,0,0,1,1,0,0
13647306,47,134254,0,33,50,0,1,0,2,150,3,0,0,0,1,0,0
13647307,22,199593,0,33,50,1,0,0,3,150,3,0,0,0,1,0,0


In [78]:
target

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647304,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13647305,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13647306,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13647307,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# 模型

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

Xtrain, Xvalidation, Ytrain, Yvalidation = train_test_split(X, target, test_size=0.3)
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(Xtrain, Ytrain)
score_r = rfc.score(Xvalidation, Yvalidation)