In [79]:
import pandas as pd
import numpy as np
from datetime import datetime, date

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from category_encoders import OneHotEncoder, TargetEncoder, CatBoostEncoder 

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import  confusion_matrix, precision_score,recall_score, f1_score, accuracy_score



In [39]:
data_train_demographics = pd.read_csv('traindemographics.csv')
data_train_perf = pd.read_csv('trainperf.csv')
data_train_prevloans = pd.read_csv('trainprevloans.csv')



### Dados demográficos

In [40]:
data_train_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3698 non-null   object 
 8   level_of_education_clients  587 non-null    object 
dtypes: float64(2), object(7)
memory usage: 305.7+ KB


Pelas informações dos dados podemos identificar que temos muitos dados nulos na feature bank_branch_clients

In [41]:
data_train_demographics.shape

(4346, 9)

In [42]:
data_train_demographics.nunique()


customerid                    4334
birthdate                     3297
bank_account_type                3
longitude_gps                 4103
latitude_gps                  4313
bank_name_clients               18
bank_branch_clients             45
employment_status_clients        6
level_of_education_clients       4
dtype: int64

In [43]:
#tratando dados duplicados
data_train_demographics = data_train_demographics.drop_duplicates(subset='customerid')
data_train_demographics.shape

(4334, 9)

In [44]:
data_train_demographics.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10 00:00:00.000000,Savings,3.319219,6.528604,GT Bank,,,
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21 00:00:00.000000,Savings,3.325598,7.119403,Sterling Bank,,Permanent,
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01 00:00:00.000000,Savings,5.7461,5.563174,Fidelity Bank,,,
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19 00:00:00.000000,Savings,3.36285,6.642485,GT Bank,,Permanent,
4,8a858e785acd3412015acd48f4920d04,1982-11-22 00:00:00.000000,Savings,8.455332,11.97141,GT Bank,,Permanent,


A coluna birthdate é um objeto no conjunto de dados, e essa feature por si só não informa muita coisa. Podemos transformar para o formato datetime e calcular a idade do cliente.

In [45]:
data_train_demographics['birthdate'] = pd.to_datetime(data_train_demographics['birthdate']).dt.normalize()
data_train_demographics.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,Savings,3.319219,6.528604,GT Bank,,,
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,Savings,3.325598,7.119403,Sterling Bank,,Permanent,
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,Savings,5.7461,5.563174,Fidelity Bank,,,
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,Savings,3.36285,6.642485,GT Bank,,Permanent,
4,8a858e785acd3412015acd48f4920d04,1982-11-22,Savings,8.455332,11.97141,GT Bank,,Permanent,


In [46]:
data_train_demographics.dtypes

customerid                            object
birthdate                     datetime64[ns]
bank_account_type                     object
longitude_gps                        float64
latitude_gps                         float64
bank_name_clients                     object
bank_branch_clients                   object
employment_status_clients             object
level_of_education_clients            object
dtype: object

In [47]:
data_train_demographics['Age'] = round((pd.to_datetime(datetime.today()) - data_train_demographics['birthdate']).dt.days /365)
data_train_demographics

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,Age
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,Savings,3.319219,6.528604,GT Bank,,,,49.0
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,Savings,3.325598,7.119403,Sterling Bank,,Permanent,,36.0
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,Savings,5.746100,5.563174,Fidelity Bank,,,,35.0
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,Savings,3.362850,6.642485,GT Bank,,Permanent,,31.0
4,8a858e785acd3412015acd48f4920d04,1982-11-22,Savings,8.455332,11.971410,GT Bank,,Permanent,,40.0
...,...,...,...,...,...,...,...,...,...,...
4341,8a858f155554552501555588ca2b3b40,1985-12-13,Other,3.236753,7.030168,Stanbic IBTC,,Permanent,Graduate,37.0
4342,8a858fc65cf978f4015cf97cee3a02ce,1982-07-01,Savings,7.013750,4.875662,GT Bank,,,,40.0
4343,8a858f4f5b66de3a015b66fc83c61902,1989-09-26,Savings,6.295530,7.092508,GT Bank,,Permanent,,33.0
4344,8aaae7a74400b28201441c8b62514150,1985-09-06,Savings,3.354206,6.539070,GT Bank,HEAD OFFICE,Permanent,Primary,37.0


In [48]:
data_train_demographics.columns.values

array(['customerid', 'birthdate', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'bank_branch_clients',
       'employment_status_clients', 'level_of_education_clients', 'Age'],
      dtype=object)

In [49]:
data_train_demographics.isna().sum()

customerid                       0
birthdate                        0
bank_account_type                0
longitude_gps                    0
latitude_gps                     0
bank_name_clients                0
bank_branch_clients           4283
employment_status_clients      648
level_of_education_clients    3748
Age                              0
dtype: int64

In [50]:
data_train_demographics = data_train_demographics.drop(['birthdate','bank_branch_clients','level_of_education_clients' ], axis=1)

In [51]:
data_train_demographics.head()

Unnamed: 0,customerid,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,Age
0,8a858e135cb22031015cbafc76964ebd,Savings,3.319219,6.528604,GT Bank,,49.0
1,8a858e275c7ea5ec015c82482d7c3996,Savings,3.325598,7.119403,Sterling Bank,Permanent,36.0
2,8a858e5b5bd99460015bdc95cd485634,Savings,5.7461,5.563174,Fidelity Bank,,35.0
3,8a858efd5ca70688015cabd1f1e94b55,Savings,3.36285,6.642485,GT Bank,Permanent,31.0
4,8a858e785acd3412015acd48f4920d04,Savings,8.455332,11.97141,GT Bank,Permanent,40.0


In [52]:
data_train_demographics.describe()

Unnamed: 0,longitude_gps,latitude_gps,Age
count,4334.0,4334.0,4334.0
mean,4.624,7.249975,37.97185
std,7.194031,3.056594,6.141808
min,-118.247009,-33.868818,26.0
25%,3.354857,6.47061,34.0
50%,3.591668,6.62173,37.0
75%,6.543691,7.425021,42.0
max,151.20929,71.228069,61.0


In [53]:
data_train_demographics.describe(include='O')

Unnamed: 0,customerid,bank_account_type,bank_name_clients,employment_status_clients
count,4334,4334,4334,3686
unique,4334,3,18,6
top,8a858e135cb22031015cbafc76964ebd,Savings,GT Bank,Permanent
freq,1,3414,1594,3134


### Empréstimos anteriores

In [54]:
data_train_prevloans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18183 entries, 0 to 18182
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customerid       18183 non-null  object 
 1   systemloanid     18183 non-null  int64  
 2   loannumber       18183 non-null  int64  
 3   approveddate     18183 non-null  object 
 4   creationdate     18183 non-null  object 
 5   loanamount       18183 non-null  float64
 6   totaldue         18183 non-null  float64
 7   termdays         18183 non-null  int64  
 8   closeddate       18183 non-null  object 
 9   referredby       1026 non-null   object 
 10  firstduedate     18183 non-null  object 
 11  firstrepaiddate  18183 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 1.7+ MB


In [55]:
data_train_prevloans.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15 18:22:40.000000,2016-08-15 17:22:32.000000,10000.0,13000.0,30,2016-09-01 16:06:48.000000,,2016-09-14 00:00:00.000000,2016-09-01 15:51:43.000000
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28 18:39:07.000000,2017-04-28 17:38:53.000000,10000.0,13000.0,30,2017-05-28 14:44:49.000000,,2017-05-30 00:00:00.000000,2017-05-26 00:00:00.000000
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,2017-03-05 10:56:25.000000,2017-03-05 09:56:19.000000,20000.0,23800.0,30,2017-04-26 22:18:56.000000,,2017-04-04 00:00:00.000000,2017-04-26 22:03:47.000000
3,8a8588f35438fe12015444567666018e,301861541,5,2017-04-09 18:25:55.000000,2017-04-09 17:25:42.000000,10000.0,11500.0,15,2017-04-24 01:35:52.000000,,2017-04-24 00:00:00.000000,2017-04-24 00:48:43.000000
4,8a85890754145ace015429211b513e16,301941754,2,2017-06-17 09:29:57.000000,2017-06-17 08:29:50.000000,10000.0,11500.0,15,2017-07-14 21:18:43.000000,,2017-07-03 00:00:00.000000,2017-07-14 21:08:35.000000


https://datascientyst.com/reverse-geocoding-latitude-longitude-city-country-python-pandas/

In [56]:
date_columns = ['approveddate',
       'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']

In [57]:
data_train_prevloans[date_columns] = data_train_prevloans[date_columns].applymap(lambda x: pd.to_datetime(x).normalize())
data_train_prevloans.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15,2016-08-15,10000.0,13000.0,30,2016-09-01,,2016-09-14,2016-09-01
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28,2017-04-28,10000.0,13000.0,30,2017-05-28,,2017-05-30,2017-05-26
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,2017-03-05,2017-03-05,20000.0,23800.0,30,2017-04-26,,2017-04-04,2017-04-26
3,8a8588f35438fe12015444567666018e,301861541,5,2017-04-09,2017-04-09,10000.0,11500.0,15,2017-04-24,,2017-04-24,2017-04-24
4,8a85890754145ace015429211b513e16,301941754,2,2017-06-17,2017-06-17,10000.0,11500.0,15,2017-07-14,,2017-07-03,2017-07-14


In [58]:
#DataFrame column is a Series, and for Series you need dt.accessor to calculate days
#criando uma coluna com a quantidade de dias que o cliente levou para quitar o empréstimo desde sua aprovação.
data_train_prevloans['days_to_close_loan'] =  (data_train_prevloans['closeddate'] - data_train_prevloans['approveddate']).dt.days


In [59]:
data_train_prevloans.head(10)

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate,days_to_close_loan
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15,2016-08-15,10000.0,13000.0,30,2016-09-01,,2016-09-14,2016-09-01,17
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28,2017-04-28,10000.0,13000.0,30,2017-05-28,,2017-05-30,2017-05-26,30
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,2017-03-05,2017-03-05,20000.0,23800.0,30,2017-04-26,,2017-04-04,2017-04-26,52
3,8a8588f35438fe12015444567666018e,301861541,5,2017-04-09,2017-04-09,10000.0,11500.0,15,2017-04-24,,2017-04-24,2017-04-24,15
4,8a85890754145ace015429211b513e16,301941754,2,2017-06-17,2017-06-17,10000.0,11500.0,15,2017-07-14,,2017-07-03,2017-07-14,27
5,8a858970548359cc0154883481981866,301832407,5,2017-03-06,2017-03-06,20000.0,23800.0,30,2017-04-04,,2017-04-05,2017-04-04,29
6,8a858970548359cc0154883481981866,301926513,8,2017-06-04,2017-06-04,30000.0,39000.0,60,2017-07-03,,2017-07-04,2017-07-03,29
7,8a8589f35451855401546b0738c42524,301912443,6,2017-05-18,2017-05-18,20000.0,24500.0,30,2017-06-19,,2017-06-19,2017-06-19,32
8,8a858e095c59b91b015c5e5cea3719bc,301936857,2,2017-06-13,2017-06-13,10000.0,13000.0,30,2017-07-10,,2017-07-13,2017-07-10,27
9,8a858e1158dc4d830158f7bde4f47ea7,301811291,3,2017-02-06,2017-02-06,10000.0,11500.0,15,2017-02-21,,2017-02-21,2017-02-21,15


In [60]:
data_train_prevloans['Late payment'] = np.where(data_train_prevloans['days_to_close_loan']>data_train_prevloans['termdays'], 1,0 )

In [61]:
data_train_prevloans.columns.values

array(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate',
       'referredby', 'firstduedate', 'firstrepaiddate',
       'days_to_close_loan', 'Late payment'], dtype=object)

In [62]:
data_train_prevloans = data_train_prevloans.drop(['approveddate',
       'creationdate', 'closeddate',
       'referredby', 'firstduedate', 'firstrepaiddate'], axis=1)

In [63]:
data_train_prevloans.head()

Unnamed: 0,customerid,systemloanid,loannumber,loanamount,totaldue,termdays,days_to_close_loan,Late payment
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,10000.0,13000.0,30,17,0
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,10000.0,13000.0,30,30,0
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,20000.0,23800.0,30,52,1
3,8a8588f35438fe12015444567666018e,301861541,5,10000.0,11500.0,15,15,0
4,8a85890754145ace015429211b513e16,301941754,2,10000.0,11500.0,15,27,1


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.normalize.html

In [64]:
merge_demo_perf = pd.merge(data_train_perf, data_train_demographics, how='left', on='customerid')
merge_demo_perf.head()


Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,Age
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,Other,3.43201,6.433055,Diamond Bank,Permanent,51.0
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,15000.0,17250.0,30,,Good,Savings,3.885298,7.3207,GT Bank,Permanent,37.0
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,20000.0,22250.0,15,,Good,Other,11.13935,10.292041,EcoBank,Permanent,38.0
3,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,10000.0,11500.0,15,,Good,Savings,3.98577,7.491708,First Bank,Permanent,45.0
4,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,40000.0,44000.0,30,,Good,Other,7.457913,9.076574,GT Bank,Permanent,36.0


In [65]:
dataset = pd.merge(merge_demo_perf, data_train_prevloans, how='left', on='customerid')
dataset.head()

Unnamed: 0,customerid,systemloanid_x,loannumber_x,approveddate,creationdate,loanamount_x,totaldue_x,termdays_x,referredby,good_bad_flag,...,bank_name_clients,employment_status_clients,Age,systemloanid_y,loannumber_y,loanamount_y,totaldue_y,termdays_y,days_to_close_loan,Late payment
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,...,Diamond Bank,Permanent,51.0,301682320.0,2.0,10000.0,13000.0,30.0,17.0,0.0
1,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,...,Diamond Bank,Permanent,51.0,301883808.0,9.0,10000.0,13000.0,30.0,30.0,0.0
2,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,...,Diamond Bank,Permanent,51.0,301831714.0,8.0,20000.0,23800.0,30.0,52.0,1.0
3,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,...,Diamond Bank,Permanent,51.0,301923941.0,10.0,20000.0,24500.0,30.0,24.0,0.0
4,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,...,Diamond Bank,Permanent,51.0,301954468.0,11.0,20000.0,24500.0,30.0,27.0,0.0


In [66]:
dataset.columns

Index(['customerid', 'systemloanid_x', 'loannumber_x', 'approveddate',
       'creationdate', 'loanamount_x', 'totaldue_x', 'termdays_x',
       'referredby', 'good_bad_flag', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'employment_status_clients', 'Age',
       'systemloanid_y', 'loannumber_y', 'loanamount_y', 'totaldue_y',
       'termdays_y', 'days_to_close_loan', 'Late payment'],
      dtype='object')

In [67]:
dataset = dataset.drop(['customerid', 'loannumber_x','loanamount_x','totaldue_x', 'termdays_x','systemloanid_x','approveddate',
       'creationdate','referredby','systemloanid_y','longitude_gps',
       'latitude_gps'],axis=1)

In [68]:
dataset['good_bad_flag'] = np.where(dataset['good_bad_flag'] == 'Good', 1 ,0)

In [69]:
dataset.head()

Unnamed: 0,good_bad_flag,bank_account_type,bank_name_clients,employment_status_clients,Age,loannumber_y,loanamount_y,totaldue_y,termdays_y,days_to_close_loan,Late payment
0,1,Other,Diamond Bank,Permanent,51.0,2.0,10000.0,13000.0,30.0,17.0,0.0
1,1,Other,Diamond Bank,Permanent,51.0,9.0,10000.0,13000.0,30.0,30.0,0.0
2,1,Other,Diamond Bank,Permanent,51.0,8.0,20000.0,23800.0,30.0,52.0,1.0
3,1,Other,Diamond Bank,Permanent,51.0,10.0,20000.0,24500.0,30.0,24.0,0.0
4,1,Other,Diamond Bank,Permanent,51.0,11.0,20000.0,24500.0,30.0,27.0,0.0


In [70]:
dataset.columns.values

array(['good_bad_flag', 'bank_account_type', 'bank_name_clients',
       'employment_status_clients', 'Age', 'loannumber_y', 'loanamount_y',
       'totaldue_y', 'termdays_y', 'days_to_close_loan', 'Late payment'],
      dtype=object)

In [71]:
X = dataset.drop(['good_bad_flag'], axis=1)
y = dataset['good_bad_flag']

In [72]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

### Pipeline

In [73]:
numeric_var = ['Age', 'loannumber_y', 'loanamount_y',
       'totaldue_y', 'termdays_y', 'days_to_close_loan', 'Late payment']

categorical_var = ['bank_account_type', 'bank_name_clients',
       'employment_status_clients']

In [74]:
numeric_impute = [ 
    'median', 
    'mean'
]

categorical_impute = [ 
    'most_frequent', 
    'constant'
]

feature_scaling = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler()
]

feature_encoding = [
    CatBoostEncoder(),
    TargetEncoder(),
    OneHotEncoder()
]

models = [
   
    LogisticRegression(),
    DecisionTreeClassifier(max_depth=3, random_state=0),
    RandomForestClassifier(max_depth=3, random_state=0),
]

appended_data = []

for numeric in numeric_impute:
  for categorical in categorical_impute:
    for scaler in feature_scaling:
      for encoder in feature_encoding:
        for model in models:
          numeric_transformer = Pipeline(
              steps=[('imputer', SimpleImputer(strategy=numeric)), ('scaler', scaler)]
          )

          categorical_transformer = Pipeline(
              steps=[("imputer", SimpleImputer(strategy=categorical)), ('encoder', encoder)]
          )

          preprocessor = ColumnTransformer(
              transformers=[
                  ("num", numeric_transformer, numeric_var),
                  ("cat", categorical_transformer, categorical_var)
              ]
          )
          pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])  
          pipe.fit(X_train, y_train)
          y_pred = pipe.predict(X_test)
          score = f1_score(y_test, y_pred)
          list = [numeric,categorical,scaler,encoder,model,score]
          appended_data.append(list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [75]:
columns = ['numeric_imputer','category_imputer','scaler','encoder','classifier','score']

df_results = pd.DataFrame(appended_data, columns = columns)
df_results.sort_values(by=['score'], ascending=False).head()

Unnamed: 0,numeric_imputer,category_imputer,scaler,encoder,classifier,score
69,mean,most_frequent,MinMaxScaler(),OneHotEncoder(),LogisticRegression(),0.895147
15,median,most_frequent,MinMaxScaler(),OneHotEncoder(),LogisticRegression(),0.895147
39,median,constant,MinMaxScaler(),TargetEncoder(),LogisticRegression(),0.895035
93,mean,constant,MinMaxScaler(),TargetEncoder(),LogisticRegression(),0.895035
96,mean,constant,MinMaxScaler(),OneHotEncoder(),LogisticRegression(),0.894972


In [78]:
numeric_transformer = Pipeline(
  steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())]
)

categorical_transformer = Pipeline(
  steps=[("imputer", SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder())]
)

preprocessor = ColumnTransformer(
  transformers=[
  ("num", numeric_transformer, numeric_var),
  ("cat", categorical_transformer, categorical_var)
  ]
)

k_vs_score = []
for k in range(1,10):
  selector = SelectKBest(score_func=f_classif, k = k)
  clf =  LogisticRegression()	
  pipe = Pipeline(steps=[("preprocessor", preprocessor), ('selector', selector), ('clf', clf)])  

  pipe.fit(X_train, y_train)
  y_pred = pipe.predict(X_test)
  score = f1_score(y_test, y_pred)
  print('k =  {} - Score = {}'.format(k,score))
  k_vs_score.append(score)

k =  1 - Score = 0.8948061152171712
k =  2 - Score = 0.8948061152171712
k =  3 - Score = 0.8948061152171712
k =  4 - Score = 0.8948061152171712
k =  5 - Score = 0.8948061152171712
k =  6 - Score = 0.8948967193195625
k =  7 - Score = 0.8947848101265822
k =  8 - Score = 0.8949873417721519
k =  9 - Score = 0.8948967193195625


In [80]:
numeric_transformer = Pipeline(
  steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())]
)

categorical_transformer = Pipeline(
  steps=[("imputer", SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder())]
)

preprocessor = ColumnTransformer(
  transformers=[
  ("num", numeric_transformer, numeric_var),
  ("cat", categorical_transformer, categorical_var)
  ]
)

selector = SelectKBest(score_func=f_classif, k = k)
model = LogisticRegression()

pipe = Pipeline(steps=[("preprocessor", preprocessor), ('selector', selector), ('model', model)])  
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test, y_pred)*100, 3)}%')
print(f'F1 Score: {round(f1_score(y_test,y_pred)*100, 3)}%')
print(f'Recall Score: {round(recall_score(y_test, y_pred)*100, 3)}%')
print(f'Precision Score : {round(precision_score(y_test, y_pred)*100,3 )}%')


Accuracy: 80.982%
F1 Score: 89.49%
Recall Score: 100.0%
Precision Score : 80.979%
