# <h1 style='font-family:Lato,sans-serif;color:#ca5800;text-align:center;'>Machine Learning Model Training</h1>

<span style='font-family:Lato,sans-serif;'>author: <span style='color:#1696d2'>[@baiochi](https://github.com/baiochi/)</span></span>

<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.2em;'> Load libraries and functions</h3>

In [2]:
# Libraries, functions and custom configurations
from config.custom import *
# Load CSS for custom Markdown. The file is read inside a context manager so
# the handle is closed deterministically instead of being left open for the
# lifetime of the kernel; the HTML object stays the cell's last expression so
# it is still rendered as the cell output.
with open('./config/custom.css', 'r') as css_file:
    custom_css = css_file.read()
HTML(custom_css)

Numpy: 1.20.3
Pandas: 1.3.4
Sklearn: 1.0.2
Matplotlib: 3.4.3
Seaborn: 0.11.2


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'>Analysis Setup</h3>

In [5]:
# Central experiment settings; the following cells read all their inputs from here.
configuration = dict(
    # Location of the input CSV and destination folder for pickled models
    df_path='./data/',
    df_name='application_train.csv',
    pickle_path='pickle_data/first_cycle/',
    # Estimator class (instantiated later with keyword arguments)
    estimator=XGBClassifier,
    # Candidate values per hyper-parameter, later handed to create_hps_space
    estimator_params=dict(
        learning_rate=np.linspace(0.00001, 0.1, 100),
        n_estimators=range(500, 1000),
        min_child_weight=[1, 5, 10],
        gamma=[0.5, 1, 1.5, 2, 5],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
        max_depth=[3, 4, 5, 6],
    ),
    # Size of dataset to split (passed to select_df_split_size)
    data_size=100,
    # Metric of evaluation
    metric='roc_auc',
    # Type of scoring: one of 'classification', 'regression', 'clustering'
    metric_type='classification',
    # Number of hyperopt iterations
    max_evals=10,
)


# Alternative search space for RandomForestClassifier, parked here as a bare
# string literal. It is not assigned to anything and does not alter
# `configuration`; as the cell's last expression its repr is echoed as the
# cell output.
"""
RandomForestClassifier
{
						'criterion' : ['gini', 'entropy'],
						'n_estimators' : range(1, 501),
						'max_depth' : [1, 2]
						},
"""

"\nRandomForestClassifier\n{\n\t\t\t\t\t\t'criterion' : ['gini', 'entropy'],\n\t\t\t\t\t\t'n_estimators' : range(1, 501),\n\t\t\t\t\t\t'max_depth' : [1, 2]\n\t\t\t\t\t\t},\n"

In [6]:
# Pull the configuration entries into short module-level names, grouped by role
# Data locations
df_path, df_name = configuration['df_path'], configuration['df_name']
pickle_path = configuration['pickle_path']
# Model and its hyper-parameter candidates
estimator, estimator_params = configuration['estimator'], configuration['estimator_params']
# Evaluation settings
metric, metric_type = configuration['metric'], configuration['metric_type']
max_evals = configuration['max_evals']
# Train/test split sizes are derived from the configured data size
data_size = configuration['data_size']
train_size, test_size = select_df_split_size(data_size)




<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Load Data</h3>

In [7]:
# Read the raw dataset, report its dimensions, then preview the first rows
dataset_location = df_path + df_name
df = pd.read_csv(dataset_location)
print(f'{CYAN}Dataset info{WHITE}\nObservations: {df.shape[0]}\nFeatures: {df.shape[1]}')
df.head()

[36mDataset info[39m
Observations: 246008
Features: 122


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,...,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,456162,0,Cash loans,F,N,N,0,112500.0,700830.0,22738.5,585000.0,Unaccompanied,Working,Incomplete higher,Single / not married,House / apartment,0.02,-8676,-813,-4163.0,-1363,,1,1,1,1,0,0,Core staff,1.0,2,2,FRIDAY,17,0,0,0,1,1,0,Trade: type 2,,0.699,0.171,0.062,0.03,0.976,0.674,0.005,0.0,...,0.062,0.03,0.976,0.678,0.005,0.0,0.103,0.167,0.042,0.0,0.051,0.052,0.0,0.0,reg oper account,block of flats,0.04,Block,No,0.0,0.0,0.0,0.0,-589.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,134978,0,Cash loans,F,N,N,0,90000.0,375322.5,14422.5,324000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.025,-13583,-223,-3554.0,-3287,,1,1,0,1,0,0,High skill tech staff,2.0,2,2,MONDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.541,0.2,0.769,0.023,0.057,0.981,0.735,0.016,0.0,...,0.023,0.057,0.981,0.738,0.016,0.0,0.103,0.042,0.083,0.013,0.019,0.019,0.0,0.0,reg oper account,block of flats,0.016,Block,No,0.0,0.0,0.0,0.0,-1409.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
2,318952,0,Cash loans,M,Y,N,0,180000.0,544491.0,16047.0,454500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.036,-13993,-6202,-7971.0,-4175,9.0,1,1,1,1,0,0,Managers,2.0,2,2,THURSDAY,15,0,0,0,0,0,0,Business Entity Type 1,,0.705,0.626,,,,,,,...,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,-675.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,3.0
3,361264,0,Cash loans,F,N,Y,0,270000.0,814041.0,28971.0,679500.0,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,0.046,-22425,365243,-11805.0,-1732,,1,0,0,1,1,0,,2.0,1,1,TUESDAY,9,0,0,0,0,0,0,XNA,,0.725,0.811,,,,,,,...,,,,,,,,,,,,,,,,,0.012,,No,2.0,0.0,2.0,0.0,-1588.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
4,260639,0,Cash loans,F,N,Y,0,144000.0,675000.0,21906.0,675000.0,Unaccompanied,Working,Secondary / secondary special,Separated,House / apartment,0.026,-18839,-2763,-5069.0,-2381,,1,1,0,1,1,0,Laborers,1.0,2,2,FRIDAY,16,0,0,0,0,0,0,Transport: type 4,0.592,0.706,0.331,0.191,0.18,0.989,0.85,0.034,0.0,...,0.193,0.18,0.989,0.852,0.035,0.0,0.448,0.167,0.208,0.28,0.158,0.21,0.0,0.0,reg oper account,block of flats,0.162,Panel,No,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,10.0,0.0,0.0


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Define target, features and split data</h3>

In [8]:
# Separate the target from the predictors. The label and the applicant id are
# required columns and are dropped strictly (a missing one raises KeyError).
# 'Unnamed: 0' is the unnamed leading column of the CSV (a row counter,
# presumably an index written by an earlier export - confirm against the data
# preparation step); it carries no applicant information, so it is removed as
# well rather than being fed to the model as a feature. errors='ignore' keeps
# the cell working on files that do not contain that column.
X = (
    df.drop(columns=['TARGET', 'SK_ID_CURR'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['TARGET']

# Stratify on the target so both splits keep the same class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=train_size,
    test_size=test_size,
    stratify=y,
    random_state=42,
)

# Column lists by dtype, consumed by the pre-processing ColumnTransformer
numeric_features     = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=np.number).columns.tolist()

print(f'{CYAN}Train dataset size:{WHITE} {X_train.shape}')
print(f'{CYAN}Test dataset size:{WHITE} {X_test.shape}')

[36mTrain dataset size:[39m (196806, 120)
[36mTest dataset size:[39m (49202, 120)


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Create Baseline Model</h3>

In [9]:
# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numeric_features_pipeline = Pipeline([
    ('impute_num', SimpleImputer(strategy='mean')),
    ('std', MinMaxScaler())
])
# Categorical columns: fill missing values with a constant label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_features_pipeline = Pipeline([
    ('impute_cat', SimpleImputer(strategy='constant', fill_value='unknow')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Route each column list to its matching sub-pipeline.
pre_processing_pipeline = ColumnTransformer([
    ('numeric_transformer', numeric_features_pipeline, numeric_features),
    ('categorical_transformer', categorical_features_pipeline ,categorical_features)
])
# Baseline: the configured estimator with default hyper-parameters.
# The final step is labelled 'est' rather than 'rf': the estimator class comes
# from the configuration (it is not necessarily a random forest), and 'est' is
# the label the tuned pipeline uses for the same step.
pipeline = Pipeline([
    ('pre_processing', pre_processing_pipeline),
    ('est', estimator(random_state=42))
]).fit(X_train, y_train)

print(f'Score for baseline model with {estimator.__name__}')
display_metrics(X_train, X_test, y_train, y_test, pipeline, metric_type)

Score for baseline model with XGBClassifier
[36mTrain metrics:[39m
roc_auc_score 0.859
f1_score 0.149
[36mTest metrics:[39m
roc_auc_score 0.747
f1_score 0.061


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Hyper-parameter Optimization</h3>

In [10]:
# create hyper-parameter space
hps_space = create_hps_space(hps_params=estimator_params)
# create objective function
objective_function = create_fmin_function(estimator=estimator,
                                          pp_pipeline=pre_processing_pipeline,
                                          X_train=X_train, y_train=y_train,
                                          metric=metric,
                                          hps_space=hps_space)
# Keep a handle on the trial history: passing an anonymous Trials() to fmin
# throws away the per-trial losses and sampled parameters once the search
# returns, and the search is far too slow to repeat just to inspect them.
hps_trials = Trials()
# run hyper-parameter search (seeded generator for reproducible suggestions)
hps_search = fmin(fn=objective_function,
                  space=hps_space,
                  algo=tpe.suggest,
                  trials=hps_trials,
                  rstate=np.random.default_rng(42),
                  max_evals=max_evals)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]







 10%|█         | 1/10 [6:35:46<59:21:55, 23746.17s/trial, best loss: -0.7552526840812375]







 20%|██        | 2/10 [7:40:29<26:48:14, 12061.87s/trial, best loss: -0.7552526840812375]











 30%|███       | 3/10 [8:32:17<15:30:14, 7973.50s/trial, best loss: -0.7552526840812375] 











 40%|████      | 4/10 [8:59:00<9:05:51, 5458.66s/trial, best loss: -0.7566107822754908] 











 50%|█████     | 5/10 [9:35:40<5:56:57, 4283.53s/trial, best loss: -0.7566107822754908]











 60%|██████    | 6/10 [10:18:52<4:07:13, 3708.44s/trial, best loss: -0.7566107822754908]











 70%|███████   | 7/10 [11:20:49<3:05:33, 3711.02s/trial, best loss: -0.7566107822754908]







 80%|████████  | 8/10 [12:14:29<1:58:29, 3554.97s/trial, best loss: -0.7566107822754908]











 90%|█████████ | 9/10 [13:18:02<1:00:35, 3635.57s/trial, best loss: -0.7572181789172832]







100%|██████████| 10/10 [14:08:53<00:00, 5093.37s/trial, best loss: -0.7572181789172832] 


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Metrics summary</h3>

In [11]:
# Resolve the fmin result against the search space to get the selected values
hps_best_params = space_eval(hps_space, hps_search)
print(f'{CYAN}{estimator.__name__}{WHITE} best parameters:')
for param_name, param_value in hps_best_params.items():
    print(f'{GREEN}', param_name, f'{WHITE}', param_value)

# Rebuild the pipeline around an estimator configured with the selected
# parameters and fit it on the training split
tuned_steps = [
    ('pp', pre_processing_pipeline),
    ('est', estimator(**hps_best_params, random_state=42)),
]
hps_model = Pipeline(tuned_steps).fit(X_train, y_train)

# Report train/test metrics for the tuned model
print('\nScore for best model with hyperopt:')
display_metrics(X_train, X_test, y_train, y_test, hps_model, metric_type)

[36mXGBClassifier[39m best parameters:
[32m colsample_bytree [39m 0.6
[32m gamma [39m 0.5
[32m learning_rate [39m 0.01718
[32m max_depth [39m 6
[32m min_child_weight [39m 5
[32m n_estimators [39m 861
[32m subsample [39m 0.6

Score for best model with hyperopt:
[36mTrain metrics:[39m
roc_auc_score 0.817
f1_score 0.066
[36mTest metrics:[39m
roc_auc_score 0.760
f1_score 0.039


<h3 style='font-family:Lato,sans-serif;color:#fdbf11;font-size: 1.5em;'> Saving model with Pickle</h3>

In [12]:
# Build the size tag used in the file name, zero-padding single-digit sizes
# (e.g. 5 -> '05'). The tag lives in its own variable: overwriting `data_size`
# with a string would make this cell raise TypeError on a re-run (str < int)
# and would silently change the variable's type for any other cell.
data_size_tag = f'0{data_size}' if data_size < 10 else str(data_size)

# Make sure the destination folder exists so the fitted model is not lost to a
# FileNotFoundError at the very last step.
if pickle_path:
    os.makedirs(pickle_path, exist_ok=True)

# save file
model_file = os.path.join(pickle_path, f'{estimator.__name__}_{data_size_tag}_pct.pickle')
with open(model_file, 'wb') as file:
    pickle.dump(hps_model, file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'File saved on {os.path.realpath(file.name)}')

File saved on /Users/baiochi/OneDrive/GitHub/LetsCode/5_Machine_Learning/projeto/pickle_data/first_cycle/XGBClassifier_100_pct.pickle
