In [4]:
import pandas as pd
import numpy as np
import joblib 

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans


import xgboost as xgb
import re


In [5]:
pip list

Package                            Version
---------------------------------- -----------
absl-py                            2.0.0
aiobotocore                        2.7.0
aiohttp                            3.9.0
aioitertools                       0.7.1
aiosignal                          1.2.0
alabaster                          0.7.12
anaconda-anon-usage                0.4.2
anaconda-catalogs                  0.2.0
anaconda-client                    1.12.1
anaconda-cloud-auth                0.1.4
anaconda-navigator                 2.5.0
anaconda-project                   0.11.1
annotated-types                    0.6.0
anyio                              3.5.0
appdirs                            1.4.4
applaunchservices                  0.3.0
appnope                            0.1.2
appscript                          1.1.2
archspec                           0.2.1
argon2-cffi                        21.3.0
argon2-cffi-bindings               21.2.0
arrow                              1.2.3
ast

In [6]:
# NOTE: downgrade to Python 3.11 to run this cell (TensorFlow / CatBoost imports).
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import catboost 
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor



In [7]:
# Binary classification datasets (adults, german credit).
# Both training tables are split the same way: every column except the last
# becomes the feature frame X_*, and the last column becomes the target y_*.
bc_adults = pd.read_csv('binary classification/datasets/adults/adults_train.csv')
X_bc_adults, y_bc_adults = bc_adults.iloc[:, :-1], bc_adults.iloc[:, -1]

bc_german = pd.read_csv('binary classification/datasets/german credit/german_credit_train.csv')
X_bc_german, y_bc_german = bc_german.iloc[:, :-1], bc_german.iloc[:, -1]


In [9]:
# Regression dataset (insurance): the last column is the target, the rest are features.
reg = pd.read_csv('regression/datasets/insurance/insurance_train.csv')
X_reg, y_reg = reg.iloc[:, :-1], reg.iloc[:, -1]

In [10]:
# Multiclass classification dataset (healthcare): the last column is the target,
# the rest are features.
mc = pd.read_csv('multiclass classification/datasets/healthcare/healthcare_train.csv')
X_mc, y_mc = mc.iloc[:, :-1], mc.iloc[:, -1]

In [11]:
# Clustering dataset (customers): there is no target column, so the whole table
# is the feature matrix. X_clust is a second name for the same frame, not a copy.
X_clust = clust = pd.read_csv('clustering/datasets/customers/customers.csv')

In [12]:
# Train and persist two XGBoost binary classifiers on the adults dataset:
# one through the scikit-learn wrapper and one through the native Booster API.

# 1) scikit-learn wrapper; only the objective and eval metric are set explicitly.
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(X_bc_adults, y_bc_adults)
joblib.dump(model, 'binary classification/models/xgboost-2_0_2_sklearn-binary_classification_adults.joblib')

# 2) Native Booster trained with xgb.train on a DMatrix.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    # Native-API name for the RNG seed, as used by the regression and
    # multiclass booster cells (instead of the sklearn-style 'random_state').
    'seed': 42
}
dtrain = xgb.DMatrix(X_bc_adults, label=y_bc_adults)
# Round count spelled out: 10 is xgb.train's default, so the trained booster is
# unchanged. Note the regression / multiclass boosters use 100 rounds.
xgb_booster = xgb.train(xgb_params, dtrain, num_boost_round=10)
joblib.dump(xgb_booster, 'binary classification/models/xgboost-2_0_2-binary_classification_adults.joblib')


['binary classification/models/xgboost-2_0_2-binary_classification_adults.joblib']

In [13]:
# Train and persist two XGBoost binary classifiers on the german credit dataset:
# one through the scikit-learn wrapper and one through the native Booster API.

# 1) scikit-learn wrapper; only the objective and eval metric are set explicitly.
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(X_bc_german, y_bc_german)
joblib.dump(model, 'binary classification/models/xgboost-2_0_2_sklearn-binary_classification_german.joblib')

# 2) Native Booster trained with xgb.train on a DMatrix.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    # Native-API name for the RNG seed, as used by the regression and
    # multiclass booster cells (instead of the sklearn-style 'random_state').
    'seed': 42
}
dtrain = xgb.DMatrix(X_bc_german, label=y_bc_german)
# Round count spelled out: 10 is xgb.train's default, so the trained booster is
# unchanged. Note the regression / multiclass boosters use 100 rounds.
xgb_booster = xgb.train(xgb_params, dtrain, num_boost_round=10)
joblib.dump(xgb_booster, 'binary classification/models/xgboost-2_0_2-binary_classification_german.joblib')


['binary classification/models/xgboost-2_0_2-binary_classification_german.joblib']

In [15]:
# XGBoost regression on the insurance data, trained through both APIs and
# pickled with joblib: first the scikit-learn estimator, then a native Booster.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
model.fit(X_reg, y_reg)
joblib.dump(model, 'regression/models/xgboost-2_0_2_sklearn-regression_insurance.joblib')

# Native API: the booster is trained for 100 rounds on a DMatrix built from the same data.
dtrain = xgb.DMatrix(X_reg, label=y_reg)
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    max_depth=3,
    learning_rate=0.1,
    seed=42,
)
xgb_booster = xgb.train(xgb_params, dtrain, num_boost_round=100)
joblib.dump(xgb_booster, 'regression/models/xgboost-2_0_2-regression_insurance.joblib')



['regression/models/xgboost-2_0_2-regression_insurance.joblib']

In [16]:
# XGBoost multiclass classification on the healthcare data, trained through both
# APIs and pickled with joblib. The class count is taken from the distinct
# values found in y_mc in both cases.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(set(y_mc)),
    eval_metric='mlogloss',
)
model.fit(X_mc, y_mc)
joblib.dump(model, 'multiclass classification/models/xgboost-2_0_2_sklearn-multiclass_classification_healthcare.joblib')

# Native API: the booster is trained for 100 rounds on a DMatrix built from the same data.
dtrain = xgb.DMatrix(X_mc, label=y_mc)
xgb_params = dict(
    objective='multi:softmax',
    num_class=len(set(y_mc)),
    eval_metric='merror',
    max_depth=3,
    learning_rate=0.1,
    seed=42,
)
xgb_booster = xgb.train(xgb_params, dtrain, num_boost_round=100)
joblib.dump(xgb_booster, 'multiclass classification/models/xgboost-2_0_2-multiclass_classification_healthcare.joblib')


['multiclass classification/models/xgboost-2_0_2-multiclass_classification_healthcare.joblib']

In [17]:
# Train a small dense Keras network for binary classification on the adults
# dataset and save it in two formats.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (same seed value as the booster cells).
tf.keras.utils.set_random_seed(42)

model = Sequential()
model.add(Dense(32, input_dim=X_bc_adults.shape[1], activation='relu'))
# Two softmax outputs + sparse categorical cross-entropy: the binary target is
# treated as two integer classes (assumes labels are encoded as 0/1 — TODO confirm).
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_bc_adults, y_bc_adults, epochs=10, batch_size=32)

# Persist with Keras' own model.save (not joblib), once as HDF5 and once as .keras.
model.save('binary classification/models/tensorflow-2_15_0-binary_classification_adults.h5')
model.save('binary classification/models/tensorflow-2_15_0-binary_classification_adults.keras')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [18]:
# Train a small dense Keras network for binary classification on the german
# credit dataset and save it in two formats.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (same seed value as the booster cells).
tf.keras.utils.set_random_seed(42)

model = Sequential()
model.add(Dense(32, input_dim=X_bc_german.shape[1], activation='relu'))
# Two softmax outputs + sparse categorical cross-entropy: the binary target is
# treated as two integer classes (assumes labels are encoded as 0/1 — TODO confirm).
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_bc_german, y_bc_german, epochs=10, batch_size=32)

# Persist with Keras' own model.save (not joblib), once as HDF5 and once as .keras.
model.save('binary classification/models/tensorflow-2_15_0-binary_classification_german.h5')
model.save('binary classification/models/tensorflow-2_15_0-binary_classification_german.keras')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [19]:
# Train a small dense Keras network for regression on the insurance dataset
# and save it in two formats.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (same seed value as the booster cells).
tf.keras.utils.set_random_seed(42)

model = Sequential()
model.add(Dense(32, input_dim=X_reg.shape[1], activation='relu'))
# Single linear output unit trained with mean squared error.
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_reg, y_reg, epochs=10, batch_size=32)

# Persist with Keras' own model.save (not joblib), once as HDF5 and once as .keras.
model.save('regression/models/tensorflow-2_15_0-regression_insurance.h5')
model.save('regression/models/tensorflow-2_15_0-regression_insurance.keras')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [20]:
# Train a small dense Keras network for multiclass classification on the
# healthcare dataset and save it in two formats.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (same seed value as the booster cells).
tf.keras.utils.set_random_seed(42)

# Derive the output width from the labels instead of hard-coding 3, the same way
# the XGBoost multiclass cell computes num_class.
# Assumes labels are integers 0..n_classes-1, as sparse categorical
# cross-entropy expects — TODO confirm against the dataset.
n_classes = len(set(y_mc))

model = Sequential()
model.add(Dense(32, input_dim=X_mc.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_mc, y_mc, epochs=10, batch_size=32)

# Persist with Keras' own model.save (not joblib), once as HDF5 and once as .keras.
model.save('multiclass classification/models/tensorflow-2_15_0-multiclass_classification_healthcare.h5')
model.save('multiclass classification/models/tensorflow-2_15_0-multiclass_classification_healthcare.keras')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [21]:
# Train and persist two CatBoost binary classifiers on the adults dataset.
# Both are CatBoostClassifier instances; they differ in hyper-parameters and in
# how the training data is supplied (DataFrame/Series vs. catboost.Pool).

# 1) Fitted directly on the pandas objects; saved under the "*_sklearn" name.
# No early_stopping_rounds here: without an eval_set the overfitting detector
# has no validation data to monitor, so the argument never stopped training
# (all 500 iterations ran).
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='Logloss')
model.fit(X_bc_adults, y_bc_adults, verbose=100)
joblib.dump(model, 'binary classification/models/catboost-1_2_2_sklearn-binary_classification_adults.joblib')

# 2) Fitted on a catboost.Pool with a smaller, silent configuration.
train_pool = catboost.Pool(X_bc_adults, label=y_bc_adults)
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'verbose': 0
}
catboost_model = CatBoostClassifier(**catboost_params)
catboost_model.fit(train_pool)
joblib.dump(catboost_model, 'binary classification/models/catboost-1_2_2-binary_classification_adults.joblib')



0:	learn: 0.6482069	total: 73.4ms	remaining: 36.6s
100:	learn: 0.3314940	total: 859ms	remaining: 3.39s
200:	learn: 0.3221972	total: 1.6s	remaining: 2.38s
300:	learn: 0.3143325	total: 2.44s	remaining: 1.61s
400:	learn: 0.3079725	total: 3.29s	remaining: 813ms
499:	learn: 0.3038298	total: 4.14s	remaining: 0us


['binary classification/models/catboost-1_2_2-binary_classification_adults.joblib']

In [22]:
# Train and persist two CatBoost binary classifiers on the german credit dataset.
# Both are CatBoostClassifier instances; they differ in hyper-parameters and in
# how the training data is supplied (DataFrame/Series vs. catboost.Pool).

# 1) Fitted directly on the pandas objects; saved under the "*_sklearn" name.
# No early_stopping_rounds here: without an eval_set the overfitting detector
# has no validation data to monitor, so the argument never stopped training
# (all 500 iterations ran).
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='Logloss')
model.fit(X_bc_german, y_bc_german, verbose=100)
joblib.dump(model, 'binary classification/models/catboost-1_2_2_sklearn-binary_classification_german.joblib')

# 2) Fitted on a catboost.Pool with a smaller, silent configuration.
train_pool = catboost.Pool(X_bc_german, label=y_bc_german)
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'verbose': 0
}
catboost_model = CatBoostClassifier(**catboost_params)
catboost_model.fit(train_pool)
joblib.dump(catboost_model, 'binary classification/models/catboost-1_2_2-binary_classification_german.joblib')


0:	learn: 0.6680273	total: 15ms	remaining: 7.48s
100:	learn: 0.1287385	total: 939ms	remaining: 3.71s
200:	learn: 0.0516764	total: 1.89s	remaining: 2.81s
300:	learn: 0.0282615	total: 2.84s	remaining: 1.88s
400:	learn: 0.0186566	total: 3.78s	remaining: 934ms
499:	learn: 0.0136905	total: 4.72s	remaining: 0us


['binary classification/models/catboost-1_2_2-binary_classification_german.joblib']

In [23]:
# Train and persist two CatBoost regressors on the insurance dataset.
# Both are CatBoostRegressor instances; they differ in hyper-parameters and in
# how the training data is supplied (DataFrame/Series vs. catboost.Pool).

# 1) Fitted directly on the pandas objects; saved under the "*_sklearn" name.
# No early_stopping_rounds here: without an eval_set the overfitting detector
# has no validation data to monitor, so the argument never stopped training
# (all 500 iterations ran).
model = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.05, loss_function='RMSE')
model.fit(X_reg, y_reg, verbose=100)
joblib.dump(model, 'regression/models/catboost-1_2_2_sklearn-regression_insurance.joblib')

# 2) Fitted on a catboost.Pool with a smaller, silent configuration.
train_pool = catboost.Pool(X_reg, label=y_reg)
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'RMSE',
    'verbose': 0
}
catboost_model = CatBoostRegressor(**catboost_params)
catboost_model.fit(train_pool)
joblib.dump(catboost_model, 'regression/models/catboost-1_2_2-regression_insurance.joblib')



0:	learn: 11593.9832656	total: 4.14ms	remaining: 2.07s
100:	learn: 3574.9122511	total: 150ms	remaining: 592ms
200:	learn: 2789.0024317	total: 331ms	remaining: 493ms
300:	learn: 2323.7274404	total: 528ms	remaining: 349ms
400:	learn: 1962.7708051	total: 726ms	remaining: 179ms
499:	learn: 1700.3736051	total: 918ms	remaining: 0us


['regression/models/catboost-1_2_2-regression_insurance.joblib']

In [24]:
# Train and persist two CatBoost multiclass classifiers on the healthcare dataset.
# Both are CatBoostClassifier instances; they differ in hyper-parameters and in
# how the training data is supplied (DataFrame/Series vs. catboost.Pool).

# 1) Fitted directly on the pandas objects; saved under the "*_sklearn" name.
# No early_stopping_rounds here: without an eval_set the overfitting detector
# has no validation data to monitor, so the argument never stopped training
# (all 500 iterations ran).
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='MultiClass')
model.fit(X_mc, y_mc, verbose=100)
joblib.dump(model, 'multiclass classification/models/catboost-1_2_2_sklearn-multiclass_classification_healthcare.joblib')

# 2) Fitted on a catboost.Pool with a smaller, silent configuration.
train_pool = catboost.Pool(X_mc, label=y_mc)
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'MultiClass',
    'verbose': 0
}
catboost_model = CatBoostClassifier(**catboost_params)
catboost_model.fit(train_pool)
joblib.dump(catboost_model, 'multiclass classification/models/catboost-1_2_2-multiclass_classification_healthcare.joblib')

0:	learn: 1.0947772	total: 13.9ms	remaining: 6.92s
100:	learn: 0.8642666	total: 767ms	remaining: 3.03s
200:	learn: 0.7124993	total: 1.53s	remaining: 2.27s
300:	learn: 0.6120539	total: 2.29s	remaining: 1.51s
400:	learn: 0.5325977	total: 3.05s	remaining: 752ms
499:	learn: 0.4668946	total: 3.8s	remaining: 0us


['multiclass classification/models/catboost-1_2_2-multiclass_classification_healthcare.joblib']