In [1]:
import joblib
import numpy as np
import pandas as pd

import sklearn.metrics as metrics
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn_pandas import DataFrameMapper
from feature_engine.imputation import DropMissingData

set_config(display="diagram")

In [5]:
# Load the preprocessed banking dataset; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv("../data/dataset_processed/banking_processed.csv", sep=',')

In [6]:
# Preview the first rows to check the load and inspect the columns.
df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,...,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,poutcome_failure,poutcome_nonexistent,poutcome_success,y
0,0.381527,-0.18623,-0.565922,0.195414,-0.349494,0.839061,-0.227465,0.951267,0.773575,0.84517,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1.245157,-0.463926,-0.565922,0.195414,-0.349494,-0.115781,-0.649003,-0.323542,0.230456,0.398115,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
2,-1.153816,0.311309,0.156105,-5.117342,3.691766,-1.134279,0.828107,0.15181,-1.667578,-2.428157,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1
3,-0.098268,-0.282652,-0.204909,0.195414,-0.349494,-1.197935,-0.864955,-1.425496,-1.277824,-0.940281,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,1.437075,-0.467783,-0.565922,-5.133393,1.671136,-1.898153,-2.374889,1.966794,-1.586859,-1.257233,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1


In [7]:
# Separate the target column from the feature columns.
y = df['y']
X = df.drop(columns=['y'])

In [11]:
# Check how many target values were extracted.
y.shape

(41188,)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that re-running the notebook reproduces the same fitted
# model and the same metrics (the train/test split is seeded with 42 as well).
clas = RandomForestClassifier(random_state=42)

In [16]:
# Train the classifier on the training split.
# NOTE(review): the output saved with this cell is a TypeError about a
# 'verbose' keyword, which this call does not pass — the output looks stale;
# re-run the cell to refresh it.
clas.fit(X_train, y_train)

TypeError: fit() got an unexpected keyword argument 'verbose'

In [18]:
# Predicted class labels for the held-out split.
y_pred = clas.predict(X_test)

In [19]:
# Display the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 1, 0])

In [20]:
def predict_prob_on_test_data(model,X_test):
    """Return the class-probability estimates of ``model`` for ``X_test``.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing a ``predict_proba`` method.
    X_test : array-like
        Samples to score; passed to ``predict_proba`` unchanged.

    Returns
    -------
    Whatever ``model.predict_proba(X_test)`` returns.
    """
    return model.predict_proba(X_test)

# Predicted probabilities for the held-out split; these feed the log-loss
# figure computed by get_metrics.
pred_prob = predict_prob_on_test_data(clas, X_test)

In [22]:
def get_metrics(y_true, y_pred, y_pred_prob):
    """Summarise classification quality as a dict of rounded scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_true`` for accuracy,
        precision and recall.
    y_pred_prob : array-like
        Predicted probabilities, compared against ``y_true`` for log loss.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'entropy'``
        (the log loss), each rounded to two decimal places.
    """
    from sklearn.metrics import accuracy_score,precision_score,recall_score,log_loss

    raw_scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'entropy': log_loss(y_true, y_pred_prob),
    }
    return {label: round(score, 2) for label, score in raw_scores.items()}

In [23]:
# Score the held-out predictions.
get_metrics(y_test, y_pred, pred_prob)

{'accuracy': 0.91, 'precision': 0.64, 'recall': 0.49, 'entropy': 0.2}

In [None]:
class CabinFeatureTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the first letter of the ``Cabin`` column.

    ``fit`` records which ``Cabin_<letter>`` dummy columns occur in the frame
    it is given; ``transform`` replaces ``Cabin`` with exactly those columns,
    so the output layout does not depend on which letters happen to be present
    in the frame being transformed.

    Neither method modifies the frame passed to it.

    Attributes
    ----------
    cabin_columns : pandas.Index
        Dummy column names learned by ``fit``.
    """

    @staticmethod
    def _cabin_dummies(x):
        """Return the dummy columns for the cabin letter of every row of ``x``."""
        # Missing cabins are labelled 'U' (for Unknown); every other value is
        # reduced to its first character. Working on a derived Series keeps the
        # caller's frame untouched and avoids a chained in-place fillna, which
        # does not write back under pandas Copy-on-Write.
        cabin_letters = x['Cabin'].fillna('U').str[0]
        return pd.get_dummies(cabin_letters, prefix='Cabin')

    def fit(self, x, y=None):
        """Learn the set of cabin dummy columns present in ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Cabin`` column.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        CabinFeatureTransformer
            The fitted transformer itself.
        """
        self.cabin_columns = self._cabin_dummies(x).columns
        return self

    def transform(self, x):
        """Return a copy of ``x`` with ``Cabin`` replaced by its dummy columns.

        Must be called after ``fit``. Letters not seen during ``fit`` are
        dropped; learned letters absent from ``x`` are added as all-zero
        columns.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Cabin`` column.

        Returns
        -------
        pandas.DataFrame
            The remaining columns of ``x`` followed by the learned
            ``Cabin_<letter>`` columns.
        """
        cabin_dummies = self._cabin_dummies(x).reindex(
            columns=self.cabin_columns, fill_value=0
        )
        return pd.concat([x.drop(columns='Cabin'), cabin_dummies], axis=1)

In [None]:
# NOTE(review): bare expression that only displays the imported
# DataFrameMapper class; it looks like a leftover scratch cell — confirm and
# remove if unused.
DataFrameMapper