In [1]:
import sys
# Show the path of the Python interpreter that is executing this kernel.
print(sys.executable)

c:\Users\Ray Joshi\Desktop\MLFlow\venv\python.exe


In [2]:
# NOTE(review): `!python` is looked up by the shell, so it is not guaranteed
# to be the interpreter reported by sys.executable in the previous cell —
# confirm, or read the version from `sys.version` instead.
!python --version

Python 3.9.16


### Functions for all the steps involved in the complete model training lifecycle

In [3]:
import pandas as pd
import numpy as np

In [4]:
def load_data(path):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    path
        Anything ``pandas.read_csv`` accepts as its first argument
        (local path, URL, or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with pandas' default options.
    """
    return pd.read_csv(path)


In [5]:
# Fetch the banking CSV from GitHub (requires network access) and preview
# the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column — presumably a row
# index saved with the CSV. Nothing in the visible steps drops it, so confirm
# whether it should really reach the model as a feature.
data = load_data('https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/banking.csv')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [6]:
def data_cleaning(data):
    """Drop every row that contains a missing value, reporting NA counts.

    The per-column NA counts are printed before and after the drop so the
    effect is visible in the cell output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows without missing values.
    """
    missing_before = data.isna().sum()
    print("na values available in data \n")
    print(missing_before)

    cleaned = data.dropna()
    missing_after = cleaned.isna().sum()
    print("after dropping na values \n")
    print(missing_after)

    return cleaned

In [7]:
def preprocessing(data):
    """One-hot encode the categorical features and tidy the column names.

    Steps:

    1. Collapse the ``basic.9y`` / ``basic.6y`` / ``basic.4y`` education
       levels into a single ``'Basic'`` level.
    2. Replace each categorical column listed in ``cat_vars`` with its dummy
       columns, named ``<column>_<level>``. The remaining columns keep their
       original order and the dummy columns follow them.
    3. Replace ``.`` and spaces in the resulting column names with ``_``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``cat_vars``. It is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the categorical columns replaced by dummy columns.

    Raises
    ------
    KeyError
        If one of the expected categorical columns is missing.
    """
    cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']

    # Build a new frame instead of assigning into `data`: writing to the
    # argument would silently change the caller's DataFrame (and make the
    # cell non-idempotent when re-run).
    recoded = data.assign(
        education = np.where(data['education'].isin(basic_levels), 'Basic', data['education'])
    )

    # One call encodes all categorical columns and drops the originals; the
    # dummies are appended in `cat_vars` order, after the untouched columns.
    final_data = pd.get_dummies(recoded, columns = cat_vars)

    # regex=False: '.' must be a literal dot. Older pandas releases default
    # to regex=True for str.replace, where '.' is a wildcard.
    final_data.columns = (
        final_data.columns
        .str.replace('.', '_', regex = False)
        .str.replace(' ', '_', regex = False)
    )
    return final_data

In [8]:
def train_test_split(final_data):
    """Split the prepared frame into stratified train and test partitions.

    Every column except ``'y'`` is treated as a feature; the ``'y'`` column
    is kept as a single-column DataFrame. 30% of the rows go to the test
    partition, stratified on ``'y'``, with a fixed ``random_state`` of 42.

    Parameters
    ----------
    final_data : pandas.DataFrame
        Frame containing the features and a ``'y'`` target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Aliased because this wrapper has the same name as the scikit-learn
    # helper it delegates to.
    from sklearn.model_selection import train_test_split as _sk_split

    is_target = final_data.columns == 'y'
    features = final_data.loc[:, ~is_target]
    target = final_data.loc[:, is_target]

    train_X, test_X, train_y, test_y = _sk_split(
        features, target, test_size = 0.3, stratify = target, random_state = 42
    )
    return train_X, test_X, train_y, test_y

In [9]:
def over_sampling_target_class(X_train, y_train):
    """Oversample the training data with SMOTE and report the class balance.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; its column labels are reused for the output.
    y_train
        Training target passed to ``SMOTE.fit_resample`` alongside
        ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train)`` after resampling. The target frame has a
        single column named ``'y'``.
    """
    from imblearn.over_sampling import SMOTE

    feature_names = X_train.columns
    sampler = SMOTE(random_state = 0)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    X_balanced = pd.DataFrame(data = X_resampled, columns = feature_names)
    y_balanced = pd.DataFrame(data = y_resampled, columns = ['y'])

    # Size of the resampled set and of each class within it.
    n_total = len(X_balanced)
    n_negative = len(y_balanced[y_balanced['y'] == 0])
    n_positive = len(y_balanced[y_balanced['y'] == 1])

    print("Length of oversampled data is: ", n_total)
    print("Number of no subscription in oversampled data is: ", n_negative)
    print("Number of subscription: ", n_positive)
    print("Proportion of no subscription in oversampled data is: ", n_negative/n_total)
    print("Proportion of subscription is: ", n_positive/n_total)

    return X_balanced, y_balanced

In [10]:
def training_basic_classifier(X_train, y_train, n_estimators=101, random_state=None):
    """Fit a baseline random-forest classifier.

    Parameters
    ----------
    X_train
        Training features.
    y_train
        Training target. A single-column 2-D input (for example a one-column
        DataFrame) is flattened to 1-D before fitting.
    n_estimators : int, default 101
        Number of trees in the forest.
    random_state : int or None, default None
        Seed forwarded to the forest. The default keeps the original
        unseeded behaviour; pass an integer for reproducible runs.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The fitted model.
    """
    from sklearn.ensemble import RandomForestClassifier

    # scikit-learn emits a DataConversionWarning when y has shape
    # (n_samples, 1); flatten that case and leave any other shape untouched.
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, target)

    return model

In [11]:
def predict_on_test_data(model, X_test):
    """Return the model's predictions for ``X_test``.

    Parameters
    ----------
    model
        Any object exposing a ``predict`` method.
    X_test
        Input forwarded unchanged to ``model.predict``.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    return model.predict(X_test)

In [13]:
def get_metrics(y_true, y_pred, y_pred_prob):
    """Compute accuracy, precision, recall and log loss, rounded to 2 decimals.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, used for accuracy, precision and recall.
    y_pred_prob
        Predicted probabilities, used for the log loss.

    Returns
    -------
    dict
        Keys are ``'accuracy:'``, ``'precision: '``, ``'recall: '`` and
        ``'entropy: '`` (the colons and trailing spaces are part of the
        keys); values are the rounded scores.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

    raw_scores = {
        'accuracy:': accuracy_score(y_true, y_pred),
        'precision: ': precision_score(y_true, y_pred),
        'recall: ': recall_score(y_true, y_pred),
        'entropy: ': log_loss(y_true, y_pred_prob),
    }
    return {label: round(score, 2) for label, score in raw_scores.items()}

In [14]:
def create_roc_auc_plot(clf, X_data, y_data):
    """Plot the ROC curve of a fitted classifier and save it as a PNG.

    The figure is written to ``'roc_auc_curve.png'`` in the current working
    directory.

    Parameters
    ----------
    clf
        Fitted classifier.
    X_data
        Features to score.
    y_data
        True labels for ``X_data``.
    """
    import matplotlib.pyplot as plt
    from sklearn import metrics

    # `metrics.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
    # in 1.2. Prefer its replacement and fall back to the old helper only on
    # releases that predate `RocCurveDisplay.from_estimator`.
    from_estimator = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
    if from_estimator is not None:
        from_estimator(clf, X_data, y_data)
    else:
        metrics.plot_roc_curve(clf, X_data, y_data)
    plt.savefig('roc_auc_curve.png')

In [16]:
def create_confusion_matrix_plot(clf, X_test, y_test):
    """Plot the confusion matrix of a fitted classifier and save it as a PNG.

    The figure is written to ``'confusion_matrix.png'`` in the current
    working directory.

    Parameters
    ----------
    clf
        Fitted classifier.
    X_test
        Features to predict on.
    y_test
        True labels for ``X_test``.
    """
    import matplotlib.pyplot as plt
    from sklearn import metrics

    # `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so importing it by name fails on current releases. Prefer its
    # replacement and fall back to the old helper only on releases that
    # predate `ConfusionMatrixDisplay.from_estimator`.
    from_estimator = getattr(metrics.ConfusionMatrixDisplay, 'from_estimator', None)
    if from_estimator is not None:
        from_estimator(clf, X_test, y_test)
    else:
        metrics.plot_confusion_matrix(clf, X_test, y_test)
    plt.savefig('confusion_matrix.png')

In [18]:
def hyper_parameter_tuning(X_train, y_train, n_iter=100, cv=5):
    """Tune a random forest with a randomized search and return the best model.

    Parameters
    ----------
    X_train
        Training features.
    y_train
        Training target. A single-column 2-D input (for example a one-column
        DataFrame) is flattened to 1-D before fitting.
    n_iter : int, default 100
        Number of parameter settings sampled by the search.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(model_tuned, best_params)``: the random forest refitted on the
        full training data with the best parameters, and the dict of those
        parameters.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    random_grid = {
        'n_estimators': [5, 21, 50, 100],  # Number of trees in RF
        # 'auto' was only an alias of 'sqrt' for classifiers and was removed
        # in scikit-learn 1.3, where it makes every fit that samples it fail.
        # 'log2' gives the search a second, genuinely different option.
        'max_features': ['sqrt', 'log2'],  # Number of features in consideration at every split
        'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)],  # Maximum number of levels allowed in each decision tree
        'min_samples_split': [2, 6, 10],  # Minimum sample number to split a node
        'min_samples_leaf': [1, 3, 4],  # Minimum sample number that can be stored in a leaf node
        'bootstrap': [True, False],  # Method used to sample data points
    }

    # scikit-learn emits a DataConversionWarning on every fit when y has
    # shape (n_samples, 1); flatten that case and leave other shapes alone.
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    model_tuning = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
                                      n_iter = n_iter, cv = cv, verbose = 2, random_state = 35, n_jobs = -1)
    model_tuning.fit(X_train, target)

    print('Random grid: ', random_grid, '\n')
    # Best parameters
    print('Best Parameters: ', model_tuning.best_params_, '\n')

    best_params = model_tuning.best_params_

    # With the default refit=True the search has already refitted a forest
    # with the best parameters on the whole training set, so reuse it rather
    # than training an identical model a second time.
    model_tuned = model_tuning.best_estimator_

    return model_tuned, best_params