In [1]:
# Importer
import classes.utils as utils
from classes.splitter import Splitter


utils.set_parent_directory_as_working_directory()

# TODO: Move this to a config file
# Importing
DATA_FOLDER = "./data"


FE_DATA_PATH = DATA_FOLDER +'/fe_data.csv'
DATES_DATA_PATH = DATA_FOLDER +'/dates_data.csv'


# 0 Introduction
In this notebook we will develop our first model. We are going to assume that we have 1 year of loans that have finished (finished_d = issued date + total length of loan), which basically place ourselves on 2011-05-01.


# 1 Splitting data
Before we get hands on with the modelling, we need to split the data into train and test sets. As we mentioned in the preprocessing notebook we will be using the create variable  'finished_d' to 

We will use the train set to train the model and the test set to evaluate the model. We will use the train_test_split function from sklearn to split the data. We will use 80% of the data for training and 20% for testing.

In [2]:
splitter_name = "splitter"

splitter = Splitter(
    name = splitter_name
    , data_path = FE_DATA_PATH
    , date_cols = []
    , target_variable = 'loan_status'
    , destination_directory = DATA_FOLDER
    , dates_data_path = DATES_DATA_PATH
    , column_to_split_by = 'finished_d'
    , test_size = 0.3
    , random_state = 47
)

splitter.execute()
splitter.split_data_filtered(number_of_months=12)


Data loaded from ./data/fe_data.csv
Dates data loaded from ./data/dates_data.csv
Date column finished_d added to the data
Data filtered by 12 months


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.drop(columns=self.column_to_split_by, inplace=True)


Test and train attributes defined 0.3.
        Test size: 597
        Train size: 1393


This object contains x_train, x_test, y_train and y_test, for the first 12 months, later we can try with more months bu just changing "number_of_months".

# 2 Modelling first year of data

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import numpy as np


def random_grid_search(model, param_distributions, X_train, y_train,  random_state):
    """
    Perform a random grid search using scikit-learn's RandomizedSearchCV class.

    Parameters:
    - model: a scikit-learn model object
    - param_distributions: a dictionary containing hyperparameter names and distributions
    - X_train: training input data
    - y_train: training output data
    - n_iter: number of parameter settings that are sampled
    - cv: number of cross-validation folds

    Returns:
    - best_estimator: the best estimator found during the search
    - best_params: the hyperparameters of the best estimator
    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state= random_state)
    # Create a RandomizedSearchCV object

    search = RandomizedSearchCV(
        model, param_distributions, cv=cv, scoring='recall', n_iter=10)

    # Fit the RandomizedSearchCV object to the data
    search.fit(X_train, y_train)

    # Print the best parameters and score
    print("Best parameters: {}".format(search.best_params_))
    print("Best cross-validation score: {:.2f}".format(search.best_score_))

    # Get the best estimator and its parameters
    best_estimator = search.best_estimator_
    best_params = search.best_params_

    return best_estimator, best_params

def cross_val_train_predict(model, X_train, y_train, X_test, scoring_metric, seed=SEED):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
    scores = cross_val_score(model, X_train, y_train,
                             scoring=scoring_metric, cv=cv, n_jobs=-1)
    print('Mean Recall in Train: %.3f (%.3f)' %
          (np.mean(scores), np.std(scores)))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred




In [7]:
import mlflow
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def set_experiment(experiment_name):
    # Set the experiment name and tracking URI
    mlflow.set_experiment(experiment_name)
    mlflow.set_tracking_uri('http://localhost:5000')
    client = mlflow.tracking.MlflowClient()
    experiment = mlflow.get_experiment_by_name(experiment_name)
    run = client.create_run(experiment.experiment_id)




def run_experiment(experiment_name
                   , model_class
                   , params
                   , splitter
                   , log_model=False
                   , params_distributions=None
                   , random_satate=47):
    
    set_experiment(experiment_name)
    mlflow.sklearn.autolog(log_models=log_model)
    
    # Log the evaluation metrics and model parameters in MLflow
    with mlflow.start_run():
        model = model_class(**params)  # create a new instance of the model class
        model.fit(splitter.X_train, splitter.y_train)
        y_pred = model.predict(splitter.X_test)




run_experiment('First year'
               , DecisionTreeClassifier
               , {'max_depth': 5
                  , 'random_state' : 47}
               , splitter
               )




In [10]:
def delete_experiment(experiment_name):
    client = mlflow.tracking.MlflowClient()
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Checks if the experiment exists
    if experiment is None:
        print(f"Experiment '{experiment_name}' does not exist.")
        return

    experiment = mlflow.get_experiment_by_name(experiment_name)
    client.delete_experiment(experiment.experiment_id)

delete_experiment('base model')

RestException: RESOURCE_DOES_NOT_EXIST: No Experiment with id=1 exists