# Set up

Python librairies imports :

In [1]:
# File system management
import os

# Data manipulation
import numpy as np
import pandas as pd

# Data algoritms
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn import metrics

from imblearn.over_sampling import SMOTE

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_theme(palette="Set1")

Define work location :

In [31]:
project_path = "/Users/victor/Documents/OPENCLASSROOMS/projet_7"

# Change directory
os.chdir(project_path)

Load data :

In [32]:
# Load data
data = pd.read_csv("data/processed/train_feature_engineering.csv", index_col=[0])

# Functions

# MLFlow

Set up of MLFlow tracking.

In [33]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts

# Create experiment
experiment_name = 'credit_scoring'
mlflow.set_experiment(experiment_name)
mlflow.end_run()

# Print paths
artifact_path = mlflow.get_artifact_uri()
uri = mlflow.tracking.get_tracking_uri()
print(artifact_path)
print(uri)

2023/01/04 17:25:19 INFO mlflow.tracking.fluent: Experiment with name 'credit_scoring' does not exist. Creating a new experiment.


file:///Users/victor/Documents/OPENCLASSROOMS/projet_7/mlruns/982388865821309393/a8a49621bb4d4816b0f211df5fc53c05/artifacts
file:///Users/victor/Documents/OPENCLASSROOMS/projet_7/mlruns


In [34]:
# Function to log model to mlflow
def log_mlflow(model, model_info):

    # Track params and metrics 
    with mlflow.start_run() as run:
        mlflow.set_tag("model_name", model_info['name'])
        mlflow.log_param("Train size", model_info['train_size'])
        mlflow.log_metric("AUC", model_info['score_auc'])

        # Save model to artifacts
        mlflow.sklearn.log_model(model, model_info['name'])

    mlflow.end_run()

# Data preparation

In [13]:
# Define target and features
TARGET = 'TARGET'
target = data[TARGET].to_numpy()
features = data.drop(columns='TARGET').to_numpy()
features_names = data.drop(columns='TARGET').columns.to_list()

In [14]:
# Check percentage of missing values
print(data.isna().sum().sum()/data.size*100)

23.70605835194158


In [15]:
# Create imputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit to data
imp_mean.fit(features)

# Transform data
features = imp_mean.transform(features)

In [16]:
# # Check class weights
display(data[TARGET].value_counts())

0    282682
1     24825
Name: TARGET, dtype: int64

In [17]:
# Split dataset
x_train, x_test, y_train, y_test = train_test_split(features, target, train_size=0.8, random_state=0)

In [None]:
# # Balance data
# sm_data_x, sm_data_y = SMOTE(random_state=0).fit_resample(x_train, y_train)
# sm_data_x = pd.DataFrame(data=sm_data_x, columns=features_names.columns)
# sm_data_y = pd.DataFrame(data=sm_data_y, columns=[TARGET])

# # Check balanced data
# print("length of oversampled data is ",len(sm_data_x))
# print("Number of no subscription in oversampled data",len(sm_data_y[sm_data_y[TARGET]==1]))
# print("Number of subscription",len(sm_data_y[sm_data_y[TARGET]==0]))
# print("Proportion of no subscription data in oversampled data is ",len(sm_data_y[sm_data_y[TARGET]==1])/len(sm_data_x))
# print("Proportion of subscription data in oversampled data is ",len(sm_data_y[sm_data_y[TARGET]==0])/len(sm_data_x))

# Classification

In [18]:
# Create logistic regression model
clf = LogisticRegression(random_state=0)

# Fit to data
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Use model to predict probability that given y value is 1
y_pred_proba = clf.predict_proba(x_test)[::,1]

# Calculate AUC of model
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Print AUC score
print(auc)

0.5257864106514176


In [37]:
# Create model info
clf_info = {'name': 'LogisticRegression',
            'train_size': x_train.size,
            'score_auc': auc}

# Log mlflow
log_mlflow(clf, clf_info)

- dummies? regression poly?
- randomforest
- xgboost
- lightGBM