### Create handle to workspace

In [1]:
import pandas as pd
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# authenticate
credential = DefaultAzureCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="91b522a4-4851-4a71-8909-1acedddd997b",
    resource_group_name="smart_credit",
    workspace_name="smartcreditwkspc",
)

### Upload data to cloud storage

In [2]:
# Load data
rejected_data = pd.read_csv('https://smartcreditstrage.blob.core.windows.net/smartcontainer/rejected_2007_to_2018Q4.csv')
accepted_data = pd.read_csv('https://smartcreditstrage.blob.core.windows.net/smartcontainer/accepted_2007_to_2018Q4.csv')
    

# Label datasets: 1 for accepted and 0 for rejected
accepted_data['label'] = 1
rejected_data['label'] = 0

# Rename columns in rejected dataset to match accepted dataset
rejected_data.rename(columns={'Amount Requested': 'loan_amnt', 'Debt-To-Income Ratio': 'dti'}, inplace=True)

# Select and align columns for merging
selected_columns = ['home_ownership', 'annual_inc', 'loan_amnt', 'dti', 'emp_length', 'label']
for column in selected_columns:
    if column not in rejected_data.columns:
        rejected_data[column] = None

# Merge datasets
merged_data = pd.concat([accepted_data[selected_columns], rejected_data[selected_columns]], axis=0).reset_index(drop=True)


  accepted_data = pd.read_csv('https://smartcreditstrage.blob.core.windows.net/smartcontainer/accepted_2007_to_2018Q4.csv')


## Preprocessing

In [3]:
# Handle missing values
merged_data.dropna(how='all', inplace=True)

for column in ['home_ownership', 'emp_length']:
    merged_data[column].fillna(merged_data[column].mode()[0], inplace=True)
for column in ['annual_inc', 'loan_amnt']:
    merged_data[column].fillna(merged_data[column].median(), inplace=True)

# Convert dti values to numerical format and handle missing values
merged_data['dti'] = pd.to_numeric(merged_data['dti'], errors='coerce')
merged_data['dti'].fillna(merged_data['dti'].median(), inplace=True)
merged_data.head()

Unnamed: 0,home_ownership,annual_inc,loan_amnt,dti,emp_length,label
0,MORTGAGE,55000.0,3600.0,5.91,10+ years,1
1,MORTGAGE,65000.0,24700.0,16.06,10+ years,1
2,MORTGAGE,63000.0,20000.0,10.78,10+ years,1
3,MORTGAGE,110000.0,35000.0,17.06,10+ years,1
4,MORTGAGE,104433.0,10400.0,25.37,3 years,1


In [4]:
import os
import argparse
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import joblib
from azureml.core import Experiment, Workspace

In [5]:

label_encoder=LabelEncoder()
merged_data['home_ownership'] = label_encoder.fit_transform(merged_data['home_ownership'])
merged_data['emp_length'] = label_encoder.fit_transform(merged_data['emp_length'])

merged_data.head()

Unnamed: 0,annual_inc,loan_amnt,dti,label,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,...,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,55000.0,3600.0,5.91,1,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,65000.0,24700.0,16.06,1,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,63000.0,20000.0,10.78,1,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,110000.0,35000.0,17.06,1,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,104433.0,10400.0,25.37,1,False,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


This code checks the data type of each column and converts the columns with boolean values to integers (1 for True and 0 for False).

In [7]:
# Shuffle the DataFrame
merged_data_shuffled = merged_data.sample(frac=1, random_state=42)  # Setting a random seed for reproducibility

# Reset the index of the shuffled DataFrame
merged_data_shuffled = merged_data_shuffled.reset_index(drop=True)

In [8]:
# Splitting data into test and train

X = merged_data_shuffled.drop(columns=['label']) 
Y = merged_data_shuffled['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:
print(X_train[:5])

          annual_inc  loan_amnt    dti  home_ownership_ANY  \
1032342      65000.0    22000.0  17.84                   0   
5041893      65000.0     3300.0  17.84                   0   
8100913      65000.0    30000.0  17.84                   0   
6503001      65000.0     5000.0  17.84                   0   
28753842     65000.0    15000.0  17.84                   0   

          home_ownership_MORTGAGE  home_ownership_NONE  home_ownership_OTHER  \
1032342                         1                    0                     0   
5041893                         1                    0                     0   
8100913                         1                    0                     0   
6503001                         1                    0                     0   
28753842                        1                    0                     0   

          home_ownership_OWN  home_ownership_RENT  emp_length_1 year  \
1032342                    0                    0                  0   
50

In [11]:
# set name for logging
mlflow.set_experiment("Smart credit")
# enable autologging with MLflow
mlflow.sklearn.autolog()

In [15]:
# 3. Model Training and Evaluation

# Train logistic regression model

mlflow.start_run()

classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(X_train, y_train)

# Evaluate model
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))
# Stop logging for this model
mlflow.end_run()



              precision    recall  f1-score   support

           0       0.99      1.00      0.99   5530092
           1       1.00      0.87      0.93    451797

    accuracy                           0.99   5981889
   macro avg       0.99      0.93      0.96   5981889
weighted avg       0.99      0.99      0.99   5981889



In [16]:
# Train Gradient Boosting Classifier

mlflow.start_run()
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
# Stop logging for this model
mlflow.end_run()



In [None]:
# Train  AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

mlflow.start_run()
ada = AdaBoostClassifier()

ada.fit(X_train, y_train)

y_pred = ada.predict(X_test)

print(classification_report(y_test, y_pred))
# Stop logging for this model
mlflow.end_run()

home_ownership : 'MORTGAGE', 'RENT', 'OWN'
annual_inc = np.random.uniform(20000, 150000)  # Random income between 20k and 150k
loan_amnt = np.random.uniform(500, 40000)  # Random loan amount between $500 and $40k
dti = np.random.uniform(0.1, 0.9)  # Random dti between 10% and 90%
emp_length = np.random.choice(['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years'
,6 years', '7 years', '8 years', '9 years', '10+ years'])