In [1]:
from azureml.core import Workspace, Datastore, Dataset,Experiment
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Connect to the Azure ML workspace using the configuration found at ./config
ws = Workspace.from_config(path="./config")
# Look up the datastore registered under the name "azuremlds01"
az_store = Datastore.get(workspace=ws, datastore_name="azuremlds01")

In [3]:
# Location of the CSV file inside the datastore
csv_path = [(az_store, "Loan+Approval+Prediction.csv")]

# Build a tabular dataset from the CSV and register it in the workspace;
# create_new_version=True registers it as a new version under this name
loan_dataset = Dataset.Tabular.from_delimited_files(path=csv_path).register(
    workspace=ws,
    name="Loan Applications Using SDK",
    create_new_version=True,
)

# Load the registered dataset into a pandas DataFrame
df = loan_dataset.to_pandas_dataframe()

In [5]:
# Handle to the experiment 'Loan-sdk-Exp01' in this workspace; the runs started
# below are grouped under it
experiment = Experiment(workspace = ws, name = 'Loan-sdk-Exp01')

In [6]:
# Start an interactive run; metrics logged through new_run are attached to it.
# NOTE(review): each execution of this cell starts another run, and the run
# listing at the end of the notebook shows earlier runs still in "Running"
# status -- presumably they were never completed. Make sure new_run.complete()
# is reached for every run that is started.
new_run = experiment.start_logging()

In [7]:
# Work on a copy so the frame loaded from the dataset (df) stays unmodified
loan = df.copy()

In [8]:
# Inspect the dtype of every column in the working copy
loan.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status             bool
dtype: object

In [9]:
# Number of rows in the working copy
total_observation = loan.shape[0]
# Count of missing values in each column
null_df = loan.isna().sum()

In [10]:
# Record the dataset size as a run metric
new_run.log("Total Observations", total_observation)
# Record each column's missing-value count, using the column name as the metric name
for col, n_missing in null_df.items():
    new_run.log(col, n_missing)

In [11]:
# Display the per-column count of missing values (the same quantity held in null_df)
loan.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     2
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
# (rows, columns) of the working copy
loan.shape

(614, 13)

In [13]:
# Feature frame: every column except the target
X = loan.drop(columns=['Loan_Status'])
# Target, kept as a single-column DataFrame
y = loan[['Loan_Status']]

In [14]:
# Shapes of the feature frame and the target frame
X.shape , y.shape

((614, 12), (614, 1))

In [15]:
def clean_missing_data(df):
    """Drop unused columns and impute missing values, without touching the input.

    The columns 'Loan_ID', 'Gender' and 'Property_Area' are removed. In the
    remaining frame, missing values in numeric columns are replaced by the
    column mean, and missing values in object/category/bool columns are
    replaced by the column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left unchanged, so callers must use the
        return value.

    Raises
    ------
    KeyError
        If any of the three columns to drop is absent from ``df``.
    """
    # drop() without inplace returns a new frame, so the caller's object is
    # never mutated.
    cleaned = df.drop(columns=['Loan_ID', 'Gender', 'Property_Area'])

    numerical_col = cleaned.select_dtypes(include=['number']).columns
    categorical_col = cleaned.select_dtypes(include=['object', 'category', 'bool']).columns

    for i in numerical_col:
        cleaned[i] = cleaned[i].fillna(cleaned[i].mean())

    for j in categorical_col:
        modes = cleaned[j].mode()
        # mode() is empty when the whole column is missing; there is nothing to
        # impute with in that case, so the column is left as it is instead of
        # failing on modes[0].
        if not modes.empty:
            cleaned[j] = cleaned[j].fillna(modes[0])

    return cleaned
    
    

In [16]:
# Rebind X to its cleaned version (unused columns dropped, missing values imputed).
# NOTE(review): re-running this cell on the already-cleaned X raises KeyError,
# because the columns that clean_missing_data drops are no longer present.
X = clean_missing_data(X)
# Display the cleaned feature frame
X

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,False,0,Graduate,False,5849,0.0,146.412162,360.0,1.0
1,True,1,Graduate,False,4583,1508.0,128.000000,360.0,1.0
2,True,0,Graduate,True,3000,0.0,66.000000,360.0,1.0
3,True,0,Not Graduate,False,2583,2358.0,120.000000,360.0,1.0
4,False,0,Graduate,False,6000,0.0,141.000000,360.0,1.0
...,...,...,...,...,...,...,...,...,...
609,False,0,Graduate,False,2900,0.0,71.000000,360.0,1.0
610,True,3+,Graduate,False,4106,0.0,40.000000,180.0,1.0
611,True,1,Graduate,False,8072,240.0,253.000000,360.0,1.0
612,True,2,Graduate,False,7583,0.0,187.000000,360.0,1.0


In [17]:
# Per-column count of missing values remaining in X after cleaning
X.isnull().sum()

Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
dtype: int64

In [18]:
def preprocessing(df):
    """Min-max scale numeric columns and one-hot encode the remaining ones.

    Numeric columns are scaled with a ``MinMaxScaler`` fitted on ``df`` itself.
    Object/bool/category columns are expanded with ``pd.get_dummies`` using
    ``drop_first=True`` (the first level of each column is dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with scaled numeric columns followed by the dummy columns.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in and is not returned,
    so the exact same scaling cannot be re-applied to other data by this
    function alone.
    """
    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()

    num_col = df.select_dtypes(include=['number']).columns
    cat_col = df.select_dtypes(include=['object', 'bool', 'category']).columns

    # Fitting a scaler on a frame with zero columns raises, so only scale when
    # there is something to scale.
    if len(num_col) > 0:
        scaler = MinMaxScaler()
        df[num_col] = scaler.fit_transform(df[num_col])

    df = pd.get_dummies(df, columns=cat_col, drop_first=True)

    return df

In [19]:
# Rebind X to the scaled and one-hot encoded feature frame
X = preprocessing(X)

In [20]:
# Display the encoded feature frame
X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_True,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_True
0,0.070489,0.000000,0.198860,0.743590,1.0,0,0,0,0,0,0
1,0.054830,0.036192,0.172214,0.743590,1.0,1,1,0,0,0,0
2,0.035250,0.000000,0.082489,0.743590,1.0,1,0,0,0,0,1
3,0.030093,0.056592,0.160637,0.743590,1.0,1,0,0,0,1,0
4,0.072356,0.000000,0.191027,0.743590,1.0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
609,0.034014,0.000000,0.089725,0.743590,1.0,0,0,0,0,0,0
610,0.048930,0.000000,0.044863,0.358974,1.0,1,0,0,1,0,0
611,0.097984,0.005760,0.353111,0.743590,1.0,1,1,0,0,0,0
612,0.091936,0.000000,0.257598,0.743590,1.0,1,0,1,0,0,0


In [21]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 30% of the rows for evaluation; stratifying on y keeps the target's
# class proportions the same in both splits. The seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
    stratify=y,
)


In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Random forest with a fixed seed so training is repeatable
rfc = RandomForestClassifier(random_state=123)
# Y_train is a single-column DataFrame; scikit-learn expects a 1-D target and
# emits DataConversionWarning for a column vector, so flatten it explicitly.
trained_model = rfc.fit(X_train, Y_train.values.ravel())
# Class predictions for the held-out rows
Y_predict = rfc.predict(X_test)

  


In [27]:
# Probability from column 1 of predict_proba's output for each test row
# (presumably the Loan_Status == True class, since columns follow rfc.classes_ -- confirm)
Y_prob = rfc.predict_proba(X_test)[:, 1]

In [28]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split
cm    = confusion_matrix(Y_test, Y_predict)
# Score of the fitted classifier on the test split (logged as "accuracy" below)
score = rfc.score(X_test, Y_test)

In [29]:
# Log the test-split score to the run under the metric name "accuracy"
new_run.log("accuracy", score)

In [30]:
# Column names of the encoded feature frame; saved together with the model below
train_enc_cols = X.columns

In [31]:
import os

import joblib

model_file = './outputs/models.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_file), exist_ok=True)

# Persist the encoded column names together with the fitted model in one file
joblib.dump(value=[train_enc_cols, trained_model],
            filename=model_file)

['./outputs/models.pkl']

In [32]:
# Mark the run started above as completed
new_run.complete()

In [33]:
# List every run recorded under this experiment (the output shows each run's Id and Status)
list(experiment.get_runs())


[Run(Experiment: Loan-sdk-Exp01,
 Id: ba0ebbdf-e6ea-475b-b2a4-8479a596cf87,
 Type: None,
 Status: Completed),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 0d4617f7-e535-4339-a7a6-9a04f9f09750,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 1b18bc23-9cf7-4a05-b044-6747ec055141,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 6ee16aec-23dd-4db5-95ac-0e107a235332,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 446f30e7-d72b-47a6-aac9-b339ac91164c,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 23d57c47-8a23-4f86-8466-9445efee9bd7,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: bbcbafd6-add2-43ee-8bf0-15bfb2ab328c,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 2743cb9f-7e14-4e88-8222-ccbb47426b4c,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-Exp01,
 Id: 3aea2eae-60d7-4c26-9726-12ad480f5a7c,
 Type: None,
 Status: Running),
 Run(Experiment: Loan-sdk-