# Task 2
customers house loan prediction

In [1]:
import os
import pandas as pd


In [2]:
original_data = pd.read_csv('data/customers_loan.csv')
del original_data['Unnamed: 0']
original_data.head()

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


In [3]:
original_data.dtypes

loan_status       object
Principal          int64
terms              int64
effective_date    object
due_date          object
age                int64
education         object
Gender            object
dtype: object

In [4]:
original_data["education"].value_counts()
# so apply lable encoding to education column

education
High School or Below    151
college                 149
Bechalor                 44
Master or Above           2
Name: count, dtype: int64

In [5]:
original_data["Gender"].value_counts()
# so that apply binay encoding to gender



Gender
male      294
female     52
Name: count, dtype: int64

In [6]:
# check null values
original_data.isnull().sum()

loan_status       0
Principal         0
terms             0
effective_date    0
due_date          0
age               0
education         0
Gender            0
dtype: int64

In [7]:
original_data["loan_status"].value_counts()
# so there is a class imbalance in the target variable

loan_status
PAIDOFF       260
COLLECTION     86
Name: count, dtype: int64

### Feature engineering

In [8]:
updated_df = original_data.copy()
# convert dates
updated_df['effective_date'] = pd.to_datetime(updated_df['effective_date'],format='%m/%d/%Y')
updated_df['due_date'] = pd.to_datetime(updated_df['due_date'],format='%m/%d/%Y')
updated_df.dtypes

loan_status               object
Principal                  int64
terms                      int64
effective_date    datetime64[ns]
due_date          datetime64[ns]
age                        int64
education                 object
Gender                    object
dtype: object

In [9]:
updated_df['DaysBetween'] = (updated_df['due_date'] - updated_df['effective_date']).dt.days
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29
...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59


In [10]:
# apply label encoding to education column
education_mapping = {
    'High School or Below': 0,
    'college': 1,
    'Bechalor': 2,
    'Master or Above': 3
}

# Apply the mapping to the 'education' column
updated_df['education_encoded'] = updated_df['education'].map(education_mapping)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,education_encoded
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,0
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,2
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,1
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,1
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,1
...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,0
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,0
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,1
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,1


In [11]:
# apply binary encoding to gender
updated_df['Gender_encoded'] = updated_df['Gender'].apply(lambda x: 1 if x == 'male' else 0)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,education_encoded,Gender_encoded
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,0,1
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,2,0
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,1,1
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,1,0
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,1,1
...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,0,1
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,0,1
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,1,1
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,1,1


In [12]:
# min max norm :: Principal	terms    age	DaysBetween
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
columns_to_normalize = ['Principal', 'terms', 'age', 'DaysBetween']
scaler = MinMaxScaler()
updated_df[columns_to_normalize] = scaler.fit_transform(updated_df[columns_to_normalize])
updated_df


Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,education_encoded,Gender_encoded
0,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.818182,High School or Below,male,0.433962,0,1
1,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.454545,Bechalor,female,0.433962,2,0
2,PAIDOFF,1.000000,0.347826,2016-09-08,2016-09-22,0.272727,college,male,0.150943,1,1
3,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.303030,college,female,0.433962,1,0
4,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.333333,college,male,0.433962,1,1
...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,0.714286,0.347826,2016-09-11,2016-09-25,0.424242,High School or Below,male,0.150943,0,1
342,COLLECTION,1.000000,1.000000,2016-09-11,2016-10-10,0.212121,High School or Below,male,0.433962,0,1
343,COLLECTION,0.714286,0.347826,2016-09-12,2016-09-26,0.636364,college,male,0.150943,1,1
344,COLLECTION,1.000000,1.000000,2016-09-12,2016-11-10,0.303030,college,male,1.000000,1,1


In [13]:
# apply binary encoding to loan_status
updated_df['loan_status_encoded'] = updated_df['loan_status'].apply(lambda x: 1 if x == 'PAIDOFF' else 0)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,education_encoded,Gender_encoded,loan_status_encoded
0,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.818182,High School or Below,male,0.433962,0,1,1
1,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.454545,Bechalor,female,0.433962,2,0,1
2,PAIDOFF,1.000000,0.347826,2016-09-08,2016-09-22,0.272727,college,male,0.150943,1,1,1
3,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.303030,college,female,0.433962,1,0,1
4,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.333333,college,male,0.433962,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,0.714286,0.347826,2016-09-11,2016-09-25,0.424242,High School or Below,male,0.150943,0,1,0
342,COLLECTION,1.000000,1.000000,2016-09-11,2016-10-10,0.212121,High School or Below,male,0.433962,0,1,0
343,COLLECTION,0.714286,0.347826,2016-09-12,2016-09-26,0.636364,college,male,0.150943,1,1,0
344,COLLECTION,1.000000,1.000000,2016-09-12,2016-11-10,0.303030,college,male,1.000000,1,1,0


In [14]:
selected_features = updated_df[["loan_status_encoded",'Principal', 'terms', 'age', 'DaysBetween', 'education_encoded', "Gender_encoded"]]
selected_features.head()

Unnamed: 0,loan_status_encoded,Principal,terms,age,DaysBetween,education_encoded,Gender_encoded
0,1,1.0,1.0,0.818182,0.433962,0,1
1,1,1.0,1.0,0.454545,0.433962,2,0
2,1,1.0,0.347826,0.272727,0.150943,1,1
3,1,1.0,1.0,0.30303,0.433962,1,0
4,1,1.0,1.0,0.333333,0.433962,1,1


In [16]:
# apply oversampling to balance the classes
from imblearn.over_sampling import RandomOverSampler
X = selected_features.drop('loan_status_encoded', axis=1)
y = selected_features['loan_status_encoded']
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)
resampled_counts = y_resampled.value_counts()
resampled_counts

loan_status_encoded
1    260
0    260
Name: count, dtype: int64

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
# train test split
X = df_resampled.drop(columns=['loan_status_encoded'])
y = df_resampled['loan_status_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model on the training data
logreg_model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Add predictions to the test_data for inspection
test_data['Predicted_Outcome'] = y_pred

# Print first few rows of test data with predictions
print(test_data.head())


Accuracy: 0.5865
Confusion Matrix:
[[33 15]
 [28 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.69      0.61        48
           1       0.65      0.50      0.57        56

    accuracy                           0.59       104
   macro avg       0.60      0.59      0.59       104
weighted avg       0.60      0.59      0.58       104

     Principal     terms       age  DaysBetween  education_encoded  \
275        1.0  1.000000  0.575758     0.433962                  1   
93         1.0  1.000000  0.303030     0.433962                  1   
6          1.0  1.000000  0.303030     0.433962                  1   
167        1.0  0.347826  0.666667     0.150943                  1   
90         1.0  1.000000  0.272727     1.000000                  1   

     Gender_encoded  loan_status_encoded  Predicted_Outcome  
275               1                    0                  0  
93                1                    1           

### Random forest

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)  # You can tune n_estimators

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Add predictions to the test_data for inspection
test_data['Predicted_Outcome'] = y_pred

# Print first few rows of test data with predictions
print(test_data.head())


Accuracy: 0.7404
Confusion Matrix:
[[41  7]
 [20 36]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.85      0.75        48
           1       0.84      0.64      0.73        56

    accuracy                           0.74       104
   macro avg       0.75      0.75      0.74       104
weighted avg       0.76      0.74      0.74       104

     Principal     terms       age  DaysBetween  education_encoded  \
275        1.0  1.000000  0.575758     0.433962                  1   
93         1.0  1.000000  0.303030     0.433962                  1   
6          1.0  1.000000  0.303030     0.433962                  1   
167        1.0  0.347826  0.666667     0.150943                  1   
90         1.0  1.000000  0.272727     1.000000                  1   

     Gender_encoded  loan_status_encoded  Predicted_Outcome  
275               1                    0                  1  
93                1                    1           

In [25]:
import mlflow

# Start an MLflow run
with mlflow.start_run(run_name="test-akalanka"):
    mlflow.log_param("param1", 5)
    mlflow.log_metric("metric1", 0.89)

    # Example: Save a model artifact (optional)
    # mlflow.sklearn.log_model(model, "model")

print("Run information has been logged in MLflow.")


Run information has been logged in MLflow.
