In [1]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Undersampling:
    
Undersampling involves randomly removing samples from the majority class to achieve a balanced class distribution.
One common method is random undersampling, where you randomly select a subset of the majority class samples.


In [2]:
df = pd.read_csv('12_removed_all_nan_with_mean.csv', low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137386 entries, 0 to 137385
Data columns (total 34 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   census_tract                              137386 non-null  float64
 1   action_taken                              137386 non-null  int64  
 2   loan_type                                 137386 non-null  int64  
 3   lien_status                               137386 non-null  int64  
 4   reverse_mortgage                          137386 non-null  int64  
 5   open_end_line_of_credit                   137386 non-null  int64  
 6   loan_amount                               137386 non-null  int64  
 7   combined_loan_to_value_ratio              137386 non-null  float64
 8   interest_rate                             137386 non-null  float64
 9   total_loan_costs                          137386 non-null  float64
 10  origination_charges 

In [4]:
X = df.drop('action_taken', axis=1)
y = df['action_taken']

In [5]:
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Apply Random Undersampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

# Logistic regression

In [7]:
# Train a logistic regression model on the resampled training data
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

In [8]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [9]:
# Calculate and print the classification report
report = classification_report(y_test, y_pred, zero_division=1)

print(report)

              precision    recall  f1-score   support

           1       1.00      0.00      0.00     22061
           3       0.20      1.00      0.33      5417

    accuracy                           0.20     27478
   macro avg       0.60      0.50      0.16     27478
weighted avg       0.84      0.20      0.06     27478



# Conclusion

class is now predicted, but 1 less so. Not good!

# Logistic regression with regularisation

In [12]:
unique_classes = df['action_taken'].unique()
print(unique_classes)

[1 3]


In [11]:
# Apply Random Undersampling
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

# Create a logistic regression model
model = LogisticRegression()

# Fit the model on the resampled training data
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate and print the classification report
report = classification_report(y_test, y_pred, zero_division=1)
print(report)


              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       1.00      0.00      0.00   22061.0
           3       1.00      0.00      0.00    5417.0

    accuracy                           1.00   27478.0
   macro avg       0.67      0.33      0.00   27478.0
weighted avg       1.00      0.00      0.00   27478.0

              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       1.00      0.00      0.00   22061.0
           3       1.00      0.00      0.00    5417.0

    accuracy                           1.00   27478.0
   macro avg       0.67      0.33      0.00   27478.0
weighted avg       1.00      0.00      0.00   27478.0

AUC-ROC: 0.49620297361846066
Precision-Recall AUC: 0.8064112566873878


In [None]:
# Create a regularized logistic regression model with L2 regularization
logistic_reg = LogisticRegression(penalty='l2', C=1.0)

# Fit the model on the resampled training data
logistic_reg.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_prob = logistic_reg.predict_proba(X_test)[:, 1]

# Calculate and print the classification report
report = classification_report(y_test, (y_prob > 0.5).astype(int), zero_division=1)
print(report)

# Calculate AUC-ROC
roc_auc = roc_auc_score(y_test, y_prob)
print("AUC-ROC:", roc_auc)

# Calculate Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_test, y_prob, pos_label=1)
pr_auc = auc(recall, precision)
print("Precision-Recall AUC:", pr_auc)


# Conclusion

It seems that the model is performing poorly in terms of precision, recall, and F1-score, especially for classes 0 and 3. 
The extremely low values for precision, recall, and F1-score for class 0 indicate that the model is unable to make 
meaningful predictions for that class, likely due to a lack of true samples in the test set.

The AUC-ROC value of 0.9177 indicates that the model's ability to distinguish between positive and negative instances 
is relatively good, but this metric can be misleading when dealing with imbalanced datasets.

The Precision-Recall AUC of 0.6182 suggests that the model's performance in terms of precision and recall trade-off is 
suboptimal, and there is room for improvement.