# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('https://s3.amazonaws.com/udacity-hosted-downloads/ud651/prosperLoanData.csv')

In [3]:
df.shape

(113937, 81)

# Data Cleaning 

# Cheacking Duplicate rows in the dataset

In [4]:
null_values = df.isnull().sum()

# Print the null values for each column
for column, value in null_values.items():
    print(f"{column}:{value}")

ListingKey:0
ListingNumber:0
ListingCreationDate:0
CreditGrade:84984
Term:0
LoanStatus:0
ClosedDate:58848
BorrowerAPR:25
BorrowerRate:0
LenderYield:0
EstimatedEffectiveYield:29084
EstimatedLoss:29084
EstimatedReturn:29084
ProsperRating (numeric):29084
ProsperRating (Alpha):29084
ProsperScore:29084
ListingCategory (numeric):0
BorrowerState:5515
Occupation:3588
EmploymentStatus:2255
EmploymentStatusDuration:7625
IsBorrowerHomeowner:0
CurrentlyInGroup:0
GroupKey:100596
DateCreditPulled:0
CreditScoreRangeLower:591
CreditScoreRangeUpper:591
FirstRecordedCreditLine:697
CurrentCreditLines:7604
OpenCreditLines:7604
TotalCreditLinespast7years:697
OpenRevolvingAccounts:0
OpenRevolvingMonthlyPayment:0
InquiriesLast6Months:697
TotalInquiries:1159
CurrentDelinquencies:697
AmountDelinquent:7622
DelinquenciesLast7Years:990
PublicRecordsLast10Years:697
PublicRecordsLast12Months:7604
RevolvingCreditBalance:7604
BankcardUtilization:7604
AvailableBankcardCredit:7544
TotalTrades:7544
TradesNeverDelinquent

In [5]:
# Data cleaning
duplicate_rows = df[df.duplicated()]

# Print the duplicate rows, if any
if not duplicate_rows.empty:
    print("Duplicate Rows:")
    print(duplicate_rows)
else:
    print("No duplicate Data found.")


No duplicate Data found.


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 81 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingKey                           113937 non-null  object 
 1   ListingNumber                        113937 non-null  int64  
 2   ListingCreationDate                  113937 non-null  object 
 3   CreditGrade                          28953 non-null   object 
 4   Term                                 113937 non-null  int64  
 5   LoanStatus                           113937 non-null  object 
 6   ClosedDate                           55089 non-null   object 
 7   BorrowerAPR                          113912 non-null  float64
 8   BorrowerRate                         113937 non-null  float64
 9   LenderYield                          113937 non-null  float64
 10  EstimatedEffectiveYield              84853 non-null   float64
 11  EstimatedLoss

In [7]:
date_columns = ["ClosedDate", "LoanOriginationDate", "ListingCreationDate"]

# Drop rows where any of the specified columns are null

# Convert specified columns to datetime format
for column in date_columns:
    df[column] = pd.to_datetime(df[column], errors='coerce')
df.shape

(113937, 81)

# Loan Default Status

In [8]:
df['defaultstatus'] = df.apply(
    lambda row: 1 if not pd.isna(row['ClosedDate']) and row['LoanCurrentDaysDelinquent'] > 180 else 0, 
    axis=1
)

In [9]:
value_counts = df['defaultstatus'].value_counts()
count_zeros = value_counts.get(0, 0)
count_ones = value_counts.get(1, 0)

print(f"Number of 0's in defaultstatus: {count_zeros}")
print(f"Number of 1's in defaultstatus: {count_ones}")

Number of 0's in defaultstatus: 98874
Number of 1's in defaultstatus: 15063


# LoanTenure

In [10]:
df["LoanTenure"] = ((df["ClosedDate"].dt.year - df["LoanOriginationDate"].dt.year) * 12) - (df["ClosedDate"].dt.month - df["LoanOriginationDate"].dt.month)

In [11]:
df["LoanTenure"].describe()

count    55069.000000
mean        19.637037
std         14.448609
min        -27.000000
25%         10.000000
50%         18.000000
75%         35.000000
max         59.000000
Name: LoanTenure, dtype: float64

In [12]:
df.drop(date_columns, axis=1, inplace=True)
df.drop("LoanTenure", axis=1, inplace=True)
df.rename(columns={"Term" : "LoanTenure"}, inplace=True)

# EMI

In [13]:
emi_cols = ["LP_CustomerPayments", "LP_CustomerPrincipalPayments"]

In [14]:
def cal_EMI(P, r, n):
  P = P.values
  r = r.values
  n = n.values
  #print(P.shape[0])
  result_1 = np.empty(0)
  result_2 = np.empty(0)
  result = np.empty(0)
  for i in range(P.shape[0]):
    #print(P[i])
    #print(r[i])
    #print(n[i])
    # EMI = P × r × (1 + r) ^ n / ((1 + r) ^ n – 1)
    #print(P[i] * (1 + r[i]))
    result_1 = np.append(result_1, P[i] * r[i] * np.power((1 + r[i]),n[i]))
    result_2 = np.append(result_2, np.power((1 + r[i]),n[i]) - 1)
    result = np.append(result, (result_1[i] / result_2[i]))

  return result


In [15]:
df["EMI"] = cal_EMI(df["LP_CustomerPrincipalPayments"], df["BorrowerRate"], df["LoanTenure"])

print(df["EMI"].describe())

count    113929.000000
mean        558.230883
std         720.251611
min           0.000000
25%          90.728391
50%         287.662239
75%         739.577224
max        8780.054436
Name: EMI, dtype: float64


In [16]:
print(df["EMI"])

0         1496.764570
1            0.000000
2          825.406311
3          413.021192
4          326.288414
             ...     
113932     418.123199
113933     226.812921
113934      39.377582
113935    3907.503625
113936       5.053186
Name: EMI, Length: 113937, dtype: float64


# Eligible Loan Amount

In [17]:
ela_cols = ['DebtToIncomeRatio', 'IncomeRange', 'IncomeVerifiable',
       'StatedMonthlyIncome']

df[ela_cols].head()

Unnamed: 0,DebtToIncomeRatio,IncomeRange,IncomeVerifiable,StatedMonthlyIncome
0,0.17,"$25,000-49,999",True,3083.333333
1,0.18,"$50,000-74,999",True,6125.0
2,0.06,Not displayed,True,2083.333333
3,0.15,"$25,000-49,999",True,2875.0
4,0.26,"$100,000+",True,9583.333333


In [18]:
df_new = df[df['IncomeVerifiable'] == True]

print(df_new.shape)
print(df.shape[0] - df_new.shape[0])

(105268, 80)
8669


In [19]:
# Columns to be used for eligibility calculation
ela_cols = ['DebtToIncomeRatio', 'IncomeRange', 'IncomeVerifiable', 'StatedMonthlyIncome']

# Filter data for only income verifiable entries
df = df[df['IncomeVerifiable'] == True]

# Function to calculate Eligible Loan Amount (ELA)
def eligible_loan_amnt(df):
    # Calculate Available Income
    ava_inc = df['StatedMonthlyIncome'] * 12 * 0.3 * df['LoanTenure']
    # Calculate Total Loan Amount
    total_loan_amnt = np.round(df['LoanOriginalAmount'] + (df['LoanOriginalAmount'] * df['BorrowerRate']) * df['LoanTenure'])
    # Calculate ELA
    ela = np.where(ava_inc <= 0, 0, np.where(total_loan_amnt <= ava_inc, total_loan_amnt, ava_inc))
    return ela


In [20]:
df['ELA'] = eligible_loan_amnt(df)

df['ELA'].describe()

count    105268.000000
mean      72332.802069
std       60883.574093
min           0.000000
25%       28660.000000
50%       50066.000000
75%       98408.000000
max      416990.000000
Name: ELA, dtype: float64

In [21]:
df.shape

(105268, 81)

In [22]:
def plot_by_woe(df_WoE, rotation_of_x_axis_labels=0):
    x = np.array(df_WoE.iloc[:, 0].apply(str))
    y = df_WoE['WoE']
    
    # Calculate ROI
    positive_woe = df_WoE[df_WoE['Outcome'] == 1]['WoE'].values
    negative_woe = df_WoE[df_WoE['Outcome'] == 0]['WoE'].values
    roi = positive_woe - negative_woe
    
    plt.figure(figsize=(18, 6))
    plt.plot(x, roi, marker='o', linestyle='--', color='k')
    plt.xlabel(df_WoE.columns[0])
    plt.ylabel('Preferred Return on Investment')
    plt.title('Preferred Return on Investment by ' + df_WoE.columns[0])
    plt.xticks(rotation=rotation_of_x_axis_labels)

# Example usage:
# plot_by_woe(df_WoE, rotation_of_x_axis_labels=45)


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105268 entries, 0 to 113936
Data columns (total 81 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingKey                           105268 non-null  object 
 1   ListingNumber                        105268 non-null  int64  
 2   CreditGrade                          27628 non-null   object 
 3   LoanTenure                           105268 non-null  int64  
 4   LoanStatus                           105268 non-null  object 
 5   BorrowerAPR                          105243 non-null  float64
 6   BorrowerRate                         105268 non-null  float64
 7   LenderYield                          105268 non-null  float64
 8   EstimatedEffectiveYield              77520 non-null   float64
 9   EstimatedLoss                        77520 non-null   float64
 10  EstimatedReturn                      77520 non-null   float64
 11  ProsperRating (num

# Preferred Return on Investment (PROI):

In [24]:
def calculate_proi(df):
    # Calculate ROI
    df['InterestAmount'] = (df['LoanOriginalAmount']*(df['BorrowerRate'])*(df['LoanTenure'] / 12))
    df['TotalAmount'] = (df['InterestAmount'] + df['LoanOriginalAmount'])
    df['ROI'] = (df['InterestAmount'] / df['LoanOriginalAmount']) / (df['LoanTenure'] / 12)
    print("ROI Statistics:")
    print(df['ROI'].describe())

    # Initialize PROI with the median of ROI
    df['PROI'] = df['ROI'].median()

    # Apply conditions using vectorized operations
    # Check out LP_CustomerPrincipalPayments
    df['PROI'] += np.where(df['LP_CustomerPrincipalPayments'] <= 1000, 0.05, 0)
    df['PROI'] -= np.where((df['LP_CustomerPrincipalPayments'] > 2000) & (df['LP_CustomerPrincipalPayments'] <= 10500), 0.05, 0)
    df['PROI'] -= np.where(df['LP_CustomerPrincipalPayments'] > 10500, 0.1, 0)

    # Check out ProsperRating (Alpha)
    df['PROI'] += np.where(df['ProsperRating (Alpha)'].isin(['C', 'D']), 0.05, 0)
    df['PROI'] -= np.where(df['ProsperRating (Alpha)'] == 'G', 0.05, 0)

    # Check out LoanOriginalAmount
    df['PROI'] -= np.where(df['LoanOriginalAmount'] <= 2000, 0.05, 0)
    df['PROI'] += np.where((df['LoanOriginalAmount'] > 19500) & (df['LoanOriginalAmount'] <= 25500), 0.05, 0)
    df['PROI'] += np.where(df['LoanOriginalAmount'] > 25500, 0.1, 0)

    # Check out LoanCurrentDaysDelinquent
    df['PROI'] += np.where(df['LoanCurrentDaysDelinquent'] >= 50, 0.05, 0)

    # Check out MonthlyLoanPayment
    df['PROI'] -= np.where(df['MonthlyLoanPayment'] <= 90, 0.05, 0)
    df['PROI'] += np.where((df['MonthlyLoanPayment'] > 360) & (df['MonthlyLoanPayment'] <= 750), 0.05, 0)

    print("Revised PROI Statistics:")
    print(df['PROI'].describe())

    return df['PROI']

# Example of how to call the function
# Assuming 'df' is your DataFrame loaded with the relevant columns
proi_values = calculate_proi(df)


ROI Statistics:
count    105268.000000
mean          0.190627
std           0.074131
min           0.000000
25%           0.131400
50%           0.180000
75%           0.249200
max           0.497500
Name: ROI, dtype: float64
Revised PROI Statistics:
count    105268.000000
mean          0.201229
std           0.071094
min           0.030000
25%           0.130000
50%           0.180000
75%           0.280000
max           0.430000
Name: PROI, dtype: float64


# Checking null values in the columns
Filling with mean and mode

In [25]:
columns_with_null = df.columns[df.isnull().any()]
for column in columns_with_null:
    # Fill null values in numerical columns with the mean
    if df[column].dtype in ['float64', 'int64']:
        df[column].fillna(df[column].mean(), inplace=True)
    # Fill null values in categorical columns with the most frequent category
    elif df[column].dtype == 'object':
        # For date columns, fill null values with "Unknown"
        if column in ['DateCreditPulled', 'FirstRecordedCreditLine']:
            df[column].fillna("None", inplace=True)
        else:
            df[column].fillna(df[column].mode()[0], inplace=True)

In [26]:
df

Unnamed: 0,ListingKey,ListingNumber,CreditGrade,LoanTenure,LoanStatus,BorrowerAPR,BorrowerRate,LenderYield,EstimatedEffectiveYield,EstimatedLoss,...,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors,defaultstatus,EMI,ELA,InterestAmount,TotalAmount,ROI,PROI
0,1021339766868145413AB3B,193129,C,36,Completed,0.16516,0.1580,0.1380,0.166604,0.078594,...,0,0.0,258,0,1496.764570,63034.0,4467.450,13892.450,0.1580,0.13
1,10273602499503308B223C1,1209647,C,36,Current,0.12016,0.0920,0.0820,0.079600,0.024900,...,0,0.0,1,0,0.000000,43120.0,2760.000,12760.000,0.0920,0.23
2,0EE9337825851032864889A,81716,HR,36,Completed,0.28269,0.2750,0.2400,0.166604,0.078594,...,0,0.0,41,0,825.406311,32711.0,2475.825,5476.825,0.2750,0.13
3,0EF5356002482715299901A,658116,C,36,Current,0.12528,0.0974,0.0874,0.084900,0.024900,...,0,0.0,158,0,413.021192,45064.0,2922.000,12922.000,0.0974,0.13
4,0F023589499656230C5E3E2,909464,C,36,Current,0.24614,0.2085,0.1985,0.183160,0.092500,...,0,0.0,20,0,326.288414,127590.0,9382.500,24382.500,0.2085,0.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113932,E6D9357655724827169606C,753087,C,36,Current,0.22354,0.1864,0.1764,0.164900,0.069900,...,0,0.0,1,0,418.123199,77104.0,5592.000,15592.000,0.1864,0.23
113933,E6DB353036033497292EE43,537216,C,36,FinalPaymentInProgress,0.13220,0.1110,0.1010,0.100700,0.020000,...,0,0.0,22,0,226.812921,9992.0,666.000,2666.000,0.1110,0.08
113934,E6E13596170052029692BB1,1069178,C,60,Current,0.23984,0.2150,0.2050,0.188280,0.102500,...,0,0.0,119,0,39.377582,139000.0,10750.000,20750.000,0.2150,0.28
113935,E6EB3531504622671970D9E,539056,C,60,Completed,0.28408,0.2605,0.2505,0.244500,0.085000,...,0,0.0,274,0,3907.503625,249450.0,19537.500,34537.500,0.2605,0.18


# Data Encoding for Categorical data columns 

In [27]:
categorical_columns = df.select_dtypes(exclude=[float, int]).columns
#Data encoding using label encoder

# Exclude date columns from categorical columns
date_columns = ['DateCreditPulled', 'FirstRecordedCreditLine']
categorical_columns = df.select_dtypes(exclude=['float', 'int']).columns
categorical_columns = [col for col in categorical_columns if col not in date_columns]

# Use LabelEncoder for non-date categorical columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column].astype(str))


In [28]:
categorical_columns

['ListingKey',
 'CreditGrade',
 'LoanStatus',
 'ProsperRating (Alpha)',
 'BorrowerState',
 'Occupation',
 'EmploymentStatus',
 'IsBorrowerHomeowner',
 'CurrentlyInGroup',
 'GroupKey',
 'IncomeRange',
 'IncomeVerifiable',
 'LoanKey',
 'LoanOriginationQuarter',
 'MemberKey']

In [29]:
df

Unnamed: 0,ListingKey,ListingNumber,CreditGrade,LoanTenure,LoanStatus,BorrowerAPR,BorrowerRate,LenderYield,EstimatedEffectiveYield,EstimatedLoss,...,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors,defaultstatus,EMI,ELA,InterestAmount,TotalAmount,ROI,PROI
0,6571,193129,3,36,2,0.16516,0.1580,0.1380,0.166604,0.078594,...,0,0.0,258,0,1496.764570,63034.0,4467.450,13892.450,0.1580,0.13
1,6583,1209647,3,36,3,0.12016,0.0920,0.0820,0.079600,0.024900,...,0,0.0,1,0,0.000000,43120.0,2760.000,12760.000,0.0920,0.23
2,6084,81716,6,36,2,0.28269,0.2750,0.2400,0.166604,0.078594,...,0,0.0,41,0,825.406311,32711.0,2475.825,5476.825,0.2750,0.13
3,6102,658116,3,36,3,0.12528,0.0974,0.0874,0.084900,0.024900,...,0,0.0,158,0,413.021192,45064.0,2922.000,12922.000,0.0974,0.13
4,6119,909464,3,36,3,0.24614,0.2085,0.1985,0.183160,0.092500,...,0,0.0,20,0,326.288414,127590.0,9382.500,24382.500,0.2085,0.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113932,94097,753087,3,36,3,0.22354,0.1864,0.1764,0.164900,0.069900,...,0,0.0,1,0,418.123199,77104.0,5592.000,15592.000,0.1864,0.23
113933,94106,537216,3,36,5,0.13220,0.1110,0.1010,0.100700,0.020000,...,0,0.0,22,0,226.812921,9992.0,666.000,2666.000,0.1110,0.08
113934,94115,1069178,3,60,3,0.23984,0.2150,0.2050,0.188280,0.102500,...,0,0.0,119,0,39.377582,139000.0,10750.000,20750.000,0.2150,0.28
113935,94132,539056,3,60,2,0.28408,0.2605,0.2505,0.244500,0.085000,...,0,0.0,274,0,3907.503625,249450.0,19537.500,34537.500,0.2605,0.18


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105268 entries, 0 to 113936
Data columns (total 85 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingKey                           105268 non-null  int64  
 1   ListingNumber                        105268 non-null  int64  
 2   CreditGrade                          105268 non-null  int64  
 3   LoanTenure                           105268 non-null  int64  
 4   LoanStatus                           105268 non-null  int64  
 5   BorrowerAPR                          105268 non-null  float64
 6   BorrowerRate                         105268 non-null  float64
 7   LenderYield                          105268 non-null  float64
 8   EstimatedEffectiveYield              105268 non-null  float64
 9   EstimatedLoss                        105268 non-null  float64
 10  EstimatedReturn                      105268 non-null  float64
 11  ProsperRating (num

# Feature Engineering

In [31]:
df = df.drop(columns=['CreditGrade','ListingNumber',
       'ProsperRating (Alpha)','BorrowerState','ListingCategory (numeric)',
       'Occupation',
       'CurrentlyInGroup', 'GroupKey', 'FirstRecordedCreditLine', 
       'LoanKey','DateCreditPulled', 'FirstRecordedCreditLine','LoanStatus','LoanFirstDefaultedCycleNumber',
       'LP_GrossPrincipalLoss','LoanNumber'])

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105268 entries, 0 to 113936
Data columns (total 70 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingKey                           105268 non-null  int64  
 1   LoanTenure                           105268 non-null  int64  
 2   BorrowerAPR                          105268 non-null  float64
 3   BorrowerRate                         105268 non-null  float64
 4   LenderYield                          105268 non-null  float64
 5   EstimatedEffectiveYield              105268 non-null  float64
 6   EstimatedLoss                        105268 non-null  float64
 7   EstimatedReturn                      105268 non-null  float64
 8   ProsperRating (numeric)              105268 non-null  float64
 9   ProsperScore                         105268 non-null  float64
 10  EmploymentStatus                     105268 non-null  int64  
 11  EmploymentStatusDu

# Pipeline

In [33]:
# import pandas as pd
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.metrics import mean_squared_error, r2_score
# import dill

# # Custom transformer to fit and extract probabilities from binary classifier
# class BinaryProbTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, binary_classifier):
#         self.binary_classifier = binary_classifier

#     def fit(self, X, y=None):
#         # Note: Using y_binary_train here as it's the correct target for the binary classifier
#         self.binary_classifier.fit(X, y_binary_train)
#         return self

#     def transform(self, X):
#         return self.binary_classifier.predict_proba(X)[:, 1].reshape(-1, 1)

# # Assuming df is your DataFrame and binary_target, multiclass_targets are defined
# binary_target = 'defaultstatus'
# multiclass_targets = ['EMI', 'ELA', 'PROI']

# # Binary classification data preparation
# X_binary = df.drop(columns=[binary_target])
# y_binary = df[binary_target]

# # mi = mutual_info_classif(X_binary, y_binary, random_state=42)
# # mi_df = pd.DataFrame(mi, index=X_binary.columns, columns=['MI']).sort_values(by='MI', ascending=False)
# # top_features_binary = mi_df.head(10).index.tolist()
# top_features_binary = [
#     'StatedMonthlyIncome',
#     'LoanOriginalAmount',
#     'LoanTenure',
#     'ProsperScore',
#     'MonthlyLoanPayment',
#     'BorrowerRate',
#     'LoanCurrentDaysDelinquent',
#     'ProsperRating (numeric)',
#     'LP_CustomerPrincipalPayments',
#     'IncomeVerifiable',
#     'EmploymentStatus',
#     'DebtToIncomeRatio',
#     'IncomeRange' 
# ]
# X_binary_top = X_binary[top_features_binary]

# print("\nTop features based on mutual information for binary classification:\n", top_features_binary)

# X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(X_binary_top, y_binary, test_size=0.2, random_state=42)

# # Scaling the data
# scaler_binary = StandardScaler()
# X_binary_train_scaled = scaler_binary.fit_transform(X_binary_train)
# X_binary_test_scaled = scaler_binary.transform(X_binary_test)

# # Binary Classification Model
# binary_clf = LogisticRegression(random_state=42)

# # Create a pipeline for binary classification
# binary_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('binary_classifier', binary_clf)
# ])

# # Fit the binary pipeline
# binary_pipeline.fit(X_binary_train_scaled, y_binary_train)

# # Calculate the accuracy of the binary classification model
# binary_accuracy = binary_pipeline.score(X_binary_test_scaled, y_binary_test)
# print(f"Accuracy of the binary classification model: {binary_accuracy:.4f}")

# # Multiclass regression data preparation
# X_multi = df.drop(columns=multiclass_targets + [binary_target])
# y_multi = df[multiclass_targets]

# # mutual_info_df = pd.DataFrame(index=X_multi.columns, columns=y_multi.columns)
# # for target_col in y_multi.columns:
# #     mutual_info = mutual_info_regression(X_multi, y_multi[target_col], random_state=42)
# #     mutual_info_df[target_col] = mutual_info

# # mutual_info_df['Average Mutual Information'] = mutual_info_df.mean(axis=1)
# # mutual_info_df = mutual_info_df.sort_values(by='Average Mutual Information', ascending=False)
# # top_features_multi = mutual_info_df.head(10).index.tolist()
# top_features_multi = [
#     'StatedMonthlyIncome',
#     'LoanOriginalAmount',
#     'LoanTenure',
#     'ProsperScore',
#     'MonthlyLoanPayment',
#     'BorrowerRate',
#     'LoanCurrentDaysDelinquent',
#     'ProsperRating (numeric)',
#     'LP_CustomerPrincipalPayments',
#     'IncomeVerifiable',
#     'EmploymentStatus',
#     'DebtToIncomeRatio',
#     'IncomeRange' 
# ]
# X_multi_top = X_multi[top_features_multi]

# print("\nTop features based on mutual information for multiclass regression:\n", top_features_multi)

# X_multi_train, X_multi_test, y_multi_train, y_multi_test = train_test_split(X_multi_top, y_multi, test_size=0.2, random_state=42)

# # Define multiclass regressors
# multi_regs = {target: GradientBoostingRegressor(random_state=42) for target in multiclass_targets}

# overall_mse = {}
# overall_r2 = {}
# predictions = {}

# for target in multiclass_targets:
#     combined_pipeline = Pipeline([
#         ('features', FeatureUnion([
#             ('original_features', FunctionTransformer(lambda X: X)),  # Pass original features
#             ('binary_probabilities', BinaryProbTransformer(binary_pipeline))  # Add binary probabilities
#         ])),
#         ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
#         ('scaler', StandardScaler()),
#         ('regressor', multi_regs[target])
#     ])

#     # Fitting the pipeline with the correct target values for multiclass regression
#     combined_pipeline.fit(X_multi_train, y_multi_train[target])
#     y_multi_pred = combined_pipeline.predict(X_multi_test)
    
#     # Store predictions
#     predictions[target] = y_multi_pred

#     mse = mean_squared_error(y_multi_test[target], y_multi_pred)
#     r2 = r2_score(y_multi_test[target], y_multi_pred)

#     overall_mse[target] = mse
#     overall_r2[target] = r2

#     print(f"Combined Pipeline Multiclass Regression Report for {target}:")
#     print(f"Mean Squared Error: {mse:.4f}")
#     print(f"R^2 Score: {r2:.4f}")

# print("\nOverall Mean Squared Error and R^2 for each multiclass target:")
# for target in multiclass_targets:
#     print(f"{target}: MSE={overall_mse[target]:.4f}, R^2={overall_r2[target]:.4f}")

# # Save the pipeline for each target
# for target in multiclass_targets:
#     with open(f'combined_pipeline_{target}.pkl', 'wb') as f:
#         dill.dump(combined_pipeline, f)

# # Create DataFrame to compare test vs predictions
# comparison_df = pd.DataFrame()

# for target in multiclass_targets:
#     comparison_df[f'{target}_Actual'] = y_multi_test[target].values
#     comparison_df[f'{target}_Predicted'] = predictions[target]

# print("\nComparison DataFrame of Test vs Predictions:\n", comparison_df.head())

# # Save the comparison DataFrame
# comparison_df.to_csv('comparison_test_vs_predictions.csv', index=False)


In [34]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
import dill

# Assuming df is your DataFrame and binary_target, multiclass_targets are defined
binary_target = 'defaultstatus'
multiclass_targets = ['EMI', 'ELA', 'PROI']

# Binary classification data preparation
X_binary = df.drop(columns=[binary_target])
y_binary = df[binary_target]

top_features_binary = [
    'StatedMonthlyIncome',
    'LoanOriginalAmount',
    'LoanTenure',
    'ProsperScore',
    'MonthlyLoanPayment',
    'BorrowerRate',
    'LoanCurrentDaysDelinquent',
    'ProsperRating (numeric)',
    'IncomeVerifiable',
    'DebtToIncomeRatio',
    'LP_CustomerPrincipalPayments'
]
X_binary_top = X_binary[top_features_binary]

print("\nTop features based on mutual information for binary classification:\n", top_features_binary)

X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(X_binary_top, y_binary, test_size=0.2, random_state=42)

# Scaling the data
scaler_binary = StandardScaler()
X_binary_train_scaled = scaler_binary.fit_transform(X_binary_train)
X_binary_test_scaled = scaler_binary.transform(X_binary_test)

# Binary Classification Model
binary_clf = LogisticRegression(random_state=42)

# Create a pipeline for binary classification
binary_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('binary_classifier', binary_clf)
])

# Fit the binary pipeline
binary_pipeline.fit(X_binary_train_scaled, y_binary_train)

# Calculate the accuracy of the binary classification model
binary_accuracy = binary_pipeline.score(X_binary_test_scaled, y_binary_test)
print(f"Accuracy of the binary classification model: {binary_accuracy:.4f}")

# Multiclass regression data preparation
X_multi = df.drop(columns=multiclass_targets + [binary_target])
y_multi = df[multiclass_targets]

top_features_multi = [
    'StatedMonthlyIncome',
    'LoanOriginalAmount',
    'LoanTenure',
    'ProsperScore',
    'MonthlyLoanPayment',
    'BorrowerRate',
    'LoanCurrentDaysDelinquent',
    'ProsperRating (numeric)',
    'IncomeVerifiable',
    'DebtToIncomeRatio',
    'LP_CustomerPrincipalPayments'
]
X_multi_top = X_multi[top_features_multi]

print("\nTop features based on mutual information for multiclass regression:\n", top_features_multi)

X_multi_train, X_multi_test, y_multi_train, y_multi_test = train_test_split(X_multi_top, y_multi, test_size=0.2, random_state=42)

# Custom transformer to fit and extract probabilities from binary classifier
class BinaryProbTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, binary_classifier):
        self.binary_classifier = binary_classifier

    def fit(self, X, y=None):
        self.binary_classifier.fit(X, y_binary_train)
        return self

    def transform(self, X):
        return self.binary_classifier.predict_proba(X)[:, 1].reshape(-1, 1)

# Define multiclass regressors
multi_regs = {target: GradientBoostingRegressor(random_state=42) for target in multiclass_targets}

overall_mse = {}
overall_r2 = {}
predictions = {}

for target in multiclass_targets:
    combined_pipeline = Pipeline([
        ('features', FeatureUnion([
            ('original_features', FunctionTransformer(lambda X: X)),  # Pass original features
            ('binary_probabilities', BinaryProbTransformer(binary_pipeline))  # Add binary probabilities
        ])),
        ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ('scaler', StandardScaler()),
        ('regressor', multi_regs[target])
    ])

    # Fitting the pipeline with the correct target values for multiclass regression
    combined_pipeline.fit(X_multi_train, y_multi_train[target])
    y_multi_pred = combined_pipeline.predict(X_multi_test)
    
    # Store predictions
    predictions[target] = y_multi_pred

    mse = mean_squared_error(y_multi_test[target], y_multi_pred)
    r2 = r2_score(y_multi_test[target], y_multi_pred)

    overall_mse[target] = mse
    overall_r2[target] = r2

    print(f"Combined Pipeline Multiclass Regression Report for {target}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R^2 Score: {r2:.4f}")

    # Save the pipeline for each target after fitting
    with open(f'combined_pipeline_{target}.pkl', 'wb') as f:
        dill.dump(combined_pipeline, f)

print("\nOverall Mean Squared Error and R^2 for each multiclass target:")
for target in multiclass_targets:
    print(f"{target}: MSE={overall_mse[target]:.4f}, R^2={overall_r2[target]:.4f}")

# Create DataFrame to compare test vs predictions
comparison_df = pd.DataFrame()

for target in multiclass_targets:
    comparison_df[f'{target}_Actual'] = y_multi_test[target].values
    comparison_df[f'{target}_Predicted'] = predictions[target]

print("\nComparison DataFrame of Test vs Predictions:\n", comparison_df.head())

# Save the comparison DataFrame
comparison_df.to_csv('comparison_test_vs_predictions.csv', index=False)

# Function to load a pipeline for a specific target
def load_pipeline(target):
    with open(f'combined_pipeline_{target}.pkl', 'rb') as f:
        return dill.load(f)

# Example usage to load and predict for a specific target
loaded_pipelines = {}
predictions_new = {}

# Load the saved pipelines
for target in multiclass_targets:
    loaded_pipelines[target] = load_pipeline(target)

# Use the loaded pipelines to make predictions on new data (assuming X_multi_test as new data here)
for target in multiclass_targets:
    y_new_pred = loaded_pipelines[target].predict(X_multi_test)
    predictions_new[target] = y_new_pred
    print(f"Predictions for {target}:\n", y_new_pred[:10])  # Display first 10 predictions for each target

# Predict binary outcomes on the test set
y_binary_pred = binary_pipeline.predict(X_binary_test_scaled)

# Create a DataFrame to compare actual and predicted binary classification values
comparison_binary_df = pd.DataFrame({
    'Actual': y_binary_test,
    'Predicted': y_binary_pred
})

# Calculate accuracy of the binary classification model
binary_accuracy = accuracy_score(y_binary_test, y_binary_pred)

# Print the comparison DataFrame
print("\nComparison DataFrame of Actual vs Predicted for Binary Classification:\n", comparison_binary_df.head())

# Print accuracy, confusion matrix, and classification report
print(f"Accuracy of the binary classification model: {binary_accuracy:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_binary_test, y_binary_pred))
print("\nClassification Report:\n", classification_report(y_binary_test, y_binary_pred))

# Save the comparison DataFrame
comparison_binary_df.to_csv('comparison_binary_actual_vs_predicted.csv', index=False)



Top features based on mutual information for binary classification:
 ['StatedMonthlyIncome', 'LoanOriginalAmount', 'LoanTenure', 'ProsperScore', 'MonthlyLoanPayment', 'BorrowerRate', 'LoanCurrentDaysDelinquent', 'ProsperRating (numeric)', 'IncomeVerifiable', 'DebtToIncomeRatio', 'LP_CustomerPrincipalPayments']
Accuracy of the binary classification model: 0.9997

Top features based on mutual information for multiclass regression:
 ['StatedMonthlyIncome', 'LoanOriginalAmount', 'LoanTenure', 'ProsperScore', 'MonthlyLoanPayment', 'BorrowerRate', 'LoanCurrentDaysDelinquent', 'ProsperRating (numeric)', 'IncomeVerifiable', 'DebtToIncomeRatio', 'LP_CustomerPrincipalPayments']
Combined Pipeline Multiclass Regression Report for EMI:
Mean Squared Error: 72.0185
R^2 Score: 0.9999
Combined Pipeline Multiclass Regression Report for ELA:
Mean Squared Error: 3097205.8294
R^2 Score: 0.9992
Combined Pipeline Multiclass Regression Report for PROI:
Mean Squared Error: 0.0000
R^2 Score: 0.9960

Overall Me

In [42]:
X_multi_test.head(10)

Unnamed: 0,StatedMonthlyIncome,LoanOriginalAmount,LoanTenure,ProsperScore,MonthlyLoanPayment,BorrowerRate,LoanCurrentDaysDelinquent,ProsperRating (numeric),IncomeVerifiable,DebtToIncomeRatio,LP_CustomerPrincipalPayments
49554,9333.333333,10500,60,5.0,249.79,0.15,0,5.0,0,0.27,233.52
44372,3666.25,14000,36,10.0,438.64,0.0799,0,7.0,0,0.14,14000.0
111344,4490.0,3500,36,6.06308,0.0,0.27,532,4.13194,0,0.15,1306.47
92009,2583.333333,2500,36,6.06308,94.19,0.21,0,4.13194,0,0.08,2500.0
104641,2400.0,4000,36,2.0,169.78,0.2999,0,1.0,0,0.32,369.87
58778,3916.666667,5000,60,5.0,157.8,0.287,0,3.0,0,0.15,1087.09
55960,4992.583333,10000,36,6.06308,392.33,0.24,0,4.13194,0,0.14,10000.0
71363,8333.333333,1000,36,10.0,30.2,0.055,0,7.0,0,0.03,999.99
103696,2116.666667,1150,36,6.06308,52.02,0.35,0,4.13194,0,0.25,1149.94
99209,1041.666667,2000,36,8.0,72.29,0.1799,0,5.0,0,0.12,1487.46


In [35]:
label_encoder = LabelEncoder()

# Fit LabelEncoder to 'EmploymentStatus' column
encoded_labels = label_encoder.fit_transform(df['EmploymentStatus'])

# Print encoded labels
print("Encoded labels:")
print(encoded_labels)

# Print encoded mapping
encoded_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("\nEncoded mapping:")
print(encoded_mapping)

Encoded labels:
[7 0 2 ... 0 1 0]

Encoded mapping:
{np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2), np.int64(3): np.int64(3), np.int64(4): np.int64(4), np.int64(5): np.int64(5), np.int64(6): np.int64(6), np.int64(7): np.int64(7)}


In [36]:
df['BorrowerRate'].head()

0    0.1580
1    0.0920
2    0.2750
3    0.0974
4    0.2085
Name: BorrowerRate, dtype: float64

In [37]:
df['ROI'].head()

0    0.1580
1    0.0920
2    0.2750
3    0.0974
4    0.2085
Name: ROI, dtype: float64

In [38]:
df['EMI'].head()


0    1496.764570
1       0.000000
2     825.406311
3     413.021192
4     326.288414
Name: EMI, dtype: float64

In [39]:
df['StatedMonthlyIncome'].head()

0    3083.333333
1    6125.000000
2    2083.333333
3    2875.000000
4    9583.333333
Name: StatedMonthlyIncome, dtype: float64

In [40]:
df['DebtToIncomeRatio'].describe()

count    105268.000000
mean          0.258845
std           0.371385
min           0.000000
25%           0.140000
50%           0.220000
75%           0.310000
max          10.010000
Name: DebtToIncomeRatio, dtype: float64

In [41]:
df['EMI'].head(5)/df['StatedMonthlyIncome'].head(5)

0    0.485437
1    0.000000
2    0.396195
3    0.143660
4    0.034047
dtype: float64