In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

In [4]:
path = 'DataMarch.csv'
df = pd.read_csv(path)



In [124]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['EMI', 'ELA', 'ROI', 'Risk', 'Target'], axis=1), df[['EMI', 'ELA', 'ROI', 'Risk', 'Target']], test_size=0.2, random_state=42)



In [6]:
# Define the preprocessing steps for the numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])



In [83]:
df.columns

Index(['Age', 'Gender', 'AppliedAmount', 'Amount', 'Interest', 'LoanDuration',
       'MonthlyPayment', 'UseOfLoan', 'Education', 'MaritalStatus',
       'EmploymentStatus', 'WorkExperience', 'HomeOwnershipType',
       'IncomeTotal', 'LiabilitiesTotal', 'DebtToIncome', 'FreeCash', 'Target',
       'Status', 'PrincipalWriteOffs', 'InterestAndPenaltyWriteOffs',
       'PrincipalDebtServicingCost', 'InterestAndPenaltyDebtServicingCost',
       'EMI', 'Risk', 'ELA', 'ROI'],
      dtype='object')

In [104]:
# Combine the preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['AppliedAmount', 'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment', 'IncomeTotal', 'LiabilitiesTotal']),
        ('cat', categorical_transformer, ['Target'])
    ])



# Random Forest Regressor

In [105]:
# Define the regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)



In [106]:
# Define the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])



In [107]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)




Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['AppliedAmount', 'Amount',
                                                   'Interest', 'LoanDuration',
                                                   'MonthlyPayment',
                                                   'IncomeTotal',
                                                   'LiabilitiesTotal']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Target'])])),
                ('model', RandomForestRegressor(random_state=42))])

In [108]:
# Make predictions on the testing data
y_pred = pipeline.predict(X_test)



In [109]:
# Evaluate the performance of the model using mean squared error and R-squared
print('Mean squared error:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

Mean squared error: 5.964252668523056
R-squared: 0.6325305576245034


# Gradient Boosting Classifier

In [110]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor



### Gradient Boosting Classifier for the 'ELA' variable

In [118]:
# select only the 'ELA' target variable
y_train_ela = y_train['ELA']
y_test_ela = y_test['ELA']

# create a pipeline
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model
model_pipeline.fit(X_train, y_train_ela)

# make predictions
y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test_ela, y_pred_ela)
r2 = r2_score(y_test_ela, y_pred_ela)
print('Mean squared error:', mse)
print('R-squared:', r2)


Mean squared error: 1.4961682267051221e-05
R-squared: 0.9098979462279443


### Gradient Boosting Classifier for the 'ROI' variable

In [119]:
# select only the 'ROI' target variable
y_train_ela = y_train['ROI']
y_test_ela = y_test['ROI']

# create a pipeline
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model
model_pipeline.fit(X_train, y_train_ela)

# make predictions
y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test_ela, y_pred_ela)
r2 = r2_score(y_test_ela, y_pred_ela)
print('Mean squared error:', mse)
print('R-squared:', r2)


Mean squared error: 2.1785148654781477e-05
R-squared: 0.9915216929446455


### Gradient Boosting Classifier for the 'EMI' variable

In [120]:
# select only the 'EMI' target variable
y_train_ela = y_train['EMI']
y_test_ela = y_test['EMI']

# create a pipeline
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model
model_pipeline.fit(X_train, y_train_ela)

# make predictions
y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test_ela, y_pred_ela)
r2 = r2_score(y_test_ela, y_pred_ela)
print('Mean squared error:', mse)
print('R-squared:', r2)


Mean squared error: 59.43632475167931
R-squared: 0.9922139990632141


### Gradient Boosting Classifier for the 'Target' variable

In [125]:
# select only the 'Target' target variable
y_train_ela = y_train['Target']
y_test_ela = y_test['Target']

# create a pipeline
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model
model_pipeline.fit(X_train, y_train_ela)

# make predictions
y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test_ela, y_pred_ela)
r2 = r2_score(y_test_ela, y_pred_ela)
print('Mean squared error:', mse)
print('R-squared:', r2)


Mean squared error: 0.02835079415142393
R-squared: 0.8841714688019155


# Save the model

In [131]:
import joblib
import pickle

In [133]:

# Save the Gradient Boosting model
with open('gb_model.pkl', 'wb') as f:
    pickle.dump(model_pipeline, f)

# Save the Random Forest model
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
