In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

In [40]:
path = 'DataMarch.csv'
df = pd.read_csv(path)



In [41]:
df

Unnamed: 0,Age,Gender,AppliedAmount,Amount,Interest,LoanDuration,MonthlyPayment,UseOfLoan,Education,MaritalStatus,...,Target,Status,PrincipalWriteOffs,InterestAndPenaltyWriteOffs,PrincipalDebtServicingCost,InterestAndPenaltyDebtServicingCost,EMI,Risk,ELA,ROI
0,20,0.0,319.5582,319.56,40.00,1,0.00,0,3.0,3.0,...,1,1,0.00,0.00,0.0,0.00,32.745603,0.5,0.240099,0.009067
1,49,1.0,319.5582,319.56,30.00,20,0.00,2,2.0,4.0,...,1,1,0.00,0.00,0.0,0.00,8.010331,0.5,0.123593,0.006730
2,19,0.0,319.5582,70.30,49.00,2,0.00,3,2.0,2.0,...,1,1,0.00,0.00,0.0,0.00,21.138012,0.5,0.156880,0.002239
3,41,0.0,191.7349,191.75,30.00,18,0.00,3,5.0,1.0,...,1,1,0.00,0.00,0.0,54.14,4.816621,0.5,0.214272,0.003926
4,19,0.0,115.0410,19.17,49.00,1,0.00,2,2.0,2.0,...,1,1,0.00,0.00,0.0,0.00,12.317211,0.5,0.137941,0.000406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27842,37,0.0,3000.0000,3000.00,31.01,60,107.68,1,3.0,3.0,...,1,2,691.81,1552.18,0.0,1040.06,77.525001,0.5,0.093176,0.067752
27843,35,0.0,3000.0000,3000.00,18.25,60,85.33,3,3.0,3.0,...,0,2,0.00,0.00,0.0,0.00,45.625870,0.5,0.090808,0.039758
27844,40,0.0,2500.0000,2500.00,24.83,60,80.42,1,3.0,1.0,...,1,1,0.00,0.00,0.0,1051.85,51.729187,0.5,0.094610,0.045115
27845,47,1.0,3000.0000,3000.00,17.74,60,84.51,2,5.0,1.0,...,1,2,742.46,4829.92,0.0,889.50,44.351143,0.0,0.086908,0.038639


In [42]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['EMI', 'ELA', 'ROI'], axis=1), df[['EMI', 'ELA', 'ROI']], test_size=0.2, random_state=42)



In [43]:
# Define the preprocessing steps for the numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])



In [44]:
# Combine the preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['Amount', 'Interest', 'LoanDuration', 'IncomeTotal', 'LiabilitiesTotal', 'DebtToIncome', 'FreeCash']),
        ('cat', categorical_transformer, ['Gender', 'Status', 'Education', 'EmploymentStatus', 'MaritalStatus', 'HomeOwnershipType', 'UseOfLoan'])
    ])



# Linear Regressor

In [47]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Define the regression model
model = LinearRegression()

# Define the pipeline
Reg23 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Fit the pipeline on the training data
Reg23.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Evaluate the performance of the model using mean squared error and R-squared
print('Mean squared error:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))




Mean squared error: 847.7643050089245
R-squared: 0.825488092674902


In [48]:
with open('reg23','wb') as f:
    pickle.dump(Reg23,f) 

# Gradient Boosting Classifier

In [25]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def fit_and_evaluate(X_train, y_train, X_test, y_test, target):
    y_train_target = y_train[target]
    y_test_target = y_test[target]

    Classification_Pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', GradientBoostingRegressor(random_state=0))
    ])

    Classification_Pipeline.fit(X_train, y_train_target)
    y_pred = Classification_Pipeline.predict(X_test)

    mse = mean_squared_error(y_test_target, y_pred)
    r2 = r2_score(y_test_target, y_pred)

    print(f"{target}:\nMean squared error: {mse}\nR-squared: {r2}\n")


# Run the function for each target variable
fit_and_evaluate(X_train, y_train, X_test, y_test, 'ELA')
fit_and_evaluate(X_train, y_train, X_test, y_test, 'ROI')
fit_and_evaluate(X_train, y_train, X_test, y_test, 'EMI')



ELA:
Mean squared error: 1.7019499464407497e-05
R-squared: 0.8993888538288323

ROI:
Mean squared error: 1.1660968231033118e-05
R-squared: 0.9953899590356298

EMI:
Mean squared error: 34.238379103719396
R-squared: 0.9947316685474115



# Save the model

In [50]:


# Define the three GradientBoostingRegressor models for each target variable
models = {
    'ELA': GradientBoostingRegressor(random_state=0),
    'ROI': GradientBoostingRegressor(random_state=0),
    'EMI': GradientBoostingRegressor(random_state=0)
}

# Create a dictionary to store the model pipelines
model_pipelines = {}

# Loop through the models and create a pipeline for each one
for target_var, model in models.items():
    # select only the target variable
    y_train_ela = y_train[target_var]
    y_test_ela = y_test[target_var]

    # create a pipeline
    model_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    # fit the model
    model_pipeline.fit(X_train, y_train_ela)

    # add the pipeline to the dictionary
    model_pipelines[target_var] = model_pipeline

# Save the model pipelines into a single pickle file

    
with open('class23','wb') as f:
    pickle.dump(model_pipelines,f)     
    


In [30]:
from sklearn.linear_model import LogisticRegression