In [65]:
import pandas as pd
import numpy as np
import joblib
import os
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import pickle
#from django.conf import settings

credit_card_df = pd.read_csv('Credit_card.csv')
daily_household_transactions_df = pd.read_csv('Daily Household Transactions.csv')
loan_approval_df = pd.read_csv('loan_approval_dataset.csv')  # Load the loan approval dataset

# Define productive and non-productive categories
productive_categories = [
    'Transportation', 'Education', 'Healthcare', 'Groceries', 'Investment',
    'Utilities', 'Rent', 'Insurance', 'Savings', 'Household',
    'Public Provident Fund', 'Life Insurance', 'Interest', 'Tax refund',
    'Fixed Deposit', 'Recurring Deposit'
]
non_productive_categories = [
    'Entertainment', 'Dining out', 'Subscription', 'Luxury', 'Gambling',
    'Alcohol', 'Tobacco', 'Cosmetics', 'Fashion', 'Leisure',
    'Festivals', 'Apparel', 'Gift', 'Social Life', 'Tourism',
    'Beauty', 'Grooming'
]

# Redefine the function to label transactions
def label_transaction(row):
    if row['Category'] in productive_categories:
        return 'Productive'
    elif row['Category'] in non_productive_categories:
        return 'Non-Productive'
    else:
        return 'Unknown'

# Apply the labeling function
daily_household_transactions_df['Label'] = daily_household_transactions_df.apply(label_transaction, axis=1)

# Drop rows with 'Unknown' label
daily_household_transactions_df = daily_household_transactions_df[daily_household_transactions_df['Label'] != 'Unknown']

# Assign synthetic Ind_ID to daily household transactions
np.random.seed(42)  # for reproducibility
num_unique_ids = credit_card_df['Ind_ID'].nunique()
synthetic_ids = np.random.choice(credit_card_df['Ind_ID'].unique(), len(daily_household_transactions_df))

daily_household_transactions_df['Ind_ID'] = synthetic_ids

# Aggregate transaction data by synthetic Ind_ID
agg_transactions_df = daily_household_transactions_df.groupby('Ind_ID').agg({
    'Amount': ['sum', 'mean', 'std'],
    'Label': lambda x: (x == 'Productive').sum() / len(x)  # ratio of productive transactions
}).reset_index()

# Flatten the column hierarchy
agg_transactions_df.columns = ['Ind_ID', 'Total_Amount', 'Mean_Amount', 'Std_Amount', 'Productive_Ratio']

# Merge aggregated transaction data with credit card data
merged_df = pd.merge(credit_card_df, agg_transactions_df, on='Ind_ID', how='inner')

# Assuming there is no direct match to merge with the loan approval dataset, we will not merge but handle them separately

# Feature Engineering: Handle missing values and encode categorical variables

# Fill missing values for numerical columns with their mean
numerical_cols = ['Annual_income', 'Birthday_count', 'Std_Amount']
for col in numerical_cols:
    merged_df[col].fillna(merged_df[col].mean(), inplace=True)

# Fill missing values for categorical columns with the mode
categorical_cols = ['GENDER', 'Type_Occupation']
for col in categorical_cols:
    merged_df[col].fillna(merged_df[col].mode()[0], inplace=True)

# Encode categorical variables using one-hot encoding
merged_df = pd.get_dummies(merged_df, columns=['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 'EDUCATION',
                                               'Marital_status', 'Housing_type', 'Type_Occupation'], drop_first=True)

# Select features and target variable
features = merged_df.drop(columns=['Ind_ID'])
target = merged_df['Productive_Ratio'].apply(lambda x: 1 if x > 0.5 else 0)  # Binary target based on productive ratio

# Split the data into training and testing sets using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, stratify=target, random_state=42)

# Function to evaluate models using cross-validation
def evaluate_model(model, X_train, y_train):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return np.mean(scores), np.std(scores)

# Initialize models
logistic_regression = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Evaluate models
logistic_regression_score = evaluate_model(logistic_regression, X_train, y_train)
decision_tree_score = evaluate_model(decision_tree, X_train, y_train)
random_forest_score = evaluate_model(random_forest, X_train, y_train)

# Print evaluation scores
print("Logistic Regression Score:", logistic_regression_score)
print("Decision Tree Score:", decision_tree_score)
print("Random Forest Score:", random_forest_score)

# Train the best performing model (Random Forest in this case)
best_model = random_forest
best_model.fit(X_train, y_train)

# Feature importance analysis
feature_importances = best_model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

print(features_df.head(10))


# Load the trained model
# model_filename = 'finertia/MLModel/model.pkl'
# best_model = joblib.load(model_filename)

def classify_financial_status_and_suggest_plan(form_data):
    # Convert form data to the same format used for model training
    input_data = {
        'Annual_income': form_data['annual_income'],
        'Birthday_count': form_data['birthday_count'],
        'Employed_days': form_data['employed_days'],
        'Mobile_phone': int(form_data['mobile_phone']),
        'Work_Phone': int(form_data['work_phone']),
        'Phone': int(form_data['phone']),
        'EMAIL_ID': int(form_data['email_id']),
        'Family_Members': form_data['family_members'],
        'Total_Amount': form_data['total_amount'],
        'Mean_Amount': form_data['mean_amount'],
        'Std_Amount': form_data['std_amount'],
        'Productive_Ratio': form_data['productive_ratio'],
        'GENDER_F': int(form_data['gender_f']),
        'Car_Owner_Y': int(form_data['car_owner_y']),
        'Propert_Owner_Y': int(form_data['propert_owner_y']),
        'Type_Income_' + form_data['type_income']: 1,
        'EDUCATION_' + form_data['education']: 1,
        'Marital_status_' + form_data['marital_status']: 1,
        'Housing_type_' + form_data['housing_type']: 1,
        'Type_Occupation_' + form_data['type_occupation']: 1
    }

#     sample_input = pd.DataFrame({
#     ' no_of_dependents': [2],
#     ' education': [0],
#     ' self_employed': [0],
#     ' income_annum': [960000],
#     ' cibil_score': [778],
#     ' residential_assets_value': [2400],
#     ' commercial_assets_value': [1760090],
#     ' luxury_assets_value': [227000],
#     ' bank_asset_value': [80000]
# })

    regression_input_data = {
        ' no_of_dependents': form_data['family_members'].
        ' education': + form_data['education']: 1,
        ' self_employed': [0],
        ' income_annum': form_data['annual_income'],
        ' cibil_score': cibil_score,
        ' residential_assets_value': form_data['residential_assets_value'],
        ' commercial_assets_value': form_data['commercial_assets_value'],
        ' luxury_assets_value': form_data['luxury_assets_value'],
        ' bank_asset_value': form_data['bank_asset_value']

    }

    # Ensure all necessary columns are present
    missing_cols = set(features.columns) - set(input_data.keys())
    for col in missing_cols:
        input_data[col] = 0

    input_df = pd.DataFrame([input_data])

    # Reorder columns to match the training data
    input_df = input_df[features.columns]

    # Predict using the trained model
    prediction = best_model.predict(input_df)
    stability = 'Financially Stable' if prediction[0] == 1 else 'Not Financially Stable'

    # Determine loan eligibility and suggested loan amount
    cibil_score = form_data['cibil_score']
    bank_assets_value = form_data['bank_assets_value']
    loan_eligibility = 'Eligible'
    if stability == 'Financially Stable' and cibil_score > 650:
        loan_eligibility = 'Eligible'
        suggested_loan_amount = (0.2 * form_data['annual_income'] + 0.5 * bank_assets_value) / 2
    else:
        loan_eligibility = 'Not Eligible'
        suggested_loan_amount = 0



    # Generate a dynamic step-by-step financial plan
    steps = []
    if stability == 'Not Financially Stable':
        steps.append("1. **Reduce Non-Productive Expenses:** Focus on cutting down spending in non-essential categories.")
        if form_data['productive_ratio'] < 0.3:
            steps.append("2. **Increase Productive Spending:** Ensure essential needs like healthcare and education are prioritized.")
        if form_data['annual_income'] < 30000:
            steps.append("3. **Increase Income:** Consider strategies like upskilling, taking up a side job, or seeking a raise.")
        if bank_assets_value < 5000:
            steps.append("4. **Build Savings:** Start by setting aside a small portion of your income each month to build an emergency fund.")
        if cibil_score < 650:
            steps.append("5. **Improve Credit Score:** Pay off outstanding debts, avoid late payments, and reduce credit utilization.")
        steps.append("6. **Track and Monitor:** Regularly review your expenses and savings. Use budgeting tools or apps to keep track of your financial progress.")

    else:
        steps.append("1. **Maintain Financial Stability:** Continue with your current financial habits to maintain stability.")
        if cibil_score < 700:
            steps.append("2. **Improve Credit Score:** Even though you are financially stable, a higher credit score can provide better loan options. Consider reducing credit card balances and ensuring timely payments.")
        if form_data['total_amount'] > form_data['annual_income'] * 0.5:
            steps.append("3. **Optimize Spending:** Your current expenses are over half of your income. Consider optimizing your spending to ensure more is directed towards savings and investments.")
        steps.append("4. **Invest for the Future:** Explore investment options like retirement accounts, mutual funds, or low-risk savings plans to grow your wealth.")
        steps.append("5. **Plan for Long-Term Goals:** Start planning for significant financial goals such as buying a house, funding education, or retirement.")

    # Convert the steps list into a readable format
    plan_text = "\n".join(steps)

    return stability, loan_eligibility, suggested_loan_amount, plan_text

SyntaxError: invalid syntax (<ipython-input-65-655323a1e3fa>, line 164)

In [8]:
print(loan_approval_df.columns)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [74]:
loan_approval_df = pd.read_csv('loan_approval_dataset.csv')  # Load the loan approval datase
display(loan_approval_df.columns)
# loan_approval_df = loan_approval_df[loan_approval_df[' loan_status'].str.contains('Approved')]
# display(loan_approval_df)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [72]:
loan_approval_df = pd.read_csv('loan_approval_dataset.csv')  # Load the loan approval datase

In [73]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import pickle


loan_approval_df = pd.read_csv('loan_approval_dataset.csv')  # Load the loan approval datase
# Step 3: Data exploration and cleaning
# Display the first few rows of the dataset
print(loan_approval_df.head())

# Check for missing values
print(loan_approval_df.isnull().sum())
loan_approval_df = loan_approval_df[loan_approval_df[' loan_status'].str.contains('Approved')]
display(loan_approval_df)
loan_approval_df = loan_approval_df.drop(' loan_status', axis=1)

# Handle missing values if any (e.g., drop or fill with mean/median)
loan_approval_df = loan_approval_df.dropna()  # or use data.fillna(method='ffill') or data.fillna(data.mean())

# Step 4: Feature encoding
# Strip leading/trailing spaces from column names
loan_approval_df.columns = loan_approval_df.columns.str.strip()

# Encode categorical variables
label_encoders = {}
for column in ['education', 'self_employed']:
    le = LabelEncoder()
    loan_approval_df[column] = le.fit_transform(loan_approval_df[column])
    label_encoders[column] = le

# Step 5: Define features and target variables
features = loan_approval_df.drop(columns=['loan_id', 'loan_amount', 'loan_term'])
target_loan_amount = loan_approval_df['loan_amount']
target_loan_term = loan_approval_df['loan_term']

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train_amount, y_test_amount = train_test_split(features, target_loan_amount, test_size=0.2, random_state=42)
_, _, y_train_term, y_test_term = train_test_split(features, target_loan_term, test_size=0.2, random_state=42)

display(X_train)

# Step 7: Standardize the numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
display(X_train)
X_test = scaler.transform(X_test)

# Step 8: Train the linear regression models
model_amount = LogisticRegression()
model_amount.fit(X_train, y_train_amount)

model_term = LogisticRegression()
model_term.fit(X_train, y_train_term)

# Step 9: Predict on the test set
y_pred_amount = model_amount.predict(X_test)
y_pred_term = model_term.predict(X_test)

# Load the trained model
with open('best_model_amount.pkl', 'wb') as f:
    pickle.dump(best_model_amount, f)

with open('best_model_term.pkl', 'wb') as f:
    pickle.dump(best_model_term, f)

# Step 10: Evaluate the models
mse_amount = mean_squared_error(y_test_amount, y_pred_amount)
r2_amount = r2_score(y_test_amount, y_pred_amount)
print(f'Loan Amount - Mean Squared Error: {mse_amount}')
print(f'Loan Amount - R^2 Score: {r2_amount}')

mse_term = mean_squared_error(y_test_term, y_pred_term)
r2_term = r2_score(y_test_term, y_pred_term)
print(f'Loan Term - Mean Squared Error: {mse_term}')
print(f'Loan Term - R^2 Score: {r2_term}')

loan_amount_score = evaluate_model(model_amount, X_train, y_train_amount)
loan_term_score = evaluate_model(model_term, X_train, y_train_term)
print("Logistic Regression Loan Amount Score:", loan_amount_score)
print("Logistic Regression Loan Term Score:", loan_term_score)

# Step 11: Visualize the results
# plt.figure(figsize=(12, 6))

# plt.subplot(1, 2, 1)
# plt.scatter(y_test_amount, y_pred_amount, alpha=0.5)
# plt.xlabel('Actual Loan Amount')
# plt.ylabel('Predicted Loan Amount')
# plt.title('Actual vs Predicted Loan Amount')

# plt.subplot(1, 2, 2)
# plt.scatter(y_test_term, y_pred_term, alpha=0.5)
# plt.xlabel('Actual Loan Term')
# plt.ylabel('Predicted Loan Term')
# plt.title('Actual vs Predicted Loan Term')

# plt.tight_layout()
# plt.show()



   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0                  1760000

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
6,7,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
8,9,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
10,11,4,Graduate,Yes,2900000,11200000,2,547,8100000,4700000,9500000,3100000,Approved
13,14,2,Graduate,Yes,9100000,31500000,14,679,10800000,16600000,20900000,5000000,Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,4262,3,Graduate,Yes,3000000,7500000,6,881,1400000,4500000,6100000,2300000,Approved
4263,4264,3,Graduate,No,5000000,12700000,14,865,4700000,8100000,19500000,6300000,Approved
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
1445,1,0,1,1800000,689,3200000,2500000,3800000,1600000
3635,3,1,1,5400000,512,1600000,3400000,13100000,6000000
1825,3,1,0,8100000,606,900000,1100000,31900000,11600000
4223,0,1,0,2700000,610,900000,1900000,6100000,2800000
2618,2,1,1,3800000,730,10800000,2900000,10700000,3600000
...,...,...,...,...,...,...,...,...,...
2663,2,0,0,4300000,871,4100000,7300000,13900000,4400000
1761,2,1,1,9800000,304,4700000,19000000,24500000,8600000
1812,5,1,0,9000000,437,18400000,17800000,18300000,10400000
2110,2,0,1,1100000,806,2700000,800000,2900000,500000


array([[-0.859179  , -0.98783296,  1.00851079, ..., -0.57409991,
        -1.2277313 , -1.0340849 ],
       [ 0.31250395,  1.0123169 ,  1.00851079, ..., -0.36880802,
        -0.21424036,  0.30873546],
       [ 0.31250395,  1.0123169 , -0.99156103, ..., -0.89344285,
         1.83453702,  2.01777954],
       ...,
       [ 1.4841869 ,  1.0123169 , -0.99156103, ...,  2.91586222,
         0.35244274,  1.65155581],
       [-0.27333752, -0.98783296,  1.00851079, ..., -0.96187348,
        -1.32581107, -1.36978998],
       [-1.44502047,  1.0123169 , -0.99156103, ...,  1.98064361,
         0.84284158,  0.6139219 ]])

Loan Amount - Mean Squared Error: 21516240601503.758
Loan Amount - R^2 Score: 0.7496704672665094
Loan Term - Mean Squared Error: 63.86466165413534
Loan Term - R^2 Score: -0.8247112500913605




Logistic Regression Loan Amount Score: (0.0150677025527192, 0.00624843562161622)
Logistic Regression Loan Term Score: (0.13842508324084352, 0.011072319403108506)


In [48]:
loan_approval_df = pd.read_csv('loan_approval_dataset.csv')  # Load the loan approval datase

In [50]:
loan_approval_df = loan_approval_df[loan_approval_df[' loan_status'].str.contains('Approved')]
display(loan_approval_df)
print(loan_approval_df.columns)
categorical_features = [' education', ' self_employed']
numerical_features = [' no_of_dependents', ' income_annum', ' cibil_score',
                      ' residential_assets_value', ' commercial_assets_value',
                      ' luxury_assets_value', ' bank_asset_value']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)])
#loan_approval_df = loan_approval_df.drop(' loan_status', axis=1)

# Define features and target variables
X = loan_approval_df.drop(columns=['loan_id', ' loan_amount', ' loan_term', ' loan_status'])
y_loan_amount = loan_approval_df[' loan_amount']
y_loan_term = loan_approval_df[' loan_term']

# Apply log transformation to loan amount
y_loan_amount_log = np.log1p(y_loan_amount)

# Split the data into training and testing sets
X_train, X_test, y_train_amount, y_test_amount = train_test_split(X, y_loan_amount_log, test_size=0.2, random_state=42)
_, _, y_train_term, y_test_term = train_test_split(X, y_loan_term, test_size=0.2, random_state=42)

# Pipeline and Grid Search for Loan Amount with log transformation
pipeline_amount = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())])

param_grid_amount = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30]
}

grid_search_amount = GridSearchCV(pipeline_amount, param_grid_amount, cv=5, scoring='neg_mean_squared_error')
grid_search_amount.fit(X_train, y_train_amount)

best_model_amount = grid_search_amount.best_estimator_

# Predict and evaluate for loan amount
y_pred_amount = best_model_amount.predict(X_test)
mse_amount = mean_squared_error(np.expm1(y_test_amount), np.expm1(y_pred_amount))  # Reverse log transformation
r2_amount = r2_score(np.expm1(y_test_amount), np.expm1(y_pred_amount))
print(f'Loan Amount - Mean Squared Error: {mse_amount}')
print(f'Loan Amount - R^2 Score: {r2_amount}')

# Pipeline and Grid Search for Loan Term
pipeline_term = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())])

param_grid_term = {
    'regressor__alpha': [0.1, 1.0, 10.0]
}

grid_search_term = GridSearchCV(pipeline_term, param_grid_term, cv=5, scoring='neg_mean_squared_error')
grid_search_term.fit(X_train, y_train_term)

best_model_term = grid_search_term.best_estimator_

# Predict and evaluate for loan term
y_pred_term = best_model_term.predict(X_test)
mse_term = mean_squared_error(y_test_term, y_pred_term)
r2_term = r2_score(y_test_term, y_pred_term)
print(f'Loan Term - Mean Squared Error: {mse_term}')
print(f'Loan Term - R^2 Score: {r2_term}')

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
6,7,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
8,9,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
10,11,4,Graduate,Yes,2900000,11200000,2,547,8100000,4700000,9500000,3100000,Approved
13,14,2,Graduate,Yes,9100000,31500000,14,679,10800000,16600000,20900000,5000000,Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,4262,3,Graduate,Yes,3000000,7500000,6,881,1400000,4500000,6100000,2300000,Approved
4263,4264,3,Graduate,No,5000000,12700000,14,865,4700000,8100000,19500000,6300000,Approved
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')
Loan Amount - Mean Squared Error: 11043441887968.523
Loan Amount - R^2 Score: 0.8715156751225672
Loan Term - Mean Squared Error: 33.29259891011624
Loan Term - R^2 Score: 0.04877943133775531


In [64]:
sample_input = pd.DataFrame({
    ' no_of_dependents': [2],
    ' education': [0],
    ' self_employed': [0],
    ' income_annum': [960000],
    ' cibil_score': [778],
    ' residential_assets_value': [2400],
    ' commercial_assets_value': [1760090],
    ' luxury_assets_value': [227000],
    ' bank_asset_value': [80000]
})

predicted_loan_amount = model_amount.predict(sample_input)
predicted_loan_term = model_term.predict(sample_input)

print(f'Predicted Max Loan Amount: {predicted_loan_amount[0]}')
print(f'Predicted Loan Term: {predicted_loan_term[0]}')

Predicted Max Loan Amount: 34100000
Predicted Loan Term: 2




In [26]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# Encode categorical variables using OneHotEncoding
categorical_features = ['education', 'self_employed']
numerical_features = ['no_of_dependents', 'income_annum', 'cibil_score',
                      'residential_assets_value', 'commercial_assets_value',
                      'luxury_assets_value', 'bank_asset_value']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Define features and target variables
X = loan_approval_df.drop(columns=['loan_id', 'loan_amount', 'loan_term', 'loan_status'])
y_loan_amount = loan_approval_df['loan_amount']
y_loan_term = loan_approval_df['loan_term']

# Split the data into training and testing sets
X_train, X_test, y_train_amount, y_test_amount = train_test_split(X, y_loan_amount, test_size=0.2, random_state=42)
_, _, y_train_term, y_test_term = train_test_split(X, y_loan_term, test_size=0.2, random_state=42)

# Train and evaluate multiple models for loan term prediction
models = {
    'Ridge': Ridge(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'SVR': SVR()
}

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)])

    pipeline.fit(X_train, y_train_term)
    y_pred = pipeline.predict(X_test)

    mse = mean_squared_error(y_test_term, y_pred)
    r2 = r2_score(y_test_term, y_pred)

    print(f'{name} - Loan Term - Mean Squared Error: {mse}')
    print(f'{name} - Loan Term - R^2 Score: {r2}')

# Further improvement with polynomial features for the best model (example with Ridge)
best_model = Ridge()

pipeline_poly = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('regressor', best_model)])

param_grid_poly = {
    'regressor__alpha': [0.1, 1.0, 10.0],
    'poly__degree': [1, 2, 3]
}

grid_search_poly = GridSearchCV(pipeline_poly, param_grid_poly, cv=5, scoring='neg_mean_squared_error')
grid_search_poly.fit(X_train, y_train_term)

best_model_poly = grid_search_poly.best_estimator_

# Predict and evaluate for loan term with polynomial features
y_pred_term_poly = best_model_poly.predict(X_test)
mse_term_poly = mean_squared_error(y_test_term, y_pred_term_poly)
r2_term_poly = r2_score(y_test_term, y_pred_term_poly)
print(f'Polynomial Features - Loan Term - Mean Squared Error: {mse_term_poly}')
print(f'Polynomial Features - Loan Term - R^2 Score: {r2_term_poly}')

Ridge - Loan Term - Mean Squared Error: 32.010382702153635
Ridge - Loan Term - R^2 Score: 0.000978910518924292
RandomForest - Loan Term - Mean Squared Error: 33.85511803278688
RandomForest - Loan Term - R^2 Score: -0.056593956289989755
GradientBoosting - Loan Term - Mean Squared Error: 32.434958506067574
GradientBoosting - Loan Term - R^2 Score: -0.012271795857818457
SVR - Loan Term - Mean Squared Error: 32.902794837492216
SVR - Loan Term - R^2 Score: -0.026872632276035757
Polynomial Features - Loan Term - Mean Squared Error: 32.01044952169503
Polynomial Features - Loan Term - R^2 Score: 0.0009768251289536956
