## Import Libraries

In [None]:
# Main libraries for data manipulation
import pandas as pd
import numpy as np

# Visualization libraries
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder

import joblib

## Load and Explore Data

In [None]:
loan_data = pd.read_csv('../data/Loan_Eligibility_Prediction.csv')

# Quick look at the data: first rows, schema, summary statistics, missing values
display(loan_data.head())
print("=" * 40)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
loan_data.info()
print("=" * 40)
display(loan_data.describe(include='all'))
print("=" * 40)
display(loan_data.isnull().sum())

## Data Visualization
### Categorical Features Distribution

In [None]:
# Each panel shows how many rows fall into each category of one feature.
# Loan_Status is not used here, so the panels are labelled as distributions.
# Tuple layout: (column, display label, bar colour, subplot row, subplot col)
categorical_panels = [
    ('Gender', 'Gender', 'indianred', 1, 1),
    ('Married', 'Married', 'lightsalmon', 1, 2),
    ('Credit_History', 'Credit History', 'lightseagreen', 2, 1),
    ('Property_Area', 'Property Area', 'mediumpurple', 2, 2),
]

fig = sp.make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[f'{label} Distribution' for _, label, *_ in categorical_panels],
)

for column, label, bar_color, grid_row, grid_col in categorical_panels:
    # Count once and reuse the result for both axes
    counts = loan_data[column].value_counts()
    fig.add_trace(
        go.Bar(
            x=counts.index,
            y=counts.values,
            name=f'{label} Distribution',
            marker_color=bar_color,
        ),
        row=grid_row, col=grid_col,
    )

fig.update_layout(height=900, width=900, title_text="Categorical Features Distribution")
fig.show()

### Gender Impact on Loan Approval

In [None]:
# Grouped histogram of Gender, with bars split and coloured by Loan_Status
fig = px.histogram(
    loan_data,
    x='Gender',
    color='Loan_Status',
    barmode='group',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='Gender vs Loan Status',
)
fig.show()

## Key Observations

- Male applicants submit more applications than female applicants.
- Approval is strongly associated with credit history; applicants with positive credit history are approved at substantially higher rates.
- Married applicants have higher approval rates than single applicants.
- Applicants from semiurban areas receive approvals more frequently than those from rural areas.
- Credit history and income are the primary determinants of loan approval; demographic attributes are secondary.

## Model Creation, Prediction and Comparisons
### Data Encoding

In [None]:
def encode_data(df, encoders=None):
    """
    Clean a loan DataFrame in place and label-encode its categorical columns.

    Rows containing missing values are dropped, the 'Customer_ID' column is
    removed when present, and every categorical column that is present is
    replaced by the integer codes produced by a LabelEncoder.

    Parameters:
    df: DataFrame to clean and encode. It is modified in place; pass a copy
        to keep the original intact.
    encoders: Optional dict mapping column name -> fitted LabelEncoder.
        Columns found in the dict are encoded with the stored encoder
        (transform only); columns not found are fitted on `df` and the new
        encoder is added to the dict. Passing the same dict for training and
        prediction data keeps the integer codes consistent between the two.
        When None (default), every column is fitted on `df` and the encoders
        are discarded.

    Returns:
    df: The same DataFrame object, cleaned and encoded.
    """
    df.dropna(inplace=True)
    # Tolerate frames that do not carry the identifier column
    df.drop(columns=['Customer_ID'], inplace=True, errors='ignore')

    # Encoding categorical variables
    le_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
    for col in le_cols:
        if col not in df.columns:
            # e.g. unlabeled inputs have no 'Loan_Status' column
            continue
        if encoders is not None and col in encoders:
            # Reuse the mapping learned earlier instead of refitting
            df[col] = encoders[col].transform(df[col])
        else:
            # One encoder per column so each fitted mapping can be kept
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            if encoders is not None:
                encoders[col] = le

    return df

# encode_data modifies its argument in place, so hand it a copy and leave
# the raw `loan_data` frame untouched
loan_enc = encode_data(df=loan_data.copy())
loan_enc.head()

### Feature Engineering

In [None]:
# Combined income of applicant and co-applicant as an extra feature
loan_enc['Total_Income'] = loan_enc['Applicant_Income'].add(loan_enc['Coapplicant_Income'])

### Train Test Split

In [None]:
# Separate the target column from the predictor columns
y = loan_enc['Loan_Status']
X = loan_enc.drop('Loan_Status', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=67, test_size=0.2,
)

### Data Scaling

In [None]:
# Independent copies of each split; the Logistic Regression cell scales these
# and leaves the originals unscaled for the tree-based models
X_scale_train, X_scale_test = X_train.copy(), X_test.copy()
y_scale_train, y_scale_test = y_train.copy(), y_test.copy()

### Model 1: Logistic Regression

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler().fit(X_scale_train)
X_scale_train = scaler.transform(X_scale_train)
X_scale_test = scaler.transform(X_scale_test)

log_reg = LogisticRegression(C=1, random_state=67).fit(X_scale_train, y_scale_train)
y_pred = log_reg.predict(X_scale_test)

# Scoreboard with one list per metric; the later model cells append to it
scoreboard = {metric: [] for metric in ('Model', 'Precision', 'Recall', 'F1 Score')}

# Precision and recall are read from the report entry for class label '1'
report = classification_report(y_scale_test, y_pred, output_dict=True)
scoreboard['Model'].append('Logistic Regression (C=1)')
scoreboard['Precision'].append(report['1']['precision'])
scoreboard['Recall'].append(report['1']['recall'])
scoreboard['F1 Score'].append(f1_score(y_scale_test, y_pred))

### Model 2: Decision Tree Classifier

In [None]:
# Depth-limited decision tree, trained on the unscaled split
dt_clf = DecisionTreeClassifier(random_state=67, max_depth=4).fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

# Record this model's metrics for class label '1' on the shared scoreboard
report = classification_report(y_test, y_pred, output_dict=True)
scoreboard['Model'].append('Decision Tree Classifier (max_depth=4)')
scoreboard['Precision'].append(report['1']['precision'])
scoreboard['Recall'].append(report['1']['recall'])
scoreboard['F1 Score'].append(f1_score(y_test, y_pred))

### Model 3: Random Forest Classifier

In [None]:
# Forest of 100 depth-limited trees, trained on the unscaled split
rf_clf = RandomForestClassifier(
    random_state=67,
    max_depth=4,
    n_estimators=100,
).fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

# Record this model's metrics for class label '1' on the shared scoreboard
report = classification_report(y_test, y_pred, output_dict=True)
scoreboard['Model'].append('Random Forest Classifier (n_estimators=100, max_depth=4)')
scoreboard['Precision'].append(report['1']['precision'])
scoreboard['Recall'].append(report['1']['recall'])
scoreboard['F1 Score'].append(f1_score(y_test, y_pred))

### Model Comparison

In [None]:
# Tabulate the collected metrics, one row per model
scoreboard_df = pd.DataFrame(scoreboard)
display(scoreboard_df)

# Compare models: the three metrics as grouped bars for each model
px.bar(
    scoreboard_df,
    x='Model',
    y=['Precision', 'Recall', 'F1 Score'],
    title='Model Comparison: Precision, Recall and F1 Score',
    text_auto=True,
    barmode='group',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Pastel,
).show()

### Best Model Selection

Best models: Logistic Regression and Random Forest

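For ease of use, we'll use the Random Forest Classifier as our final model: it was trained on the unscaled features, so no fitted scaler has to be saved and applied at prediction time.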

## Save Model

In [None]:
from pathlib import Path

final_model = rf_clf

# Save the model
model_path = Path('../models/loan_eligibility_rf.joblib')
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, model_path)
print("Model saved successfully!")

## Prediction Function

In [None]:
def predict_loan_eligibility(input_data, model=final_model):
    """
    Predict loan eligibility using the trained model.

    The input is copied before preprocessing, so the caller's DataFrame is
    left unmodified.

    Parameters:
    input_data: DataFrame of applications in the raw schema that encode_data
        accepts; must include 'Applicant_Income' and 'Coapplicant_Income'.
        A 'Loan_Status' column, if present, is ignored for prediction.
    model: Trained machine learning model (default is Random Forest Classifier)

    Returns:
    predictions: Array of predicted loan eligibility, one entry per row that
        remains after encode_data has dropped rows with missing values.
    """
    # Work on a copy: encode_data cleans and encodes its argument in place
    features = encode_data(input_data.copy())
    features['Total_Income'] = features['Applicant_Income'] + features['Coapplicant_Income']

    # The target is not a model input; drop it when labeled rows are supplied
    features = features.drop(columns=['Loan_Status'], errors='ignore')

    # Present the columns in the order recorded by the model at fit time, when
    # available (scikit-learn sets feature_names_in_ for estimators fitted on
    # a DataFrame with string column names)
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is not None:
        features = features[list(fitted_columns)]

    # NOTE(review): encode_data derives the categorical codes from the frame
    # it is given; verify they match the codes used for training (they can
    # differ when a category value is missing from `input_data`).
    predictions = model.predict(features)
    return predictions