# Exploratory Data Analysis (EDA)

In [1]:
import pandas as pd

df = pd.read_csv('data/loan_approval_dataset.csv')
df.head(5)


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [23]:
df.shape
df.isnull().sum()


loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [21]:
print(df.dtypes)

loan_id                      int64
no_of_dependents             int64
education                   object
self_employed               object
income_annum                 int64
loan_amount                  int64
loan_term                    int64
cibil_score                  int64
residential_assets_value     int64
commercial_assets_value      int64
luxury_assets_value          int64
bank_asset_value             int64
loan_status                 object
dtype: object


In [25]:
class_distribution = df['loan_status'].value_counts()
print("Class distribution of Loan Status:")
print(class_distribution)


Class distribution of Loan Status:
loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64


In [27]:
numeric_features = df[['income_annum', 'loan_amount']]
summary_stats = numeric_features.describe()

def format_number(x):
    return "{:,.2f}".format(x)

formatted_summary = summary_stats.applymap(format_number)
print(formatted_summary)

       income_annum    loan_amount
count      4,269.00       4,269.00
mean   5,059,123.92  15,133,450.46
std    2,806,839.83   9,043,362.98
min      200,000.00     300,000.00
25%    2,700,000.00   7,700,000.00
50%    5,100,000.00  14,500,000.00
75%    7,500,000.00  21,500,000.00
max    9,900,000.00  39,500,000.00


  formatted_summary = summary_stats.applymap(format_number)


In [28]:
grouped = df.groupby('loan_status')['cibil_score'].describe()

print("Summary statistics of CIBIL Score by Loan Status:")
print(grouped)


Summary statistics of CIBIL Score by Loan Status:
              count        mean         std    min    25%    50%    75%    max
loan_status                                                                   
Approved     2656.0  703.461973  125.249016  300.0  618.0  711.0  803.0  900.0
Rejected     1613.0  429.468072   78.401752  300.0  364.0  429.0  493.0  885.0


In [29]:
import pandas as pd
from scipy.stats import chi2_contingency

categorical_features = ['education', 'self_employed']

for feature in categorical_features:
    print(f"Analyzing feature: '{feature.strip()}'")

    # Crosstab of feature vs loan_status
    cross_tab = pd.crosstab(df[ feature], df['loan_status'])
    print(cross_tab)

    # Chi-Square test for independence
    chi2, p, dof, expected = chi2_contingency(cross_tab)
    print(f"Chi2 statistic: {chi2:.2f}, p-value: {p:.4f}")
    
    if p < 0.05:
        print(f"Significant relationship detected between {feature.strip()} and loan_status.\n")
    else:
        print(f"No significant relationship between {feature.strip()} and loan_status.\n")


Analyzing feature: 'education'
loan_status   Approved  Rejected
education                       
Graduate          1339       805
Not Graduate      1317       808
Chi2 statistic: 0.08, p-value: 0.7720
No significant relationship between education and loan_status.

Analyzing feature: 'self_employed'
loan_status    Approved  Rejected
self_employed                    
No                 1318       801
Yes                1338       812
Chi2 statistic: 0.00, p-value: 1.0000
No significant relationship between self_employed and loan_status.



# Machine Learning:

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import joblib

# Strip spaces in column names and trim string data
df.columns = df.columns.str.strip()
df['loan_status'] = df['loan_status'].str.strip()


In [None]:
# Defining features and target
X = df.drop('loan_status', axis=1)
y = df['loan_status']

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

#  imputing missing values and scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: impute missing and one hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def train_and_evaluate(model, X_train, X_test, y_train, y_test, preprocessor):
    # pipeline: preprocess data
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    # Train
    clf.fit(X_train, y_train)
    
    # Predict
    y_pred = clf.predict(X_test)
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label='Approved')  
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"{model.__class__.__name__} Evaluation:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print()
    
    return clf, f1  # model pipeline and F1 score for comparison

log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42)

# Train and evaluate Logistic Regression
log_reg_model, log_reg_f1 = train_and_evaluate(log_reg, X_train, X_test, y_train, y_test, preprocessor)

# Train and evaluate Random Forest
rf_model, rf_f1 = train_and_evaluate(rf_clf, X_train, X_test, y_train, y_test, preprocessor)

# saving trained model
joblib.dump('model.pkl')


LogisticRegression Evaluation:
Accuracy: 0.9075
F1-score: 0.9268
Confusion Matrix:
[[500  36]
 [ 43 275]]

RandomForestClassifier Evaluation:
Accuracy: 0.9789
F1-score: 0.9832
Confusion Matrix:
[[527   9]
 [  9 309]]



['model.pkl']