# HDFC LOAN DATASET ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, binom

In [None]:
# Load the cleaned loan dataset from JSON and report its dimensions.
df = pd.read_json('source/hdfc_loan_dataset_cleaned.json')
n_rows, n_cols = df.shape
print(f"Dataset: {n_rows} rows, {n_cols} columns")
# Last expression of the cell: preview the first rows.
df.head()

## Q1

In [None]:
# Q1: central tendency and dispersion for three numeric columns.
cols = ['Applicant_Income', 'Loan_Amount', 'Age']
stat_labels = ['Mean', 'Median', 'Mode', 'Variance', 'Std Dev']

# One column of statistics per variable, in the same row order as stat_labels.
# mode() can return several values; the first one is reported.
stats_df = pd.DataFrame(
    {
        name: [
            df[name].mean(),
            df[name].median(),
            df[name].mode()[0],
            df[name].var(),
            df[name].std(),
        ]
        for name in cols
    },
    index=stat_labels,
)

# Coefficient of variation (%): standard deviation relative to the mean.
std_row = stats_df.loc['Std Dev']
mean_row = stats_df.loc['Mean']
cv = (std_row / mean_row) * 100
print(stats_df)
print("\nCV %:", cv)

## Q2

In [None]:
# Q2: five-number summary (min, quartiles, max) of the CIBIL score.
five_number_labels = ['min', '25%', '50%', '75%', 'max']
cibil_summary = df['CIBIL_Score'].describe(percentiles=[.25, .5, .75])
print(cibil_summary[five_number_labels])

## Q3

In [None]:
# Q3: compare relative variability (coefficient of variation, in %)
# of applicant income and annual household income.
applicant_income = df['Applicant_Income']
household_income = df['Annual_Household_Income']

cv_applicant = (applicant_income.std() / applicant_income.mean()) * 100
cv_household = (household_income.std() / household_income.mean()) * 100

print(f"CV Applicant: {cv_applicant:.2f}%")
print(f"CV Household: {cv_household:.2f}%")

## Q4

In [None]:
# Q4: marginal and joint empirical probabilities of approval and good credit.
total = len(df)  # reused by later cells as the common denominator

is_approved = df['Loan_Status'] == 'Approved'
has_good_credit = df['Credit_History'] == 1

# Each probability is the count of matching rows divided by the row count.
p_approved = int(is_approved.sum()) / total
p_good_credit = int(has_good_credit.sum()) / total
p_both = int((is_approved & has_good_credit).sum()) / total

print(f"P(Approved): {p_approved:.4f}")
print(f"P(Good Credit): {p_good_credit:.4f}")
print(f"P(Both): {p_both:.4f}")

## Q5

In [None]:
# Q5: P(Approved | Good Credit), computed inside the good-credit subset.
good_credit = df[df['Credit_History'] == 1]
approved_within_good = good_credit['Loan_Status'].eq('Approved')
p_approved_given_good = int(approved_within_good.sum()) / len(good_credit)
print(f"P(Approved | Good Credit): {p_approved_given_good:.4f}")

## Q6

In [None]:
# Q6: share of self-employed applicants and of loans above 10 lakh (1,000,000).
# `total` is the row count computed in the Q4 cell.
self_employed_count = int((df['Employment_Status'] == 'Self-Employed').sum())
high_loan_count = int((df['Loan_Amount'] > 1_000_000).sum())

p_self = self_employed_count / total
p_high = high_loan_count / total

print(f"P(Self-Employed): {p_self:.4f}")
print(f"P(Loan > 10L): {p_high:.4f}")

## Q7

In [None]:
# Q7: histogram of loan amounts (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Loan_Amount'], kde=True, bins=30, ax=ax)
ax.set_title('Loan Amount Distribution')
plt.show()

## Q8

In [None]:
# Q8: Shapiro-Wilk test of normality on applicant income.
shapiro_result = stats.shapiro(df['Applicant_Income'])
stat, p_val = shapiro_result
print(f"Statistic: {stat:.4f}, p-value: {p_val:.4e}")

## Q9

In [None]:
# Q9: P(Urban | Approved), computed inside the approved subset.
# `approved` is reused by later cells.
approved = df[df['Loan_Status'] == 'Approved']
urban_approved_count = int((approved['Property_Area'] == 'Urban').sum())
p_urban = urban_approved_count / len(approved)
print(f"P(Urban | Approved): {p_urban:.4f}")

## Q10

In [None]:
# Q10: P(Good Credit | Approved) via Bayes' theorem, checked against the
# direct count. Relies on `total`, `approved` and `p_approved_given_good`
# from earlier cells.
P_GC = int((df['Credit_History'] == 1).sum()) / total      # prior P(Good Credit)
P_A = int((df['Loan_Status'] == 'Approved').sum()) / total  # evidence P(Approved)

# Bayes: P(GC | A) = P(A | GC) * P(GC) / P(A)
likelihood_times_prior = p_approved_given_good * P_GC
bayes = likelihood_times_prior / P_A

# Direct estimate: share of approved rows that have good credit.
direct = int((approved['Credit_History'] == 1).sum()) / len(approved)

print(f"Bayes: {bayes:.4f}")
print(f"Direct: {direct:.4f}")

## Q11

In [None]:
# Q11: pairwise covariance matrix of four numeric variables.
cov_vars = [
    'Applicant_Income',
    'Loan_Amount',
    'CIBIL_Score',
    'Debt_to_Income_Ratio',
]
cov_matrix = df[cov_vars].cov()
print(cov_matrix)

## Q12

In [None]:
# Q12: correlation of loan amount with household income and with CIBIL score
# (Series.corr default method).
loan_amount = df['Loan_Amount']
corr1 = loan_amount.corr(df['Annual_Household_Income'])
corr2 = loan_amount.corr(df['CIBIL_Score'])
print(f"Loan_Amount vs Annual_Household_Income: {corr1:.4f}")
print(f"Loan_Amount vs CIBIL_Score: {corr2:.4f}")

## Q13

In [None]:
# Q13: two-sample t-test of loan amount, graduates vs non-graduates.
# NOTE(review): ttest_ind is called with its defaults, which assume equal
# variances in both groups - confirm that is intended.
education = df['Education']
grad = df.loc[education == 'Graduate', 'Loan_Amount']
non_grad = df.loc[education == 'Not Graduate', 'Loan_Amount']

ttest_result = stats.ttest_ind(grad, non_grad)
t_stat, p_val = ttest_result

print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_val:.4f}")
# H0 is rejected at the 5% significance level.
print(f"Reject H0: {p_val < 0.05}")

## Q14

In [None]:
# Q14: chi-square test of independence between gender and loan status.
contingency = pd.crosstab(index=df['Gender'], columns=df['Loan_Status'])
# Returns the statistic, p-value, degrees of freedom and expected counts.
chi2, p_val, dof, expected = chi2_contingency(contingency)
print(f"Chi2: {chi2:.4f}, p-value: {p_val:.4f}")

## Q15

In [None]:
# Q15: 95% t-based confidence interval for the mean loan amount.
loan_amounts = df['Loan_Amount']
mean_loan = loan_amounts.mean()
se = stats.sem(loan_amounts)  # standard error of the mean
loan_dof = len(df) - 1        # degrees of freedom: row count minus one
ci = stats.t.interval(0.95, loan_dof, loc=mean_loan, scale=se)
ci_low, ci_high = ci[0], ci[1]
print(f"95% CI: ({ci_low:.2f}, {ci_high:.2f})")

## Q16

In [None]:
# Q16: 95% t-based confidence interval for the mean debt-to-income ratio.
dti_values = df['Debt_to_Income_Ratio']
mean_dti = dti_values.mean()
se_dti = stats.sem(dti_values)  # standard error of the mean
dti_dof = len(df) - 1           # degrees of freedom: row count minus one
ci_dti = stats.t.interval(0.95, dti_dof, loc=mean_dti, scale=se_dti)
dti_low, dti_high = ci_dti[0], ci_dti[1]
print(f"95% CI: ({dti_low:.4f}, {dti_high:.4f})")

## Q17

In [None]:
# Q17: how the average of repeated sample means behaves as sample size grows,
# compared with the full-dataset ("population") mean of applicant income.
income = df['Applicant_Income']  # hoisted: the column does not change in the loop
pop_mean = income.mean()
sizes = [10, 50, 100, 500, 1000]
n_repeats = 100  # samples drawn per sample size
means = []

# A seeded generator makes the sampled values, and hence the plot,
# reproducible across notebook runs.
rng = np.random.default_rng(42)

for n in sizes:
    # Series.sample draws without replacement by default, so each n must not
    # exceed the number of rows in the dataset.
    sample_means = [income.sample(n, random_state=rng).mean() for _ in range(n_repeats)]
    means.append(np.mean(sample_means))

plt.plot(sizes, means, 'o-')
# Reference line at the population mean.
plt.axhline(pop_mean, color='r', linestyle='--')
plt.xlabel('Sample Size')
plt.ylabel('Mean')
plt.title('Sample Mean Convergence')
plt.show()

print(f"Population mean: {pop_mean:.2f}")

## Q18

In [None]:
# Q18: treat approval as a Bernoulli variable; its mean is the approval
# rate p and its variance is p * (1 - p). `p` is reused by the next cell.
approved_flag = df['Loan_Status'] == 'Approved'
p = approved_flag.mean()
var = p * (1 - p)
print(f"Mean: {p:.4f}")
print(f"Variance: {var:.4f}")

## Q19

In [None]:
# Q19: probability of at least 30 approvals in 50 independent applications,
# each approved with probability `p` (the approval rate from the Q18 cell).
# P(X >= 30) = P(X > 29), which is the survival function at 29. Using sf()
# instead of 1 - cdf() avoids the precision loss from subtracting two nearly
# equal numbers when the tail probability is small.
prob = binom.sf(29, 50, p)
print(f"P(X >= 30): {prob:.4f}")

## Q20

In [None]:
# Q20: approval rate among applicants with CIBIL score above 750
# and debt-to-income ratio below 0.35.
high_cibil = df['CIBIL_Score'] > 750
low_dti = df['Debt_to_Income_Ratio'] < 0.35
segment = df[high_cibil & low_dti]

segment_approved = segment['Loan_Status'] == 'Approved'
p_approval = segment_approved.mean()

print(f"P(Approved | High CIBIL & Low DTI): {p_approval:.4f}")
print(f"Segment size: {len(segment)}")

## Q21

In [None]:
# Q21: mean loan amount among approved applications
# (`approved` is the subset built in the Q9 cell).
expected_loan_approved = approved['Loan_Amount'].mean()
print(f"E(Loan | Approved): {expected_loan_approved:.2f}")

## Q22

In [None]:
# Q22: relate approval to CIBIL score, debt-to-income ratio and employment
# length, first by correlation with a 0/1 indicator, then by two-sample t-tests.

# Adds a 0/1 approval indicator column to df.
df['Approval'] = df['Loan_Status'].eq('Approved').astype(int)
approval_indicator = df['Approval']

cibil_corr = df['CIBIL_Score'].corr(approval_indicator)
dti_corr = df['Debt_to_Income_Ratio'].corr(approval_indicator)
employment_corr = df['Employment_Length_Years'].corr(approval_indicator)

print(f"CIBIL: {cibil_corr:.4f}")
print(f"DTI: {dti_corr:.4f}")
print(f"Employment: {employment_corr:.4f}")

# Split rows by outcome for the group comparisons.
loan_status = df['Loan_Status']
approved_grp = df[loan_status == 'Approved']
rejected_grp = df[loan_status == 'Rejected']


def _approved_vs_rejected(column):
    """Run an independent two-sample t-test on `column`, approved vs rejected."""
    return stats.ttest_ind(approved_grp[column], rejected_grp[column])


t1, p1 = _approved_vs_rejected('CIBIL_Score')
t2, p2 = _approved_vs_rejected('Debt_to_Income_Ratio')
t3, p3 = _approved_vs_rejected('Employment_Length_Years')

print(f"\nCIBIL p-value: {p1:.4e}")
print(f"DTI p-value: {p2:.4e}")
print(f"Employment p-value: {p3:.4e}")