# Loan Default Risk Analysis
EDA Case Study

## Business Problem
- Identify key drivers of loan default.
- Minimize credit loss by denying or adjusting risky loans.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Read the loan dataset from disk into a DataFrame.
# low_memory=False has pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type inference within a column.
loan_df = pd.read_csv(
    '/content/sample_data/loan.csv',
    low_memory=False,
)
loan_df.shape  # (rows, columns) as loaded

## Data Cleaning
- Drop columns where more than 80% of the values are missing
- Drop ID columns

In [None]:
# Drop columns with >80% missing
def drop_high_na(df: pd.DataFrame, threshold: float = 0.8) -> pd.DataFrame:
    """Return ``df`` without the columns whose share of missing values exceeds ``threshold``.

    Args:
        df: Frame to filter. It is not modified; a new selection is returned.
        threshold: Largest fraction of null values (between 0 and 1) a column
            may have and still be kept.

    Returns:
        The columns of ``df`` whose null fraction is at most ``threshold``.
    """
    # Per-column fraction of nulls: the mean of the boolean null mask.
    na_share = df.isnull().mean()
    # `<=` keeps a column that sits exactly on the threshold, so only columns
    # with strictly more than `threshold` missing are dropped.
    return df.loc[:, na_share <= threshold]

# Apply the missing-value filter, then remove the identifier columns.
# The result of drop() is reassigned instead of using inplace=True, so the
# frame returned by drop_high_na (a .loc selection) is never mutated in place.
# errors='ignore' lets the cell be re-run after 'id'/'member_id' are gone.
loan_df = drop_high_na(loan_df).drop(columns=['id', 'member_id'], errors='ignore')
loan_df.shape

## Univariate Analysis
- Distribution of target variable
- Distribution of key features

In [None]:
# Bar chart of the relative frequency of each loan status.
loan_df['loan_status'].value_counts(normalize=True).plot(
    kind='bar',
    title='Loan Status Distribution',
)
plt.show()

## Derived Target Variable: is_default

In [None]:
# Binary target: 1 where loan_status is exactly 'Charged Off', 0 otherwise.
# The comparison is vectorised over the whole column instead of calling a
# Python lambda once per row; the result is cast to int64 to give 0/1 values.
# NOTE(review): every status other than 'Charged Off' (and any missing status)
# maps to 0 -- confirm that is intended for loans that are not yet closed.
loan_df['is_default'] = (loan_df['loan_status'] == 'Charged Off').astype('int64')
loan_df['is_default'].value_counts()

## Bivariate Analysis
- Compare interest rate, loan amount, grade by default status

In [None]:
# Box plot of int_rate for each value of is_default (0 vs 1).
# NOTE(review): assumes int_rate is already numeric -- confirm against the CSV.
sns.boxplot(
    data=loan_df,
    x='is_default',
    y='int_rate',
).set_title('Interest Rate vs Default')
plt.show()

## Conclusions
- Higher interest rates are associated with higher default rates (an observed association, not proof of causation)
- Certain purposes (e.g., small_business) have higher default rates
- Grade and sub_grade are strong predictors