In [None]:
##**This notebook has two output csv files as of 6-02-25.  One for handling the imbalanced data with SMOTE and one without SMOTE**
## Reason for this is because I want to test without smote and using class weights penalties with Keras so see if I get
## better results.

import pandas as pd

files = ['input.csv', 'output.csv', 'loan_data.csv']
for fn in files:
    path = f'../data/{fn}'     # note the ../ to go up one directory
    df = pd.read_csv(path)
    print(f"{fn}: {df.shape}")
    display(df.head())





In [None]:
# 1. Inspect structure & missing values
df = pd.read_csv('../data/loan_data.csv')
df.info()

# 2. Quick summary stats
df.describe()

# 3. Examine the target balance
print(df['not.fully.paid'].value_counts(normalize=True))


In [None]:
df.describe().T


In [None]:
df['purpose'].value_counts()

In [None]:
# these unique values for purpose all have a good amount in them so no need to group and I will just need to one-hot encode them


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Create engineered columns
df['revol_bal_log']     = np.log1p(df['revol.bal'])
df['cr_line_years']     = df['days.with.cr.line'] / 365
df['cr_line_years_log'] = np.log1p(df['cr_line_years'])
df['pub_rec_flag']      = (df['pub.rec'] > 0).astype(int)
df['inquiry_rate']      = df['inq.last.6mths'] / df['cr_line_years']

# 2. Plot distributions before/after and of new features
fig, axes = plt.subplots(3, 2, figsize=(12, 12))

# Raw vs log revol.bal
sns.histplot(df['revol.bal'], ax=axes[0,0], kde=True)
axes[0,0].set_title('revol.bal (raw)')
sns.histplot(df['revol_bal_log'], ax=axes[0,1], kde=True)
axes[0,1].set_title('revol_bal_log (log1p)')

# Raw vs log credit-line years
sns.histplot(df['cr_line_years'], ax=axes[1,0], kde=True)
axes[1,0].set_title('cr_line_years (raw)')
sns.histplot(df['cr_line_years_log'], ax=axes[1,1], kde=True)
axes[1,1].set_title('cr_line_years_log (log1p)')

# New engineered features
sns.histplot(df['pub_rec_flag'], ax=axes[2,0], kde=False)
axes[2,0].set_title('pub_rec_flag')
sns.histplot(df['inquiry_rate'], ax=axes[2,1], kde=True)
axes[2,1].set_title('inquiry_rate')

plt.tight_layout()
plt.show()



In [None]:
print("Raw revol.bal skew:", df['revol.bal'].skew())
print("Log revol.bal skew:", df['revol_bal_log'].skew())
print("Raw cr_line_years skew:", df['cr_line_years'].skew())
print("Log cr_line_years skew:", df['cr_line_years_log'].skew())
# huge positive skews (>1) dropped to small negative values (near symmetric)

In [None]:
from sklearn.preprocessing import StandardScaler

# List the numeric columns to scale
num_cols = [
    'revol_bal_log',
    'cr_line_years_log',
    'int.rate',
    'installment',
    'log.annual.inc',
    'dti',
    'fico',
    'revol.util',
    'inq.last.6mths',
    'delinq.2yrs',
    'pub.rec',
    'inquiry_rate'
]

# Initialize scaler and fit_transform on the entire df for now
# (In practice, we'll do train/test split first—but for EDA/demo we can scale everything)
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Quick sanity check: all means ~0, stds ~1
df[num_cols].describe().loc[['mean','std']].T



In [None]:
# One-hot encode purpose, drop the first category to avoid multicollinearity
df = pd.get_dummies(df, columns=['purpose'], drop_first=True)

# Check new columns
[p for p in df.columns if p.startswith('purpose_')]


In [None]:
# save a csv of the transformed and clean data ready for the model
df.to_csv('../data/loan_data_ready.csv', index=False)

In [1]:
# **** Do-over this time transforming the csv differently to see if I get different results with same model *****
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 1. Reload true raw data (unmodified loan_data.csv)
df_raw = pd.read_csv('../data/loan_data.csv')

# 2. Engineer features (except log‐transforms)
df_raw['cr_line_years'] = df_raw['days.with.cr.line'] / 365
df_raw['pub_rec_flag']  = (df_raw['pub.rec'] > 0).astype(int)
df_raw['inquiry_rate']  = df_raw['inq.last.6mths'] / df_raw['cr_line_years']

# 3. One‐hot encode 'purpose'
df_raw = pd.get_dummies(df_raw, columns=['purpose'], drop_first=True)

# 4. Drop the raw skewed columns (we want to keep them un‐logged but scaled)

df_raw = df_raw.drop(columns=['revol.bal', 'days.with.cr.line'])

# 5. Standard‐scale numeric features (no log columns here):
num_cols_raw = [
    'int.rate',
    'installment',
    'log.annual.inc',
    'dti',
    'fico',
    'revol.util',
    'inq.last.6mths',
    'delinq.2yrs',
    'pub.rec',
    'cr_line_years',
    'inquiry_rate'
]



scaler = StandardScaler().fit(df_raw[num_cols_raw])
df_raw[num_cols_raw] = scaler.transform(df_raw[num_cols_raw])

# 6. Save as a new CSV variant
df_raw.to_csv('../data/loan_data_ready_raw.csv', index=False)

# 7. Quick verification
print("Columns in df_raw:", df_raw.columns.tolist())
print("Sample scaled stats:\n", df_raw[num_cols_raw].describe().loc[['mean','std']].T)



Columns in df_raw: ['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid', 'cr_line_years', 'pub_rec_flag', 'inquiry_rate', 'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_educational', 'purpose_home_improvement', 'purpose_major_purchase', 'purpose_small_business']
Sample scaled stats:
                         mean       std
int.rate       -4.747832e-17  1.000052
installment    -4.896202e-17  1.000052
log.annual.inc  1.348681e-15  1.000052
dti            -7.121748e-17  1.000052
fico            2.848699e-16  1.000052
revol.util      4.154353e-17  1.000052
inq.last.6mths  2.373916e-17  1.000052
delinq.2yrs     1.186958e-17  1.000052
pub.rec         5.638051e-17  1.000052
cr_line_years  -6.528269e-17  1.000052
inquiry_rate    0.000000e+00  1.000052
