In [15]:
import pandas as pd
import numpy as np

In [16]:
# Step 1 — Load a Sample Dataset directly
# --------------------------
print("Loading dataset sample...")

# Read file while skipping rows randomly (to sample ~5%)
skip = lambda i: i > 0 and np.random.rand() > 0.05
data = pd.read_csv('../data/raw_loan_data.csv', skiprows=skip, low_memory=False)

print("Sample dataset loaded.")
print("Shape:", data.shape)
print(data.head())

Loading dataset sample...
Sample dataset loaded.
Shape: (112998, 151)
         id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  68466926        NaN    10000.0      10000.0          10000.0   36 months   
1  68338832        NaN     1400.0       1400.0           1400.0   36 months   
2  68466916        NaN    25000.0      25000.0          25000.0   36 months   
3  68476676        NaN    20000.0      20000.0          20000.0   36 months   
4  68617034        NaN    14650.0      14650.0          14650.0   60 months   

   int_rate  installment grade sub_grade  ... hardship_payoff_balance_amount  \
0      6.49       306.45     A        A2  ...                            NaN   
1     12.88        47.10     C        C2  ...                            NaN   
2      7.49       777.55     A        A4  ...                            NaN   
3     15.77       700.88     D        D1  ...                            NaN   
4     20.50       392.23     E        E4  ...          

In [9]:
# Step 2 — Initial Exploration
# --------------------------
print("\nDataset Info:")
print(data.info())

print("\nMissing Values (%):")
missing_percentage = data.isnull().mean() * 100
print(missing_percentage[missing_percentage > 0].sort_values(ascending=False))



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112665 entries, 0 to 112664
Columns: 151 entries, id to settlement_term
dtypes: float64(113), int64(1), object(37)
memory usage: 129.8+ MB
None

Missing Values (%):
member_id                                     100.000000
orig_projected_additional_accrued_interest     99.606799
hardship_loan_status                           99.516265
hardship_start_date                            99.516265
hardship_end_date                              99.516265
                                                 ...    
inq_last_6mths                                  0.000888
earliest_cr_line                                0.000888
open_acc                                        0.000888
pub_rec                                         0.000888
total_acc                                       0.000888
Length: 111, dtype: float64


In [17]:
# Step 3 — Cleaning
# --------------------------
print("\nCleaning dataset...")

# Drop rows with missing target
data = data.dropna(subset=['loan_status'])

# Fill numeric NaNs
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Fill categorical NaNs
cat_cols = data.select_dtypes(include=['object']).columns
for col in cat_cols:
    data[col] = data[col].fillna(data[col].mode()[0])



Cleaning dataset...


In [18]:
# Step 4 — Encoding
# --------------------------
print("\nEncoding categorical columns...")
cat_cols = data.select_dtypes(include=['object']).columns

# Encode only a few categorical columns to prevent memory overflow
for col in cat_cols:
    if data[col].nunique() <= 10:  # encode only if categories ≤ 10
        data = pd.get_dummies(data, columns=[col], drop_first=True)
    else:
        data[col] = data[col].astype('category').cat.codes

print("Encoded dataset shape:", data.shape)


Encoding categorical columns...
Encoded dataset shape: (112996, 180)


In [19]:
# Step 5 — Save Cleaned Dataset
# --------------------------
clean_path = '../data/cleaned_loan_data.csv'
data.to_csv(clean_path, index=False)
print(f"Cleaned dataset saved to {clean_path}")


Cleaned dataset saved to ../data/cleaned_loan_data.csv


In [20]:
# Step 6 — Final Check
# --------------------------
print("\nFinal dataset info:")
print(data.info())

print("\nFirst 5 rows of cleaned data:")
print(data.head())


Final dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 112996 entries, 0 to 112997
Columns: 180 entries, id to settlement_status_COMPLETE
dtypes: bool(46), float64(113), int16(7), int32(3), int8(11)
memory usage: 107.2 MB
None

First 5 rows of cleaned data:
      id  member_id  loan_amnt  funded_amnt  funded_amnt_inv  int_rate  \
0  84534        NaN    10000.0      10000.0          10000.0      6.49   
1  84253        NaN     1400.0       1400.0           1400.0     12.88   
2  84533        NaN    25000.0      25000.0          25000.0      7.49   
3  84552        NaN    20000.0      20000.0          20000.0     15.77   
4  84860        NaN    14650.0      14650.0          14650.0     20.50   

   installment  sub_grade  emp_title  emp_length  ...  hardship_status_BROKEN  \
0       306.45          1      30357           6  ...                   False   
1        47.10         11      20451           3  ...                   False   
2       777.55          3      30896       