# Data preparation notebook

In this notebook we prepare data based on knowledge gained from exploratory analysis (for details see notebook exploratory_analysis)



In [1]:
import pandas as pd 
import numpy as np
import random 

random.seed(42) #in case we will use random somewhere

data = pd.read_csv("./data/raw/credit_risk_dataset.csv")

In [2]:
data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


## Missing values



## Feature engineering 

Not sure if we want it, but maybe we will come up with something interesting

## Drop irrelevant columns (if any)

## One-Hot-Encode categorical data

hint: use pd.get_dummies() for categorical columns

## Data standardization

In [4]:
import warnings # for muting warning messages
# mute warning messages
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def preprocess_column(data, column_name, quantile=0.99, feature_range=(0, 1)):
   """
   Preprocess a column by removing outliers, standardizing, and min-max scaling.
  
   Parameters:
   data (pd.DataFrame): The input dataframe.
   column_name (str): The name of the column to preprocess.
   quantile (float): The quantile threshold for outlier removal.
   feature_range (tuple): The desired range of transformed data.


   Returns:
   pd.DataFrame: The dataframe with the processed column.
   """
   # Remove outliers
   column_no_outliers = data[column_name][data[column_name] < data[column_name].quantile(quantile)]
  
   # Reshape the data to fit the scaler
   column_no_outliers = column_no_outliers.values.reshape(-1, 1)
  
   # Initialize the scalers
   scaler = StandardScaler()
   min_max_scaler = MinMaxScaler(feature_range=feature_range)
  
   # Fit and transform the data with StandardScaler
   column_standardized = scaler.fit_transform(column_no_outliers)
  
   # Fit and transform the data with MinMaxScaler
   column_scaled = min_max_scaler.fit_transform(column_standardized)
  
   # Add the scaled data back to the original dataset
   data.loc[data[column_name] < data[column_name].quantile(quantile), column_name] = column_scaled.flatten()
  
   return data


# Columns to preprocess
columns_to_preprocess = ['person_income', 'person_age', 'loan_amnt', 'loan_percent_income', 'cb_person_cred_hist_length']


# Apply preprocessing to each column
for column in columns_to_preprocess:
   data = preprocess_column(data, column)


data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,0.850266,3992.614,4.789686,332.390868,11.011695,0.218164,0.387432,0.664987
std,5.909455,55954.55,4.14263,3306.362486,3.240459,0.413006,0.227869,2.892301
min,0.0,0.0,0.0,0.0,5.42,0.0,0.0,0.0
25%,0.103448,0.1561086,2.0,0.15411,7.9,0.0,0.209302,0.071429
50%,0.206897,0.2307692,4.0,0.256849,10.99,0.0,0.348837,0.142857
75%,0.344828,0.3402715,7.0,0.400685,13.47,0.0,0.534884,0.428571
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,1.0,30.0


## Save processed data 

In [None]:
#assuming final dataframe will be called data_final

data_final.to_csv('./data/processed/processed_credit_risk_dataset.csv') 

data_final.to_sql('./data/sql/processed_credit_risk_dataset.sql') 