In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

import os
import sys
# Notebook-wide configuration.
# `seed` is passed as random_state to every train/test split for reproducibility.
# NOTE: despite their names, `input_file` and `output_file` are directory
# prefixes (with a trailing slash), not paths to individual files.
seed = 42
input_file = '../data/raw/'
output_file = '../data/processed/'

In [2]:
# Load the raw credit dataset (CSV) into a pandas DataFrame and report its shape.
# `input_file` is a directory prefix that already ends with '/'.
raw_csv_path = f'{input_file}credit.csv'
df = pd.read_csv(raw_csv_path)
print('Raw Dataset:', df.shape)

Raw Dataset: (1000, 21)


In [3]:
# Count missing values per column; columns with NaNs are excluded from the
# feature list in the next step.
df.isnull().sum()

checking_account_status      0
duration                     0
credit_history               0
purpose                      0
amount                       0
savings_account_status       0
employment_years             0
installment_rate             0
personal_status              0
other_debtors              907
present_residence            0
property                     0
age                          0
other_installment_plans    814
housing                      0
number_of_credits            0
job                          0
people_liable                0
telephone                    0
foreign_worker               0
full_repaid                  0
dtype: int64

In [4]:
# Convert the 'full_repaid' column to boolean type
df['full_repaid'] = df['full_repaid'].astype('bool')

# Define the list of features to be used in the analysis
# Note: Features with NaN values were not included in this selection
features = [
    'amount', 'installment_rate', 'present_residence', 'age',
    'number_of_credits', 'people_liable', 'credit_history',
    'purpose', 'personal_status', 'property', 'job',
    'housing', 'telephone', 'foreign_worker',
    'checking_account_status', 'savings_account_status', 'employment_years'
]

# Keep only the selected features plus the duration and the repayment flag.
# The explicit .copy() makes the subset an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[features + ['duration', 'full_repaid']].copy()

# Ordinal encodings for the categorical columns that have a natural order.
# Each entry maps a column name to its {label: rank} table.
ordinal_rankings = {
    'checking_account_status': {
        'no_account': 0,
        'below_0': -1,
        '0_to_200': 2,
        'above_200': 5
    },
    'employment_years': {
        'unemployed': 0,
        'below_1': 1,
        'between_1_4': 3,
        'between_4_7': 5,
        'above_7': 7
    },
    'savings_account_status': {
        'unknown': 0,
        'below_100': 1,
        'between_100_500': 3,
        'between_500_1000': 5,
        'above_1000': 7
    },
}

for column, rankings in ordinal_rankings.items():
    # Series.map() silently turns labels missing from the table into NaN.
    # Fail loudly instead; this also catches an accidental re-run of this
    # cell on columns that have already been encoded.
    unmapped = sorted(set(df[column].dropna().unique()) - set(rankings), key=str)
    if unmapped:
        raise ValueError(f"Unmapped labels in '{column}': {unmapped}")
    df[column] = df[column].map(rankings)

print('Dataset:', df.shape)
df.head()

Dataset: (1000, 19)


Unnamed: 0,amount,installment_rate,present_residence,age,number_of_credits,people_liable,credit_history,purpose,personal_status,property,job,housing,telephone,foreign_worker,checking_account_status,savings_account_status,employment_years,duration,full_repaid
0,1169,4,4,67,2,1,critical_account,radio_TV,M_single,real_estate,official,own,1,1,-1,0,7,6,True
1,5951,2,2,22,1,1,existing_credit_paid,radio_TV,F_not_single,real_estate,official,own,0,1,2,1,3,48,False
2,2096,2,3,49,1,2,critical_account,education,M_single,real_estate,unskilled,own,0,1,0,1,5,12,True
3,7882,2,4,45,1,2,existing_credit_paid,furniture_equipment,M_single,life_insurance,official,free,0,1,-1,1,5,42,True
4,4870,3,4,53,2,2,delay_in_paying,car_new,M_single,no_property,official,free,0,1,-1,1,3,24,False


In [5]:
# Share of fully repaid (True) vs. not fully repaid (False) loans.
# NOTE(review): presumably full_repaid is the event indicator, so the False
# share would be the censoring rate -- confirm.
df['full_repaid'].value_counts(normalize=True)

full_repaid
True     0.7
False    0.3
Name: proportion, dtype: float64

In [6]:
# Split the dataset into training and testing sets (70/30)
train, test = train_test_split(df, test_size=0.3, random_state=seed)

# Further split the training portion into training and validation sets (80/20),
# giving 56% train / 14% validation / 30% test of the full dataset.
train, validation = train_test_split(train, test_size=0.2, random_state=seed)

print('Test: ', test.shape)
print('Validation: ', validation.shape)
print('Train: ', train.shape)

# Make sure the output directory exists; to_csv does not create it.
os.makedirs(output_file, exist_ok=True)

# os.path.join avoids the doubled separator that f'{output_file}/...' would
# produce, since output_file already ends with '/'.
splits = {'train': train, 'validation': validation, 'test': test}
for split_name, split_df in splits.items():
    split_df.to_csv(os.path.join(output_file, f'{split_name}.csv'), index=False)

Test:  (300, 19)
Validation:  (140, 19)
Train:  (560, 19)
