In [76]:
import matplotlib.pyplot as plt
import pandas as pd

from io import StringIO
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [77]:
# Load the raw dataset. low_memory=False makes pandas parse the file in one
# pass rather than in chunks, so each column gets a single inferred dtype.
df = pd.read_csv(
    'credit_score_classification.csv',
    low_memory=False,
)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [79]:
df.loc[:, 'Outstanding_Debt':'Monthly_Balance'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Outstanding_Debt          100000 non-null  object 
 1   Credit_Utilization_Ratio  100000 non-null  float64
 2   Credit_History_Age        90970 non-null   object 
 3   Payment_of_Min_Amount     100000 non-null  object 
 4   Total_EMI_per_month       100000 non-null  float64
 5   Amount_invested_monthly   95521 non-null   object 
 6   Payment_Behaviour         100000 non-null  object 
 7   Monthly_Balance           98800 non-null   object 
dtypes: float64(2), object(6)
memory usage: 6.1+ MB


In [80]:
df.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


# Ajuste de data types

In [81]:

# Text columns: move from the generic object dtype to pandas' string dtype.
for coluna in ('ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Occupation',
               'Type_of_Loan', 'Payment_Behaviour', 'Payment_of_Min_Amount'):
    df[coluna] = df[coluna].astype('string')

# Numeric columns that were read as text: remove every '_' character and cast
# to the target numeric type.
tipos_numericos = {
    'Age': 'int',
    'Annual_Income': 'float',
    'Num_of_Loan': 'int',
    'Outstanding_Debt': 'float',
    'Amount_invested_monthly': 'float',
    'Monthly_Balance': 'float',
}
for coluna, tipo in tipos_numericos.items():
    df[coluna] = df[coluna].str.replace('_', '').astype(tipo)

# Delayed payments: go through float first so missing values can be filled
# with 0 before the final integer cast.
atrasos_sem_sublinhado = df['Num_of_Delayed_Payment'].str.replace('_', '')
df['Num_of_Delayed_Payment'] = atrasos_sem_sublinhado.astype('float').fillna(0).astype('int')

# Credit-limit change: here '_' is replaced by '0' (not removed), and any
# remaining missing value becomes 0.
limite_com_zero = df['Changed_Credit_Limit'].str.replace('_', '0')
df['Changed_Credit_Limit'] = limite_com_zero.astype('float').fillna(0)

# Categorical text columns: '_' in Credit_Mix and missing Credit_History_Age
# values are both labelled with the 'Não informado' placeholder.
df['Credit_Mix'] = df['Credit_Mix'].str.replace('_', 'Não informado').astype('string')
df['Credit_History_Age'] = df['Credit_History_Age'].fillna('Não informado').astype('string')

# Target variable.
df['Credit_Score'] = df['Credit_Score'].astype('category')

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   ID                        100000 non-null  string  
 1   Customer_ID               100000 non-null  string  
 2   Month                     100000 non-null  string  
 3   Name                      90015 non-null   string  
 4   Age                       100000 non-null  int64   
 5   SSN                       100000 non-null  string  
 6   Occupation                100000 non-null  string  
 7   Annual_Income             100000 non-null  float64 
 8   Monthly_Inhand_Salary     84998 non-null   float64 
 9   Num_Bank_Accounts         100000 non-null  int64   
 10  Num_Credit_Card           100000 non-null  int64   
 11  Interest_Rate             100000 non-null  int64   
 12  Num_of_Loan               100000 non-null  int64   
 13  Type_of_Loan              8859

In [83]:
df.loc[:, 'Outstanding_Debt':'Monthly_Balance'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Outstanding_Debt          100000 non-null  float64
 1   Credit_Utilization_Ratio  100000 non-null  float64
 2   Credit_History_Age        100000 non-null  string 
 3   Payment_of_Min_Amount     100000 non-null  string 
 4   Total_EMI_per_month       100000 non-null  float64
 5   Amount_invested_monthly   95521 non-null   float64
 6   Payment_Behaviour         100000 non-null  string 
 7   Monthly_Balance           98800 non-null   float64
dtypes: float64(5), string(3)
memory usage: 6.1 MB


# Remover colunas que não entram em modelos

In [84]:
# Identifier and free-text columns that are not used as model features.
colunas_fora_do_modelo = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Type_of_Loan']
df = df.drop(columns=colunas_fora_do_modelo)

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Age                       100000 non-null  int64   
 1   Occupation                100000 non-null  string  
 2   Annual_Income             100000 non-null  float64 
 3   Monthly_Inhand_Salary     84998 non-null   float64 
 4   Num_Bank_Accounts         100000 non-null  int64   
 5   Num_Credit_Card           100000 non-null  int64   
 6   Interest_Rate             100000 non-null  int64   
 7   Num_of_Loan               100000 non-null  int64   
 8   Delay_from_due_date       100000 non-null  int64   
 9   Num_of_Delayed_Payment    100000 non-null  int64   
 10  Changed_Credit_Limit      100000 non-null  float64 
 11  Num_Credit_Inquiries      98035 non-null   float64 
 12  Credit_Mix                100000 non-null  string  
 13  Outstanding_Debt          1000

# Limpeza de colunas categóricas

In [86]:
df['Occupation']

0        Scientist
1        Scientist
2        Scientist
3        Scientist
4        Scientist
           ...    
99995     Mechanic
99996     Mechanic
99997     Mechanic
99998     Mechanic
99999     Mechanic
Name: Occupation, Length: 100000, dtype: string

In [87]:
# Remove rows whose Occupation contains an underscore. Rows with a missing
# Occupation do not match (na=False) and are therefore kept.
mascara_occupation = df["Occupation"].str.contains("_", na=False)
indices_para_remover = df.loc[mascara_occupation].index
df.drop(index=indices_para_remover, inplace=True)

In [88]:
df['Occupation'].unique()

<StringArray>
[   'Scientist',      'Teacher',     'Engineer', 'Entrepreneur',
    'Developer',       'Lawyer',       'Doctor',   'Journalist',
      'Manager',   'Accountant',     'Musician',     'Mechanic',
       'Writer',    'Architect']
Length: 14, dtype: string

In [89]:
# Remove rows whose Credit_Mix contains the 'Não informado' placeholder.
# Missing values do not match (na=False) and are therefore kept.
mascara_credit_mix = df["Credit_Mix"].str.contains("Não informado", na=False)
indices_para_remover = df.loc[mascara_credit_mix].index
df.drop(index=indices_para_remover, inplace=True)

In [90]:
df['Credit_Mix'].unique()

<StringArray>
['Good', 'Standard', 'Bad']
Length: 3, dtype: string

In [91]:
# Remove rows whose Payment_of_Min_Amount contains "NM".
# Missing values do not match (na=False) and are therefore kept.
mascara_min_amount = df["Payment_of_Min_Amount"].str.contains("NM", na=False)
indices_para_remover = df.loc[mascara_min_amount].index
df.drop(index=indices_para_remover, inplace=True)

In [92]:
df['Payment_of_Min_Amount'].unique()

<StringArray>
['No', 'Yes']
Length: 2, dtype: string

In [93]:
# Remove rows whose Payment_Behaviour contains the "!@9#%8" token.
# Missing values do not match (na=False) and are therefore kept.
mascara_behaviour = df["Payment_Behaviour"].str.contains("!@9#%8", na=False)
indices_para_remover = df.loc[mascara_behaviour].index
df.drop(index=indices_para_remover, inplace=True)

In [94]:
df['Payment_Behaviour'].unique()

<StringArray>
[  'Low_spent_Large_value_payments',  'Low_spent_Medium_value_payments',
   'Low_spent_Small_value_payments', 'High_spent_Medium_value_payments',
  'High_spent_Large_value_payments',  'High_spent_Small_value_payments']
Length: 6, dtype: string

In [95]:
# Remove rows whose Credit_History_Age contains the 'Não informado'
# placeholder. Missing values do not match (na=False).
indices_para_remover = df[df["Credit_History_Age"].str.contains("Não informado", na=False)].index
df.drop(index=indices_para_remover, inplace=True)
# No extra dropna step is applied to the column: assigning `Series.dropna()`
# back to a DataFrame column realigns on the frame's index, which restores any
# missing entries, so such an assignment never removes anything.

In [96]:
df['Credit_History_Age']

2        22 Years and 3 Months
3        22 Years and 4 Months
4        22 Years and 5 Months
6        22 Years and 7 Months
9        26 Years and 8 Months
                 ...          
99986    5 Years and 10 Months
99991     6 Years and 3 Months
99993    31 Years and 4 Months
99997    31 Years and 8 Months
99998    31 Years and 9 Months
Name: Credit_History_Age, Length: 51171, dtype: string

# Transformar string em category

In [97]:
# Cleaned text columns become pandas categoricals.
for coluna in ('Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour'):
    df[coluna] = df[coluna].astype('category')

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51171 entries, 2 to 99998
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       51171 non-null  int64   
 1   Occupation                51171 non-null  category
 2   Annual_Income             51171 non-null  float64 
 3   Monthly_Inhand_Salary     43537 non-null  float64 
 4   Num_Bank_Accounts         51171 non-null  int64   
 5   Num_Credit_Card           51171 non-null  int64   
 6   Interest_Rate             51171 non-null  int64   
 7   Num_of_Loan               51171 non-null  int64   
 8   Delay_from_due_date       51171 non-null  int64   
 9   Num_of_Delayed_Payment    51171 non-null  int64   
 10  Changed_Credit_Limit      51171 non-null  float64 
 11  Num_Credit_Inquiries      50157 non-null  float64 
 12  Credit_Mix                51171 non-null  category
 13  Outstanding_Debt          51171 non-null  float64 


In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51171 entries, 2 to 99998
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       51171 non-null  int64   
 1   Occupation                51171 non-null  category
 2   Annual_Income             51171 non-null  float64 
 3   Monthly_Inhand_Salary     43537 non-null  float64 
 4   Num_Bank_Accounts         51171 non-null  int64   
 5   Num_Credit_Card           51171 non-null  int64   
 6   Interest_Rate             51171 non-null  int64   
 7   Num_of_Loan               51171 non-null  int64   
 8   Delay_from_due_date       51171 non-null  int64   
 9   Num_of_Delayed_Payment    51171 non-null  int64   
 10  Changed_Credit_Limit      51171 non-null  float64 
 11  Num_Credit_Inquiries      50157 non-null  float64 
 12  Credit_Mix                51171 non-null  category
 13  Outstanding_Debt          51171 non-null  float64 


# Tratamento de `Credit_History_Age`

In [100]:
import re

def converter_para_meses(s):
  """Convert a duration such as '22 Years and 3 Months' into total months.

  Args:
    s: Text holding a year count followed by a month count. The word "and"
      between the two parts is optional, so '22 Years 3 Months' also parses.

  Returns:
    int: years * 12 + months, or None when ``s`` is not a string or does not
    contain both a years part and a months part.
  """
  # Missing values (None, NaN, pd.NA) cannot be searched; report them as
  # unparseable instead of raising inside re.search.
  if not isinstance(s, str):
    return None
  # "and" has to be allowed explicitly between the two numbers; a pattern that
  # only permits whitespace there matches nothing in '<n> Years and <m> Months'.
  match = re.search(r'(\d+)\s*Years?\s*(?:and\s*)?(\d+)\s*Months?', s)
  if match is None:
    return None  # caller decides how to treat unparseable text (e.g. fill with 0)
  anos, meses = (int(grupo) for grupo in match.groups())
  return anos * 12 + meses

# Apply to the DataFrame: replace each Credit_History_Age text value with the
# result of converter_para_meses (a month count, or None when the text does not
# match the expected pattern).
df['Credit_History_Age'] = df['Credit_History_Age'].apply(converter_para_meses)