In [116]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# GERMAN CREDIT DATA

In [117]:
data = pd.read_csv( 'german_credit_data.csv', index_col=0)
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [118]:
#Número de filas y columnas
data.shape

(1000, 10)

In [119]:
#Tipo de datos en la tabla
data.dtypes

Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object

In [120]:
#Cantidad de datos nulos en las columnas
data.isna().sum() 

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [121]:
#Porcentaje de información única en las columnas
data.nunique()/ len(data)

Age                 0.053
Sex                 0.002
Job                 0.004
Housing             0.003
Saving accounts     0.004
Checking account    0.003
Credit amount       0.921
Duration            0.033
Purpose             0.008
Risk                0.002
dtype: float64

In [122]:
data['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

## 1. Se completarán los datos vacíos con la moda
Se halla la moda de Saving accounts y Checking account

In [123]:
import statistics as stats

stats.mode(data['Saving accounts'])

'little'

In [124]:
dataM = data['Checking account'].dropna()
stats.mode(dataM)


'little'

In [125]:
data_completed = data.fillna('little')
data_completed.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,little,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,little,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


## 2. Transformación de datos cualitativos a cuantitativos

In [132]:
from sklearn import preprocessing
def encode_features(data_T):
    features = ['Sex', 'Housing','Saving accounts','Checking account','Purpose', 'Risk']

    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(data_T[feature])
        data_T[feature] = le.transform(data_T[feature])

    return data_T
    
data_train = encode_features(data_completed)
data_train.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,0,0,1169,6,5,1
1,22,0,2,1,0,1,5951,48,5,0
2,49,1,1,1,0,0,2096,12,3,1
3,45,1,2,0,0,0,7882,42,4,1
4,53,1,2,0,0,0,4870,24,1,0


## 3. Separación en entrenamiento y en validación

In [133]:
from sklearn.model_selection import train_test_split


x = data_train.drop('Risk',axis=1)
y = data_train['Risk']

num_test = 0.20
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=num_test)

print(f'Train shape     : {x_train.shape}')
print(f'Validation shape: {x_val.shape}')


Train shape     : (800, 9)
Validation shape: (200, 9)


### Primer modelo: Árbol de Decisión

In [152]:
from sklearn.tree import DecisionTreeClassifier
m = DecisionTreeClassifier(max_depth=10)
m.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [153]:
m.score(x_train, y_train)

0.895

In [154]:
m.score(x_val, y_val)


0.685

In [155]:
from sklearn.ensemble import RandomForestClassifier

m2 = RandomForestClassifier(max_depth=10, n_estimators=14, n_jobs=-1)
m2.fit(x_train, y_train)
m2.score(x_train, y_train)

0.89625

In [156]:
m2.score(x_val, y_val)

0.715