In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# GERMAN CREDIT DATA

In [80]:
data = pd.read_csv( 'german_credit_data.csv', index_col=0)
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [81]:
#Número de filas y columnas
data.shape

(1000, 10)

In [82]:
#Tipo de datos en la tabla
data.dtypes

Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object

In [83]:
#Cantidad de datos nulos en las columnas
data.isna().sum() 

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [84]:
#Porcentaje de información única en las columnas
data.nunique()/ len(data)

Age                 0.053
Sex                 0.002
Job                 0.004
Housing             0.003
Saving accounts     0.004
Checking account    0.003
Credit amount       0.921
Duration            0.033
Purpose             0.008
Risk                0.002
dtype: float64

In [85]:
data['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

## 1. Se completarán los datos vacíos con la moda
Se halla la moda de Saving accounts y Checking account

In [86]:
import statistics as stats

stats.mode(data['Saving accounts'])

'little'

In [87]:
data_ca = data['Checking account'].dropna()
stats.mode(data_ca)


'little'

In [88]:
data_completed = data.fillna('little')
data_completed.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,little,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,little,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


## 2. Transformación de datos cualitativos a cuantitativos

In [89]:
from sklearn import preprocessing
def encode_features(data_T):
    features = ['Sex', 'Housing','Saving accounts','Checking account','Purpose', 'Risk']

    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(data_T[feature])
        data_T[feature] = le.transform(data_T[feature])

    return data_T
    
data_processed = encode_features(data_completed)
data_processed.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,0,0,1169,6,5,1
1,22,0,2,1,0,1,5951,48,5,0
2,49,1,1,1,0,0,2096,12,3,1
3,45,1,2,0,0,0,7882,42,4,1
4,53,1,2,0,0,0,4870,24,1,0


In [90]:
data_processed.dtypes

Age                 int64
Sex                 int32
Job                 int64
Housing             int32
Saving accounts     int32
Checking account    int32
Credit amount       int64
Duration            int64
Purpose             int32
Risk                int32
dtype: object

## 3. Separación en entrenamiento y en validación

In [105]:
from sklearn.model_selection import train_test_split


x = data_processed.drop('Risk',axis=1)
y = data_processed['Risk']

num_test = 0.20
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=num_test)

print(f'Train shape     : {x_train.shape}')
print(f'Validation shape: {x_val.shape}')


Train shape     : (800, 9)
Validation shape: (200, 9)


## Modelos de Entrenamiento

### Primer modelo: Árbol de Decisión

In [92]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=10)
dtc.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [93]:
dtc.score(x_train, y_train)

0.89

In [94]:
dtc.score(x_val, y_val)


0.63

### Segundo modelo: Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=10, n_estimators=14, n_jobs=-1)
rfc.fit(x_train, y_train)
rfc.score(x_train, y_train)

0.90125

In [96]:
rfc.score(x_val, y_val)

0.74

In [97]:
from sklearn.model_selection import KFold
NSplits = 12
kf = KFold(n_splits=NSplits)
kf.get_n_splits(x)
print(kf)

KFold(n_splits=12, random_state=None, shuffle=False)


In [98]:
from sklearn import metrics

In [107]:
AccTotal = 0

for train_index, test_index in kf.split(x):
    X_train, X_test = x.values[train_index], x.values[test_index]
    Y_train, Y_test = y.values[train_index], y.values[test_index]
    
    rfcl = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0, n_jobs=-1)
    rfcl.fit(X_train, Y_train)

    Y_pred = rfcl.predict(X_test)
    AccFolder = metrics.accuracy_score(Y_test, Y_pred)*100
    print("Accuracy Folder", AccFolder)
  
    AccTotal += AccFolder
  
print("Accuracy General:", AccTotal/NSplits)


Accuracy Folder 75.0
Accuracy Folder 67.85714285714286
Accuracy Folder 65.47619047619048
Accuracy Folder 67.85714285714286
Accuracy Folder 74.69879518072288
Accuracy Folder 69.87951807228916
Accuracy Folder 63.85542168674698
Accuracy Folder 57.831325301204814
Accuracy Folder 74.69879518072288
Accuracy Folder 75.90361445783132
Accuracy Folder 74.69879518072288
Accuracy Folder 63.85542168674698
Accuracy General: 69.30101357812201


### Tercer Modelo: KNN

In [108]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors= 12)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_val)
print(metrics.accuracy_score(y_val,y_pred)*100)


65.5


### Cuarto Modelo: Regresión Lineal

In [116]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(x_train, y_train)

In [117]:
lr.score(x_train, y_train) 
lr.score(x_val, y_val) 

0.008557350282420995

## Características más importantes

### Árbol de decisión

In [109]:
columnsNames = data.columns.values
listOfColumnNames = list(columnsNames)

for feature in zip(listOfColumnNames, dtc.feature_importances_):
    print(feature)

('Age', 0.14913875152406916)
('Sex', 0.020906009755392384)
('Job', 0.06222634116513606)
('Housing', 0.06323809202535878)
('Saving accounts', 0.030551760754287085)
('Checking account', 0.0347706971662727)
('Credit amount', 0.3536722562757296)
('Duration', 0.20537565963629514)
('Purpose', 0.08012043169745924)


In [110]:
from sklearn.feature_selection import SelectFromModel

select_featuresDT = SelectFromModel(dtc, threshold=0.09)
select_featuresDT.fit(x_train, y_train)

for feature_list_index in select_featuresDT.get_support(indices=True):
    print(listOfColumnNames[feature_list_index])

Age
Credit amount
Duration


### Random Forest

In [111]:
for feature in zip(listOfColumnNames, rfc.feature_importances_):
    print(feature)

('Age', 0.2031345655789212)
('Sex', 0.0340978430444437)
('Job', 0.06523104897992857)
('Housing', 0.058187189239454554)
('Saving accounts', 0.046363743973961695)
('Checking account', 0.04019422578106032)
('Credit amount', 0.2619806194734054)
('Duration', 0.19609500465172405)
('Purpose', 0.0947157592771006)


In [112]:
select_featuresRT = SelectFromModel(rfc, threshold=0.09)
select_featuresRT.fit(x_train, y_train)

for feature_list_index in select_featuresRT.get_support(indices=True):
    print(listOfColumnNames[feature_list_index])

Age
Credit amount
Duration
Purpose


### KNN

In [113]:
for feature in zip(listOfColumnNames, knn.feature_importances_):
    print(feature)

AttributeError: 'KNeighborsClassifier' object has no attribute 'feature_importances_'