In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw German credit dataset from its S3 location.
df = pd.read_csv(
    filepath_or_buffer="s3://german-credit-255423/datos/original/german_credit_data.csv"
)

In [13]:
# Class balance of the target: share of each "Risk" label among non-null rows.
# normalize=True lets pandas compute the proportions instead of dividing the
# counts by hand.
df["Risk"].value_counts(normalize=True)

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [16]:
# Hold out 30% of the rows as the test set. Stratifying on "Risk" keeps the
# label proportions the same in both splits; the fixed seed makes the split
# repeatable.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df["Risk"],
)

In [17]:
# Check that the stratified split preserved the class balance in the training
# set. normalize=True returns proportions directly instead of dividing the
# counts by hand.
train["Risk"].value_counts(normalize=True)

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [19]:
# Check that the stratified split preserved the class balance in the test set.
# normalize=True returns proportions directly instead of dividing the counts
# by hand.
test["Risk"].value_counts(normalize=True)

good    0.7
bad     0.3
Name: Risk, dtype: float64

## Dividir en train y test, y guardar los conjuntos

In [24]:
# Persist the training split as CSV; index=False leaves the row index out of
# the file.
train.to_csv(
    path_or_buf="s3://german-credit-255423/datos/train/train.csv",
    index=False,
)

In [23]:
# Persist the test split as CSV; index=False leaves the row index out of the
# file.
test.to_csv(
    path_or_buf="s3://german-credit-255423/datos/test/test.csv",
    index=False,
)

## Exploración

In [27]:
# Contingency table of housing status against credit risk on the training
# split; margins=True appends the "All" totals row and column.
pd.crosstab(train["Housing"], train["Risk"], margins=True)

Risk,bad,good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free,29,45,74
own,133,371,504
rent,48,74,122
All,210,490,700


In [36]:
# Contingency table of sex against credit risk on the training split, kept in
# a variable so further columns can be derived from it; margins=True appends
# the "All" totals row and column.
tabla_sex = pd.crosstab(train["Sex"], train["Risk"], margins=True)

In [38]:
# Display the Sex x Risk contingency table.
tabla_sex

Risk,bad,good,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,76,138,214
male,134,352,486
All,210,490,700


In [40]:
# Share of "good" outcomes within each row (good / All), i.e. the empirical
# proportion of good credits per sex. Columns are selected by label rather than
# by position so the result does not depend on the column order crosstab
# happens to produce.
tabla_sex["proba"] = tabla_sex["good"] / tabla_sex["All"]

In [41]:
# Display the table again, now including the "proba" column.
tabla_sex

Risk,bad,good,All,proba
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,76,138,214,0.64486
male,134,352,486,0.72428
All,210,490,700,0.7


## Preprocesamiento

## Imputación

In [43]:
from sklearn.impute import SimpleImputer

In [44]:
# Numeric features used in the imputation and scaling examples below.
X_train = train.loc[:, ["Age", "Credit amount", "Duration"]]

In [45]:
# Imputer that fills missing entries with the per-column mean.
imputer = SimpleImputer(strategy="mean")

In [46]:
# Learn the fill values (column means) from the training features only.
imputer.fit(X_train)

SimpleImputer()

In [47]:
# Fill value learned for each column, in the column order of X_train.
imputer.statistics_

array([  35.40285714, 3236.12142857,   21.04857143])

In [50]:
# Sanity check: the plain column mean of Age should equal the first entry of
# imputer.statistics_.
X_train["Age"].mean()

35.402857142857144

In [51]:
# Impute two hand-made rows with the means learned from X_train. The rows are
# wrapped in a DataFrame carrying X_train's column names because the imputer
# was fitted on a DataFrame; a bare nested list has no feature names, which
# recent scikit-learn versions report with a UserWarning.
imputer.transform(
    pd.DataFrame(
        [[24, 1000, np.nan],
         [np.nan, np.nan, 12]],
        columns=X_train.columns,
    )
)



array([[  24.        , 1000.        ,   21.04857143],
       [  35.40285714, 3236.12142857,   12.        ]])

In [52]:
# Fit and transform in one call; the result is a NumPy array, not a DataFrame.
imputer.fit_transform(X_train)

array([[3.100e+01, 4.473e+03, 3.600e+01],
       [4.600e+01, 1.829e+03, 1.500e+01],
       [2.700e+01, 7.418e+03, 6.000e+01],
       ...,
       [6.300e+01, 1.655e+03, 1.200e+01],
       [4.900e+01, 2.096e+03, 1.200e+01],
       [3.700e+01, 3.676e+03, 6.000e+00]])

In [53]:
from sklearn.impute import KNNImputer

In [54]:
# KNN-based imputer using the 2 nearest neighbours.
# NOTE(review): this rebinds `imputer`, discarding the SimpleImputer fitted on
# X_train earlier in the notebook; a distinct name would avoid stale-state
# surprises when cells are re-run out of order.
imputer = KNNImputer(n_neighbors=2)

In [55]:
# Toy dataset with two missing "nota" values, used to try out KNN imputation.
d = {
    "peso": [40, 42, 44, 45, 39, 80, 82],
    "edad": [19, 20, 21, 23, 25, 27, 30],
    "nota": [3.0, 3.1, None, 4.1, 5.0, None, 4.8],
}
dfs = pd.DataFrame(d)
dfs

Unnamed: 0,peso,edad,nota
0,40,19,3.0
1,42,20,3.1
2,44,21,
3,45,23,4.1
4,39,25,5.0
5,80,27,
6,82,30,4.8


In [56]:
# Fit the KNN imputer on the toy frame and fill its missing "nota" values.
result = imputer.fit_transform(dfs)

In [57]:
# Inspect the imputed array; columns follow the order peso, edad, nota.
result

array([[40.  , 19.  ,  3.  ],
       [42.  , 20.  ,  3.1 ],
       [44.  , 21.  ,  3.6 ],
       [45.  , 23.  ,  4.1 ],
       [39.  , 25.  ,  5.  ],
       [80.  , 27.  ,  4.45],
       [82.  , 30.  ,  4.8 ]])

## Escalado

In [58]:
from sklearn.preprocessing import StandardScaler, RobustScaler

In [59]:
# Scaler with default settings.
scaler = StandardScaler()

In [69]:
# Fit on Age only; the double brackets keep it as a one-column DataFrame
# rather than a Series.
scaler.fit(X_train[["Age"]])

StandardScaler()

In [70]:
# Mean of Age learned by the scaler.
scaler.mean_

array([35.40285714])

In [71]:
# Standard deviation of Age, obtained as the square root of the learned variance.
np.sqrt(scaler.var_)

array([11.23479253])

In [73]:
# Map a standardized value back to the original Age scale; this particular
# value comes out at approximately 31.
scaler.inverse_transform([[-0.39189483]])

array([[31.00000003]])

In [75]:
# Raw (unscaled) Age column of the training features.
X_train["Age"]

328    31
891    46
255    27
243    27
492    27
       ..
73     41
401    28
769    63
2      49
617    37
Name: Age, Length: 700, dtype: int64

In [77]:
# 50th percentile of Age.
X_train["Age"].quantile(0.5)

33.0

In [78]:
# Median of Age; same value as the 0.5 quantile.
X_train["Age"].median()

33.0