In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

In [4]:
df = pd.read_csv("s3://german-credit-255423/datos/original/german_credit_data.csv")

In [5]:
df["Risk"].value_counts()/df["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [6]:
train, test = train_test_split(df, test_size=0.3, 
                               random_state=42, stratify=df["Risk"])

In [7]:
train["Risk"].value_counts()/train["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

In [8]:
test["Risk"].value_counts()/test["Risk"].count()

good    0.7
bad     0.3
Name: Risk, dtype: float64

## Dividir en train, test

In [9]:
train.to_csv("s3://german-credit-255423/datos/train/train.csv", index=False)

In [10]:
test.to_csv("s3://german-credit-255423/datos/test/test.csv", index=False)

## Exploración

In [11]:
pd.crosstab(index=train["Housing"], columns=train["Risk"], margins=True)

Risk,bad,good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free,29,45,74
own,133,371,504
rent,48,74,122
All,210,490,700


In [12]:
tabla_sex = pd.crosstab(index=train["Sex"], columns=train["Risk"], margins=True)

In [13]:
tabla_sex

Risk,bad,good,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,76,138,214
male,134,352,486
All,210,490,700


In [14]:
tabla_sex["proba"] = tabla_sex.iloc[:, 1]/tabla_sex.iloc[:, 2]

In [15]:
tabla_sex

Risk,bad,good,All,proba
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,76,138,214,0.64486
male,134,352,486,0.72428
All,210,490,700,0.7


## Preprocesamiento

## Imputación

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
X_train = train[["Age", "Credit amount", "Duration"]]

In [18]:
imputer = SimpleImputer(strategy='mean')

In [19]:
imputer.fit(X_train)

SimpleImputer()

In [20]:
imputer.statistics_

array([  35.40285714, 3236.12142857,   21.04857143])

In [21]:
X_train["Age"].mean()

35.402857142857144

In [22]:
imputer.transform([[24, 1000, np.nan], 
                  [np.nan, np.nan, 12]])



array([[  24.        , 1000.        ,   21.04857143],
       [  35.40285714, 3236.12142857,   12.        ]])

In [23]:
imputer.fit_transform(X_train)

array([[3.100e+01, 4.473e+03, 3.600e+01],
       [4.600e+01, 1.829e+03, 1.500e+01],
       [2.700e+01, 7.418e+03, 6.000e+01],
       ...,
       [6.300e+01, 1.655e+03, 1.200e+01],
       [4.900e+01, 2.096e+03, 1.200e+01],
       [3.700e+01, 3.676e+03, 6.000e+00]])

In [24]:
from sklearn.impute import KNNImputer

In [25]:
imputer = KNNImputer(n_neighbors=2)

In [26]:
d = {'peso': [40, 42,44,45,39,80,82], 'edad': [19,20,21,23,25,27,30], 'nota':[3.0,3.1,None,4.1,5.0,None,4.8]}
dfs = pd.DataFrame(data=d)
dfs

Unnamed: 0,peso,edad,nota
0,40,19,3.0
1,42,20,3.1
2,44,21,
3,45,23,4.1
4,39,25,5.0
5,80,27,
6,82,30,4.8


In [27]:
result = imputer.fit_transform(dfs)

In [28]:
result

array([[40.  , 19.  ,  3.  ],
       [42.  , 20.  ,  3.1 ],
       [44.  , 21.  ,  3.6 ],
       [45.  , 23.  ,  4.1 ],
       [39.  , 25.  ,  5.  ],
       [80.  , 27.  ,  4.45],
       [82.  , 30.  ,  4.8 ]])

## Escalado

In [50]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [30]:
scaler = StandardScaler()

In [31]:
scaler.fit(X_train[["Age"]])

StandardScaler()

In [32]:
scaler.mean_

array([35.40285714])

In [33]:
scaler.var_ ** 0.5

array([11.23479253])

In [34]:
scaler.inverse_transform([[-0.39189483]])

array([[31.00000003]])

In [35]:
X_train["Age"]

328    31
891    46
255    27
243    27
492    27
       ..
73     41
401    28
769    63
2      49
617    37
Name: Age, Length: 700, dtype: int64

In [36]:
X_train["Age"].quantile(0.5)

33.0

In [37]:
X_train["Age"].median()

33.0

In [38]:
from sklearn.preprocessing import OneHotEncoder

In [44]:
encoder = OneHotEncoder(sparse=False, drop='first')
encoder.fit_transform(train[["Sex"]])

array([[1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],

In [45]:
encoder.categories_

[array(['female', 'male'], dtype=object)]

In [47]:
encoder.inverse_transform([[0]])

array([['female']], dtype=object)

In [48]:
from sklearn.pipeline import Pipeline

In [51]:
numeric_pipeline = Pipeline(
    [
        ('Imputación con la media',   SimpleImputer(strategy='mean')),
        ('Escalado minmax', MinMaxScaler())
        
    ]
)

In [52]:
numeric_pipeline.fit(train[["Age", "Credit amount", "Duration"]])

Pipeline(steps=[('Imputación con la media', SimpleImputer()),
                ('Escalado minmax', MinMaxScaler())])

In [55]:
numeric_pipeline[0].statistics_

array([  35.40285714, 3236.12142857,   21.04857143])

In [57]:
numeric_pipeline[1].min_

array([-0.36363636, -0.01375592, -0.05882353])

In [59]:
numeric_pipeline[1].data_max_

array([   75., 18424.,    72.])

In [61]:
train[["Age", "Credit amount", "Duration"]].max()

Age                 75
Credit amount    18424
Duration            72
dtype: int64

In [64]:
numeric_pipeline.transform(train[["Age", "Credit amount", "Duration"]])

array([[0.2       , 0.23236492, 0.47058824],
       [0.47272727, 0.08688236, 0.16176471],
       [0.12727273, 0.3944096 , 0.82352941],
       ...,
       [0.78181818, 0.07730824, 0.11764706],
       [0.52727273, 0.10157368, 0.11764706],
       [0.30909091, 0.18851106, 0.02941176]])

In [65]:
numeric_pipeline.transform([[np.nan, np.nan, np.nan]])



array([[0.28005195, 0.16430733, 0.25071429]])

In [66]:
# Sex, Purpose, Housing

In [72]:
categorical_pipeline = Pipeline(
        [
            ('imputación moda', SimpleImputer(strategy='most_frequent')),
            ('onehot encoder', OneHotEncoder(sparse=False))
        ]

)

In [73]:
categorical_pipeline.fit(train[["Sex", "Purpose", "Housing"]])

Pipeline(steps=[('imputación moda', SimpleImputer(strategy='most_frequent')),
                ('onehot encoder', OneHotEncoder(sparse=False))])

In [74]:
categorical_pipeline.transform(train[["Sex", "Purpose", "Housing"]])

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [75]:
from sklearn.compose import ColumnTransformer

In [76]:
column_transformer = ColumnTransformer(
            [
                ('numeric pipeline', numeric_pipeline, [0, 1, 2]),
                ('categorical pipeline', categorical_pipeline, [3, 4, 5])
            ]
)

In [77]:
column_transformer.fit(train[["Age", 
                             "Credit amount",
                              "Duration",
                              "Sex",
                              "Purpose",
                              "Housing"]])

ColumnTransformer(transformers=[('numeric pipeline',
                                 Pipeline(steps=[('Imputación con la media',
                                                  SimpleImputer()),
                                                 ('Escalado minmax',
                                                  MinMaxScaler())]),
                                 [0, 1, 2]),
                                ('categorical pipeline',
                                 Pipeline(steps=[('imputación moda',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot encoder',
                                                  OneHotEncoder(sparse=False))]),
                                 [3, 4, 5])])

In [78]:
column_transformer.transform(train[["Age", 
                             "Credit amount",
                              "Duration",
                              "Sex",
                              "Purpose",
                              "Housing"]])

array([[0.2       , 0.23236492, 0.47058824, ..., 0.        , 1.        ,
        0.        ],
       [0.47272727, 0.08688236, 0.16176471, ..., 0.        , 1.        ,
        0.        ],
       [0.12727273, 0.3944096 , 0.82352941, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.78181818, 0.07730824, 0.11764706, ..., 0.        , 1.        ,
        0.        ],
       [0.52727273, 0.10157368, 0.11764706, ..., 0.        , 1.        ,
        0.        ],
       [0.30909091, 0.18851106, 0.02941176, ..., 0.        , 0.        ,
        1.        ]])

In [80]:
joblib.dump(column_transformer, 'preprocessing.joblib')

['preprocessing.joblib']