In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import category_encoders as ce
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OneHotEncoder as SOHE
from feature_engine.encoding import OneHotEncoder as FOHE

In [2]:
columns = [f'A{i}' for i in range(1,17)]
data = pd.read_csv('../data/credit_approvel/crx.data',names=columns)
data.head(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
data.isna().mean()

A1     0.0
A2     0.0
A3     0.0
A4     0.0
A5     0.0
A6     0.0
A7     0.0
A8     0.0
A9     0.0
A10    0.0
A11    0.0
A12    0.0
A13    0.0
A14    0.0
A15    0.0
A16    0.0
dtype: float64

In [4]:
masked = data == '?'
masked.sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [5]:
data.replace(to_replace='?',value=np.nan,inplace=True)
data.isna().mean()

A1     0.017391
A2     0.017391
A3     0.000000
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A8     0.000000
A9     0.000000
A10    0.000000
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.018841
A15    0.000000
A16    0.000000
dtype: float64

In [6]:
data.dtypes

A1      object
A2      object
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [7]:
numeric_features = data.select_dtypes(include=np.number)
categorical_feature = data.select_dtypes(include=np.object)

numeric_features.shape, categorical_feature.shape

((690, 4), (690, 12))

In [8]:
numeric_features.head(5)

Unnamed: 0,A3,A8,A11,A15
0,0.0,1.25,1,0
1,4.46,3.04,6,560
2,0.5,1.5,0,824
3,1.54,3.75,5,3
4,5.625,1.71,0,0


In [9]:
numeric_features.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A3,690.0,4.758725,4.978163,0.0,1.0,2.75,7.2075,28.0
A8,690.0,2.223406,3.346513,0.0,0.165,1.0,2.625,28.5
A11,690.0,2.4,4.86294,0.0,0.0,0.0,3.0,67.0
A15,690.0,1017.385507,5210.102598,0.0,0.0,5.0,395.5,100000.0


In [10]:
categorical_feature.head(5)

Unnamed: 0,A1,A2,A4,A5,A6,A7,A9,A10,A12,A13,A14,A16
0,b,30.83,u,g,w,v,t,t,f,g,202,+
1,a,58.67,u,g,q,h,t,t,f,g,43,+
2,a,24.5,u,g,q,h,t,f,f,g,280,+
3,b,27.83,u,g,w,v,t,t,t,g,100,+
4,b,20.17,u,g,w,v,t,f,f,s,120,+


In [11]:
categorical_feature.A2.unique, categorical_feature.A14.unique

(<bound method Series.unique of 0      30.83
 1      58.67
 2      24.50
 3      27.83
 4      20.17
        ...  
 685    21.08
 686    22.67
 687    25.25
 688    17.92
 689    35.00
 Name: A2, Length: 690, dtype: object>,
 <bound method Series.unique of 0      00202
 1      00043
 2      00280
 3      00100
 4      00120
        ...  
 685    00260
 686    00200
 687    00200
 688    00280
 689    00000
 Name: A14, Length: 690, dtype: object>)

In [12]:
numeric_features.loc[:,'A2'] = categorical_feature['A2'].astype(dtype='float64')
numeric_features.loc[:,'A14'] = categorical_feature['A14'].astype(dtype='float64')
categorical_feature.drop(labels=['A2','A14'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
numeric_features

Unnamed: 0,A3,A8,A11,A15,A2,A14
0,0.000,1.25,1,0,30.83,202.0
1,4.460,3.04,6,560,58.67,43.0
2,0.500,1.50,0,824,24.50,280.0
3,1.540,3.75,5,3,27.83,100.0
4,5.625,1.71,0,0,20.17,120.0
...,...,...,...,...,...,...
685,10.085,1.25,0,0,21.08,260.0
686,0.750,2.00,2,394,22.67,200.0
687,13.500,2.00,1,1,25.25,200.0
688,0.205,0.04,0,750,17.92,280.0


In [14]:
imputer = MeanMedianImputer(imputation_method='mean')
imputer.fit(numeric_features)
imputer.imputer_dict_

{'A3': 4.758724637681159,
 'A8': 2.223405797101449,
 'A11': 2.4,
 'A15': 1017.3855072463768,
 'A2': 31.56817109144543,
 'A14': 184.01477104874445}

In [15]:
numeric_features = imputer.transform(numeric_features)
numeric_features

Unnamed: 0,A3,A8,A11,A15,A2,A14
0,0.000,1.25,1,0,30.83,202.0
1,4.460,3.04,6,560,58.67,43.0
2,0.500,1.50,0,824,24.50,280.0
3,1.540,3.75,5,3,27.83,100.0
4,5.625,1.71,0,0,20.17,120.0
...,...,...,...,...,...,...
685,10.085,1.25,0,0,21.08,260.0
686,0.750,2.00,2,394,22.67,200.0
687,13.500,2.00,1,1,25.25,200.0
688,0.205,0.04,0,750,17.92,280.0


In [16]:
numeric_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A3      690 non-null    float64
 1   A8      690 non-null    float64
 2   A11     690 non-null    int64  
 3   A15     690 non-null    int64  
 4   A2      690 non-null    float64
 5   A14     690 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 32.5 KB


In [17]:
categorical_feature.isna().sum().sort_values(ascending=False)

A1     12
A6      9
A7      9
A4      6
A5      6
A9      0
A10     0
A12     0
A13     0
A16     0
dtype: int64

In [18]:
categorical_feature.fillna(value='missing',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [19]:
categorical_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A1      690 non-null    object
 1   A4      690 non-null    object
 2   A5      690 non-null    object
 3   A6      690 non-null    object
 4   A7      690 non-null    object
 5   A9      690 non-null    object
 6   A10     690 non-null    object
 7   A12     690 non-null    object
 8   A13     690 non-null    object
 9   A16     690 non-null    object
dtypes: object(10)
memory usage: 54.0+ KB


## one hot encoding

use on nominal features and doing well if feature has less than 15 unique values.

In [20]:
for col in categorical_feature:
    print(f'{col} - {len(categorical_feature[col].unique())}')

A1 - 3
A4 - 4
A5 - 4
A6 - 15
A7 - 10
A9 - 2
A10 - 2
A12 - 2
A13 - 3
A16 - 2


In [23]:
categorical_feature['A1'].unique()

array(['b', 'a', 'missing'], dtype=object)

In [21]:
pd.get_dummies(categorical_feature['A1'])

Unnamed: 0,a,b,missing
0,0,1,0
1,1,0,0
2,1,0,0
3,0,1,0
4,0,1,0
...,...,...,...
685,0,1,0
686,1,0,0
687,1,0,0
688,0,1,0


In [25]:
pd.get_dummies(categorical_feature[['A1','A4','A5','A9','A10','A12','A13','A16']])

Unnamed: 0,A1_a,A1_b,A1_missing,A4_l,A4_missing,A4_u,A4_y,A5_g,A5_gg,A5_missing,...,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s,A16_+,A16_-
0,0,1,0,0,0,1,0,1,0,0,...,1,0,1,1,0,1,0,0,1,0
1,1,0,0,0,0,1,0,1,0,0,...,1,0,1,1,0,1,0,0,1,0
2,1,0,0,0,0,1,0,1,0,0,...,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,0,1,0,1,0,0,...,1,0,1,0,1,1,0,0,1,0
4,0,1,0,0,0,1,0,1,0,0,...,1,1,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0,1,0,0,0,0,1,0,0,0,...,0,1,0,1,0,1,0,0,0,1
686,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,1,1,0,0,0,1
687,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,1,0,0,0,1
688,0,1,0,0,0,1,0,1,0,0,...,0,1,0,1,0,1,0,0,0,1


In [33]:
encoder = SOHE()
encoder.fit(categorical_feature[['A1','A4']])
encoder.categories_

[array(['a', 'b', 'missing'], dtype=object),
 array(['l', 'missing', 'u', 'y'], dtype=object)]

In [31]:
encoder.transform(categorical_feature[['A1','A4']])

<690x7 sparse matrix of type '<class 'numpy.float64'>'
	with 1380 stored elements in Compressed Sparse Row format>

In [35]:
encoder = FOHE()
encoder.fit(categorical_feature[['A1','A4']])
encoder.encoder_dict_

{'A1': array(['b', 'a', 'missing'], dtype=object),
 'A4': array(['u', 'y', 'missing', 'l'], dtype=object)}

In [36]:
encoder.transform(categorical_feature[['A1','A4']])

Unnamed: 0,A1_b,A1_a,A1_missing,A4_u,A4_y,A4_missing,A4_l
0,1,0,0,1,0,0,0
1,0,1,0,1,0,0,0
2,0,1,0,1,0,0,0
3,1,0,0,1,0,0,0
4,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...
685,1,0,0,0,1,0,0
686,0,1,0,1,0,0,0
687,0,1,0,0,1,0,0
688,1,0,0,1,0,0,0
