# Chaining Transformers

In [35]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder


In [36]:
# Steps of the numeric pipeline, applied in the order listed.
# NOTE(review): PCA runs before StandardScaler here; scaling is more commonly
# applied before PCA — confirm this order is intended.
estimators = [
    ('simpleImputer', SimpleImputer()),
    ('pca', PCA()),
    ('standardScaler', StandardScaler()),
]

# Steps of the categorical pipeline: a single one-hot encoding stage.
cat_est = [('one_hot_encoder', OneHotEncoder())]

# Both pipelines are given the same `memory` directory name.
pipe = Pipeline(steps=estimators, memory='test_cache_dir')
cat_pipe = Pipeline(steps=cat_est, memory='test_cache_dir')

In [37]:
# Tiny 2x2 numeric input and a matching target list for a smoke run.
arr = np.array([
    [123, 234],
    [234, 324],
])
y = [1, 1]

# Fit every step of the numeric pipeline on `arr` and keep the transformed result.
a = pipe.fit_transform(arr, y=y)

In [38]:
# Display the array returned by pipe.fit_transform.
a

array([[ 1.,  0.],
       [-1.,  0.]])

In [39]:
# Display the raw input array for comparison with the transformed output.
arr

array([[123, 234],
       [234, 324]])

In [40]:
# Combine the numeric and categorical pipelines into a single FeatureUnion.
# NOTE(review): a FeatureUnion passes the same input to every transformer in
# the list; if each pipeline should only see its own columns, confirm whether
# a column-wise combiner was intended instead.
full_pipe_line = FeatureUnion(
    transformer_list=[
        ('num_pipe', pipe),
        ('cat_pipe', cat_pipe),
    ]
)

# Data Preprocessing techniques

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# sns.set_theme(style='whitegrid')  # 'whitegrid' is a style name, so it must be passed as style=

In [45]:
# Toy records: one dict per child with an 'age' and a 'height' entry.
data = [
    {'age': age, 'height': height}
    for age, height in [
        (4, 96.0),
        (1, 73.9),
        (2, 88.9),
        (3, 81.6),
    ]
]

In [46]:
from sklearn.feature_extraction import DictVectorizer

# Turn the list of feature dicts into a dense 2-D array (sparse=False).
dv = DictVectorizer(sparse=False)
dv.fit(data)
data_transformed = dv.transform(data)
data_transformed

array([[ 4. , 96. ],
       [ 1. , 73.9],
       [ 2. , 88.9],
       [ 3. , 81.6]])

In [47]:
# Shape of the vectorized data: one row per input dict, one column per feature key.
data_transformed.shape

(4, 2)

# 2. Data Imputation

In [48]:
from sklearn.impute import SimpleImputer


In [79]:
# Column names of the heart-disease table.
# NOTE(review): `cols` is not used in this cell — presumably meant for
# `names=cols` when reading the remote copy of the file; confirm.
cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]
# heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',)

# Load the local copy of the dataset and display it.
heart_data = pd.read_csv('processed.cleveland.data')
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [80]:
# Summarize each column's dtype and non-null count.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [81]:
# Count NaN entries per column. String placeholders such as '?' are not NaN,
# so they are not counted here.
heart_data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [82]:
# Print the dtype summary, then show the column labels as a plain list.
heart_data.info()
heart_data.columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'num']

In [83]:
# Inspect the distinct raw values of the two object-dtype columns.
print('unique values in ca:', heart_data['ca'].unique())
print('unique value in thal:', heart_data['thal'].unique())

unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?']
unique value in thal: ['6.0' '3.0' '7.0' '?']


In [84]:
# The file marks missing values with the string '?' (seen in `ca` and `thal`),
# which is why those two columns were read as object dtype.
# Re-bind the name instead of using inplace=True, and cast the two columns to
# float so they hold real numbers/NaN rather than numeric strings.
# Re-running this cell is a no-op.
heart_data = (
    heart_data
    .replace('?', np.nan)
    .astype({'ca': 'float64', 'thal': 'float64'})
)

In [91]:
# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the whole frame, so mean imputation
# also applies to `ca`, `thal` and `num` — confirm that is intended.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
heart_data_imputed = imputer.fit(heart_data).transform(heart_data)
heart_data_imputed.shape

(303, 14)

In [92]:
# Same mean imputation, but with add_indicator=True so the output gains extra
# missing-indicator columns (the result has 16 columns instead of 14).
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
    add_indicator=True,
)
heart_data_imputed_with_indicator = imputer.fit(heart_data).transform(heart_data)
heart_data_imputed_with_indicator.shape

(303, 16)

In [1]:
# list.extend() returns None, so build the column list by concatenation instead.
# a = pd.DataFrame(heart_data_imputed_with_indicator, columns=list(heart_data) + ['impute1', 'impute2'])