#  Preprocessing
- unwanted data 
     * columns
        *      unique values -> drop
 # dependent variable has missing entries
  * drop the whole row
# independent column missing data (imputation)
* more than 50% missing -> drop
# less than 50% missing
  - numerical data (mean, median, algo)
 * SimpleImputer from sklearn.impute package
 - categorical data (mode, algo)
 * SimpleImputer from sklearn.impute package
## text data (encoding)
 - Text -> Numbers
 - Label Encoding
 * Dependent Column -> LabelEncoder from sklearn.preprocessing package
 * Independent Column -> OrdinalEncoder from sklearn.preprocessing package
- One Hot Encoding
  * Dummy Variable = [0, 0, 1]
  *  OneHotEncoder from sklearn.preprocessing package
# normalisation
 * min-max
 * standard
- skewness
* log
 * box-cox


In [28]:
X.shape

(201, 25)

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
pd.set_option('display.max_column',None)


In [3]:
df=pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv',index_col=0)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
df.replace('?',np.nan,inplace=True)
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [5]:
#  dropna: This is the method used to drop rows with missing values.
df.dropna(subset='price',inplace=True)

In [6]:
df.isnull().sum()


symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [7]:
#filtering data
col_missing=['normalized-losses','num-of-doors','stroke','bore','peak-rpm','horsepower']
df[col_missing]

Unnamed: 0,normalized-losses,num-of-doors,stroke,bore,peak-rpm,horsepower
0,,two,2.68,3.47,5000,111
1,,two,2.68,3.47,5000,111
2,,two,3.47,2.68,5000,154
3,164,four,3.40,3.19,5500,102
4,164,four,3.40,3.19,5500,115
...,...,...,...,...,...,...
200,95,four,3.15,3.78,5400,114
201,95,four,3.15,3.78,5300,160
202,95,four,2.87,3.58,5500,134
203,95,four,3.40,3.01,4800,106


In [8]:
missing_cols=df.isnull().sum()
missing_cols[missing_cols > 0 ].index.tolist()

['normalized-losses',
 'num-of-doors',
 'bore',
 'stroke',
 'horsepower',
 'peak-rpm']

In [9]:
num_si = SimpleImputer()
cat_si = SimpleImputer(strategy='most_frequent')
num_cols = ['normalized-losses','bore','stroke','horsepower','peak-rpm']
df[num_cols] = num_si.fit_transform(df[num_cols])

In [10]:
df[['num-of-doors']] = cat_si.fit_transform(df[['num-of-doors']])

In [11]:
df[['num-of-doors']].shape

(201, 1)

In [12]:
cat_cols=df.select_dtypes(include=['object']).columns.tolist()[:-1]
df[cat_cols]                      

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,volvo,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,volvo,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,volvo,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [13]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder

In [14]:
ord_cols=[]
hot_en_cols=[]
for col in df[cat_cols]:
    count=df[col].nunique()
    if count<=2:
        ord_cols.append(col)
    else:
        hot_en_cols.append(col)
        print(f'{col},{count}')
print(ord_cols)
print(hot_en_cols)

make,22
body-style,5
drive-wheels,3
engine-type,6
num-of-cylinders,7
fuel-system,8
['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']
['make', 'body-style', 'drive-wheels', 'engine-type', 'num-of-cylinders', 'fuel-system']


In [15]:
ordEnc=OrdinalEncoder()
df[ord_cols]=ordEnc.fit_transform(df[ord_cols])
df[ord_cols]

Unnamed: 0,fuel-type,aspiration,num-of-doors,engine-location
0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
200,1.0,0.0,0.0,0.0
201,1.0,1.0,0.0,0.0
202,1.0,0.0,0.0,0.0
203,0.0,1.0,0.0,0.0


In [16]:
hot_en_cols

['make',
 'body-style',
 'drive-wheels',
 'engine-type',
 'num-of-cylinders',
 'fuel-system']

In [17]:
make_hot_enc=OneHotEncoder(drop='first',sparse_output=False)
make_hot_enc.fit_transform(df[['body-style']])

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],


# Piplline Implmentation


In [18]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [19]:
df=pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv',index_col=0)
df.replace('?',np.nan,inplace=True)
df.dropna(subset=['price'],inplace=True)

In [20]:
X,y=df.drop(columns='price'),df['price']

In [21]:
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()
ord_cols = []
hot_encode_cols = []
for col in X[cat_cols]:
    count = X[col].nunique()
    if count <= 2:
        ord_cols.append(col)
    else:
        hot_encode_cols.append(col)
        print(f'{col},{count}')
print(num_cols)
print(ord_cols)
print(hot_encode_cols)

normalized-losses,51
make,22
body-style,5
drive-wheels,3
engine-type,6
num-of-cylinders,7
fuel-system,8
bore,38
stroke,36
horsepower,58
peak-rpm,22
['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg']
['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']
['normalized-losses', 'make', 'body-style', 'drive-wheels', 'engine-type', 'num-of-cylinders', 'fuel-system', 'bore', 'stroke', 'horsepower', 'peak-rpm']


In [22]:
num_cols=['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg']
hot_encode_cols=['make', 'body-style','drive-wheels','num-of-cylinders', 'fuel-system']

# creating pipeline for preprocessing 


In [23]:
num_pipeline=Pipeline([
('imputer',SimpleImputer()),
('scaler',StandardScaler())
])
num_pipeline

In [24]:
ord_pipeline=Pipeline([
('imputer',SimpleImputer(strategy='most_frequent')),
('encode',OrdinalEncoder())
])
ord_pipeline

In [25]:
OneHotEncoder_pipeline=Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encode',OneHotEncoder(drop='first',sparse=False)),
])
OneHotEncoder_pipeline


In [26]:
preprocessor=ColumnTransformer([
    ('ord', ord_pipeline, ord_cols),
    ('hot', OneHotEncoder_pipeline, hot_encode_cols),
    ('num', num_pipeline, num_cols),
])
preprocessor

In [27]:
preprocessor.fit_transform(X)



array([[ 1.        ,  0.        ,  1.        , ..., -0.29143464,
        -0.65224901, -0.54228772],
       [ 1.        ,  0.        ,  1.        , ..., -0.29143464,
        -0.65224901, -0.54228772],
       [ 1.        ,  0.        ,  1.        , ..., -0.29143464,
        -0.96439676, -0.689386  ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.34149734,
        -1.12047063, -1.13068086],
       [ 0.        ,  1.        ,  0.        , ...,  3.2129542 ,
         0.12812034, -0.54228772],
       [ 1.        ,  1.        ,  0.        , ..., -0.1662779 ,
        -0.96439676, -0.83648429]])