# Preprocessing
- Unwanted data
  * Columns
    * Unique values -> drop
## Dependent variable has missing entries
  * Drop the whole row
## Independent column has missing data (imputation)
  * More than 50% missing -> drop
  * Less than 50% missing
    - Numerical data (mean, median, algorithm)
      * SimpleImputer from the sklearn.impute package
    - Categorical data (mode, algorithm)
      * SimpleImputer from the sklearn.impute package
## Text data (encoding)
- Text -> Numbers
- Label Encoding
  * Dependent column -> LabelEncoder from the sklearn.preprocessing package
  * Independent column -> OrdinalEncoder from the sklearn.preprocessing package
- One-Hot Encoding
  * Dummy variable = [0, 0, 1]
  * OneHotEncoder from the sklearn.preprocessing package
## Normalisation
  * Min-max
  * Standard
## Skewness
  * Log
  * Box-Cox


In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [7]:
# Show every column when a DataFrame is displayed (None removes the limit).
# The full option name is used: abbreviated names are only resolved by
# pattern matching and can break if pandas adds a similarly named option.
pd.set_option('display.max_columns', None)


In [8]:
# Load the automobile regression dataset; CSV column 0 is used as the index.
AUTOMOBILE_CSV_URL = (
    'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'
)
df = pd.read_csv(AUTOMOBILE_CSV_URL, index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [9]:
# The raw data marks missing entries with '?'; turn them into NaN so that
# pandas recognises them as missing.
df.replace(to_replace='?', value=np.nan, inplace=True)
# Count the missing entries in every column.
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [10]:
# dropna removes rows with missing values; here only rows whose target value
# ('price') is missing are removed, in place.
# `subset` is passed as a list: older pandas releases reject a bare column
# label, and the list form matches the dropna call in the pipeline section.
df.dropna(subset=['price'], inplace=True)

In [11]:
# Re-count the missing entries per column after dropping rows without a price.
df.isna().sum()


symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [None]:
# Filter the data down to the columns that still contain missing values.
col_missing = [
    'normalized-losses',
    'num-of-doors',
    'stroke',
    'bore',
    'peak-rpm',
    'horsepower',
]
df.loc[:, col_missing]

In [12]:
# Per-column count of missing values, then the names of the columns whose
# count is greater than zero.
missing_cols = df.isna().sum()
missing_cols.loc[missing_cols.gt(0)].index.tolist()

['normalized-losses',
 'num-of-doors',
 'bore',
 'stroke',
 'horsepower',
 'peak-rpm']

In [None]:
# Numeric columns that still contain NaN.
num_cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']

# Two imputers: the column mean for numeric data, the most frequent value
# (mode) for categorical data.
num_si = SimpleImputer(strategy='mean')
cat_si = SimpleImputer(strategy='most_frequent')

# Fit on the numeric columns and write the imputed values back into df.
imputed_numeric = num_si.fit_transform(df[num_cols])
df[num_cols] = imputed_numeric

In [None]:
# Fill the missing 'num-of-doors' entries with the column's most frequent value.
door_col = ['num-of-doors']
df[door_col] = cat_si.fit_transform(df[door_col])

In [None]:
# Shape of the single-column 'num-of-doors' frame.
df.loc[:, ['num-of-doors']].shape

In [None]:
# Categorical feature columns: every object-dtype column except the target.
# 'price' is excluded by name rather than by slicing off the last entry, so
# the selection stays correct whatever the column order or the dtype of
# 'price' happens to be.
cat_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col != 'price'
]
df[cat_cols]

In [None]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder

In [None]:
# Split the categorical columns by number of distinct values:
#   at most two    -> ordinal encoding
#   more than two  -> one-hot encoding (name and count are printed)
ord_cols, hot_en_cols = [], []
for col in cat_cols:
    count = df[col].nunique()
    if count > 2:
        hot_en_cols.append(col)
        print(f'{col},{count}')
        continue
    ord_cols.append(col)
print(ord_cols)
print(hot_en_cols)

In [None]:
# Ordinal-encode the columns collected in ord_cols and write the codes back
# into df.
ordEnc = OrdinalEncoder()
encoded_ord = ordEnc.fit_transform(df[ord_cols])
df[ord_cols] = encoded_ord
df[ord_cols]

In [None]:
# Display the columns selected for one-hot encoding (more than two distinct values).
hot_en_cols

In [None]:
# One-hot encode 'body-style' into a dense array; drop='first' leaves out the
# column of the first category.
make_hot_enc = OneHotEncoder(sparse_output=False, drop='first')
body_style = df[['body-style']]
make_hot_enc.fit_transform(body_style)

# Pipeline Implementation


In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Reload the raw data for the pipeline section.
# na_values='?' makes read_csv parse the dataset's '?' placeholders as NaN
# while loading. Replacing '?' only after loading leaves every affected
# column, including the target 'price', stored as strings; parsing them as
# NaN up front lets pandas infer a numeric dtype for those columns.
df = pd.read_csv(
    'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv',
    index_col=0,
    na_values='?',
)
# Drop the rows whose target value ('price') is missing.
df.dropna(subset=['price'], inplace=True)

In [None]:
# Separate the features (X) from the target column 'price' (y).
X = df.drop(columns=['price'])
y = df['price']

In [None]:
# Group the feature columns by dtype: numeric vs. everything else.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(exclude=np.number).columns)

# Split the non-numeric columns by number of distinct values:
#   at most two    -> ordinal encoding
#   more than two  -> one-hot encoding (name and count are printed)
ord_cols, hot_encode_cols = [], []
for col in cat_cols:
    count = X[col].nunique()
    if count > 2:
        hot_encode_cols.append(col)
        print(f'{col},{count}')
        continue
    ord_cols.append(col)

print(num_cols)
print(ord_cols)
print(hot_encode_cols)

In [None]:
# Hand-picked column lists that override the automatically derived ones.
# NOTE(review): 'engine-type', 'normalized-losses', 'bore', 'stroke',
# 'horsepower' and 'peak-rpm' appear in neither list - confirm that leaving
# them out is intentional.
num_cols = [
    'symboling',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'compression-ratio',
    'city-mpg',
    'highway-mpg',
]
hot_encode_cols = [
    'make',
    'body-style',
    'drive-wheels',
    'num-of-cylinders',
    'fuel-system',
]

# Creating the preprocessing pipelines


In [None]:
# Numeric features: impute missing values with SimpleImputer's default
# strategy, then standardise with StandardScaler.
num_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)
num_pipeline

In [None]:
# Low-cardinality categorical features: fill gaps with the most frequent
# value, then ordinal-encode.
ord_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
]
ord_pipeline = Pipeline(steps=ord_steps)
ord_pipeline

In [None]:
# Remaining categorical features: fill gaps with the most frequent value,
# then one-hot encode into a dense array, leaving out the first category's
# column (drop='first').
# `sparse_output` is the current name of this parameter: the old `sparse`
# keyword was removed from OneHotEncoder, and the earlier one-hot example in
# this notebook already uses `sparse_output`.
OneHotEncoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='first', sparse_output=False)),
])
OneHotEncoder_pipeline


In [None]:
# Route each group of columns through its own pipeline. Each entry is
# (name, pipeline, columns), listed in the order ord, hot, num.
column_transformers = [
    ('ord', ord_pipeline, ord_cols),
    ('hot', OneHotEncoder_pipeline, hot_encode_cols),
    ('num', num_pipeline, num_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)
preprocessor

In [None]:
# Fit the ord/hot/num pipelines on the feature matrix X and return the
# transformed features.
preprocessor.fit_transform(X)