# preprocessing
- ### imputation
    - missing data handling
        - numerical data imputation
            - mean value is filled in missing places
        - categorical data imputation
            - frequency (mode) value is filled in missing places
- ### encoding
    - convert categorical (non-numerical) data into numerical data
        - Label Encoding
            - if a column has 2 unique categorical values
        - One Hot Encoding
            - if a column has more than 2 categorical values
        - Vectorization
            - if you want to convert words to vector(numerical) format

- ### scaling/normalization
    - bring all the columns onto the same scale so that ML algorithms can work properly

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# List the contents of the datasets directory to see which files are available.
os.listdir('datasets')

['.ipynb_checkpoints', 'sample_data.csv']

In [4]:
# Confirm that the CSV path exists before trying to load it.
os.path.exists('datasets/sample_data.csv')

True

In [5]:
# Load the sample data set from the datasets folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='datasets/sample_data.csv')

In [6]:
# Show the raw data; the salary and age columns each contain one missing value.
df

Unnamed: 0,country,salary,age,happy
0,germany,49000.0,35.0,yes
1,spain,10000.0,30.0,no
2,italy,230000.0,39.0,no
3,spain,200000.0,30.0,yes
4,italy,300000.0,30.0,yes
5,spain,31000.0,23.0,no
6,germany,,34.0,yes
7,spain,400000.0,,yes
8,italy,200000.0,29.0,no
9,italy,340000.0,35.0,yes


IMPUTATION

In [7]:
# Look up the SimpleImputer signature and docstring (IPython help syntax).
SimpleImputer?

[1;31mInit signature:[0m
[0mSimpleImputer[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mmissing_values[0m[1;33m=[0m[0mnan[0m[1;33m,[0m[1;33m
[0m    [0mstrategy[0m[1;33m=[0m[1;34m'mean'[0m[1;33m,[0m[1;33m
[0m    [0mfill_value[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mcopy[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0madd_indicator[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Imputation transformer for completing missing values.

Read more in the :ref:`User Guide <impute>`.

.. versionadded:: 0.20
   `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`
   estimator which is now removed.

Parameters
----------
missing_values : int, float, str, np.nan or None, default=np.nan
    The placeholder for the missing values. All occurrences of
    `miss

In [8]:
# Build the imputer with its default strategy spelled out: missing entries
# are replaced by the mean of their column.
imputer = SimpleImputer(strategy='mean')

In [9]:
# Numeric columns that contain missing values.
cols = ['salary', 'age']
# Learn each column's fill value, then fill the gaps with it. The cast to
# int truncates the fractional part of the filled-in values.
imputer.fit(df[cols])
df[cols] = imputer.transform(df[cols]).astype(int)

In [10]:
# Check the result: the gaps in salary and age are now filled.
df

Unnamed: 0,country,salary,age,happy
0,germany,49000,35,yes
1,spain,10000,30,no
2,italy,230000,39,no
3,spain,200000,30,yes
4,italy,300000,30,yes
5,spain,31000,23,no
6,germany,179000,34,yes
7,spain,400000,31,yes
8,italy,200000,29,no
9,italy,340000,35,yes


LABEL ENCODING

In [11]:
# Display the imputed frame again as the starting point for label encoding.
df

Unnamed: 0,country,salary,age,happy
0,germany,49000,35,yes
1,spain,10000,30,no
2,italy,230000,39,no
3,spain,200000,30,yes
4,italy,300000,30,yes
5,spain,31000,23,no
6,germany,179000,34,yes
7,spain,400000,31,yes
8,italy,200000,29,no
9,italy,340000,35,yes


In [12]:
# Count unique values per column; 'happy' has only 2, so it is the column
# picked for label encoding.
df.nunique()

country     3
salary     10
age         8
happy       2
dtype: int64

In [13]:
# Fit a label encoder on the 'happy' column, then overwrite the column
# with the integer codes the encoder assigns.
hpyEncoder = LabelEncoder()
hpyEncoder.fit(df['happy'])
df['happy'] = hpyEncoder.transform(df['happy'])

In [14]:
# 'happy' now holds integer codes (yes -> 1, no -> 0 in the output below).
df

Unnamed: 0,country,salary,age,happy
0,germany,49000,35,1
1,spain,10000,30,0
2,italy,230000,39,0
3,spain,200000,30,1
4,italy,300000,30,1
5,spain,31000,23,0
6,germany,179000,34,1
7,spain,400000,31,1
8,italy,200000,29,0
9,italy,340000,35,1


ONE HOT ENCODING

In [15]:
# Look up the OneHotEncoder signature and docstring (IPython help syntax).
OneHotEncoder?

[1;31mInit signature:[0m
[0mOneHotEncoder[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcategories[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mdrop[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msparse[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mdtype[0m[1;33m=[0m[1;33m<[0m[1;32mclass[0m [1;34m'numpy.float64'[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mhandle_unknown[0m[1;33m=[0m[1;34m'error'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Encode categorical features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
encoding scheme. This creates a binary column for each category and
returns a sparse matrix or dense array (depending on the ``sparse``
parame

In [18]:
# One-hot encode 'country'; drop='first' leaves out the first category.
countryHotEnc = OneHotEncoder(drop='first')
# Double square brackets select a 2-D, single-column frame for the encoder.
countryHotEnc.fit(df[['country']])
country_enc = countryHotEnc.transform(df[['country']]).toarray()

In [21]:
# Wrap the encoded array in a DataFrame. The column names come from the
# categories the encoder actually learned, skipping the first one because
# the encoder was created with drop='first'; this avoids hard-coding names
# that would silently mislabel columns if the data changed.
# The index is copied from df so the later column-wise concat lines up row by row.
hot_enc_countrydf = pd.DataFrame(
    country_enc,
    columns=list(countryHotEnc.categories_[0][1:]),
    index=df.index,
)

In [22]:
# Look up the pd.concat signature (IPython help syntax).
pd.concat?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mconcat[0m[1;33m([0m[1;33m
[0m    [0mobjs[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mIterable[0m[1;33m[[0m[0mForwardRef[0m[1;33m([0m[1;34m'NDFrame'[0m[1;33m)[0m[1;33m][0m[1;33m,[0m [0mMapping[0m[1;33m[[0m[0mOptional[0m[1;33m[[0m[0mHashable[0m[1;33m][0m[1;33m,[0m [0mForwardRef[0m[1;33m([0m[1;34m'NDFrame'[0m[1;33m)[0m[1;33m][0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mjoin[0m[1;33m=[0m[1;34m'outer'[0m[1;33m,[0m[1;33m
[0m    [0mignore_index[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mkeys[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlevels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnames[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverify_integrity[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m

In [26]:
# Put the one-hot country columns in front of the existing columns.
# Any copies of those columns already present in df are dropped first, so
# re-running this cell does not add the same columns a second time.
df = pd.concat(
    [hot_enc_countrydf, df.drop(columns=hot_enc_countrydf.columns, errors='ignore')],
    axis=1,
)

In [27]:
# Remove the original text column 'country' from the frame.
# errors='ignore' keeps the cell re-runnable once the column is already gone;
# reassignment replaces the redundant axis=1 / inplace=True arguments.
df = df.drop(columns=['country'], errors='ignore')
df

Unnamed: 0,italy,spain,salary,age,happy
0,0.0,0.0,49000,35,1
1,0.0,1.0,10000,30,0
2,1.0,0.0,230000,39,0
3,0.0,1.0,200000,30,1
4,1.0,0.0,300000,30,1
5,0.0,1.0,31000,23,0
6,0.0,0.0,179000,34,1
7,0.0,1.0,400000,31,1
8,1.0,0.0,200000,29,0
9,1.0,0.0,340000,35,1


SCALING or NORMALIZATION

In [30]:
# Columns to scale: every column except the label-encoded 'happy' column.
# Selecting by name instead of by position keeps 'happy' out of the list
# even if the column order changes.
cols = [col for col in df.columns if col != 'happy']
cols

['italy', 'spain', 'salary', 'age']

In [35]:
# Look up the StandardScaler signature and docstring (IPython help syntax).
StandardScaler?

[1;31mInit signature:[0m [0mStandardScaler[0m[1;33m([0m[1;33m*[0m[1;33m,[0m [0mcopy[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mwith_mean[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mwith_std[0m[1;33m=[0m[1;32mTrue[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Standardize features by removing the mean and scaling to unit variance

The standard score of a sample `x` is calculated as:

    z = (x - u) / s

where `u` is the mean of the training samples or zero if `with_mean=False`,
and `s` is the standard deviation of the training samples or one if
`with_std=False`.

Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Mean and
standard deviation are then stored to be used on later data using
:meth:`transform`.

Standardization of a dataset is a common requirement for many
machine learning estimators: they might behave badly if the
individual features do not more or le

In [32]:
# Standardize the selected columns: fit the scaler on them, then replace
# them with their scaled values.
scaling = StandardScaler()
scaling.fit(df[cols])
df[cols] = scaling.transform(df[cols])

In [33]:
# Final frame: every column except 'happy' has been standardized.
df

Unnamed: 0,italy,spain,salary,age,happy
0,-0.755929,-0.755929,-1.009405,0.911147,1
1,-0.755929,1.322876,-1.312226,-0.227787,0
2,1.322876,-0.755929,0.395997,1.822294,0
3,-0.755929,1.322876,0.163058,-0.227787,1
4,1.322876,-0.755929,0.939523,-0.227787,1
5,-0.755929,1.322876,-1.149169,-1.822294,0
6,-0.755929,-0.755929,0.0,0.68336,1
7,-0.755929,1.322876,1.715988,0.0,1
8,1.322876,-0.755929,0.163058,-0.455573,0
9,1.322876,-0.755929,1.250109,0.911147,1


# THE END