# preprocessing
-  data cleaning
    - fixing missing values
        - imputation -When we have missing values in a feature column
        - removal - When we have missing values in Y(target) column we remove the rows

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [3]:
df=pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/sample_data.csv')
df.head()

Unnamed: 0,country,salary,age,happy
0,germany,49000.0,35.0,yes
1,spain,10000.0,,no
2,italy,230000.0,39.0,no
3,spain,200000.0,30.0,yes
4,italy,,30.0,yes


1. import the class from scikit
2. Create object of the class
3. Pass data to object of the class => fit() , fit_transform()
4. get the transformed data => transform()

In [7]:
from sklearn.impute import SimpleImputer 

In [5]:
df.isna().sum()

country    0
salary     2
age        2
happy      0
dtype: int64

In [6]:
cols= ['salary','age']
df[cols]

Unnamed: 0,salary,age
0,49000.0,35.0
1,10000.0,
2,230000.0,39.0
3,200000.0,30.0
4,,30.0
5,31000.0,23.0
6,,34.0
7,400000.0,
8,200000.0,29.0
9,340000.0,35.0


In [9]:
imputer = SimpleImputer()
imputer.fit_transform(df[cols])


array([[4.90000000e+04, 3.50000000e+01],
       [1.00000000e+04, 3.11111111e+01],
       [2.30000000e+05, 3.90000000e+01],
       [2.00000000e+05, 3.00000000e+01],
       [1.65555556e+05, 3.00000000e+01],
       [3.10000000e+04, 2.30000000e+01],
       [1.65555556e+05, 3.40000000e+01],
       [4.00000000e+05, 3.11111111e+01],
       [2.00000000e+05, 2.90000000e+01],
       [3.40000000e+05, 3.50000000e+01],
       [3.00000000e+04, 2.50000000e+01]])

In [10]:
df[cols] = imputer.fit_transform(df[cols])
df

Unnamed: 0,country,salary,age,happy
0,germany,49000.0,35.0,yes
1,spain,10000.0,31.111111,no
2,italy,230000.0,39.0,no
3,spain,200000.0,30.0,yes
4,italy,165555.555556,30.0,yes
5,spain,31000.0,23.0,no
6,germany,165555.555556,34.0,yes
7,spain,400000.0,31.111111,yes
8,italy,200000.0,29.0,no
9,italy,340000.0,35.0,yes


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
h_encoder= LabelEncoder()
df['happy']= h_encoder.fit_transform(df['happy'])
df

Unnamed: 0,country,salary,age,happy
0,germany,49000.0,35.0,1
1,spain,10000.0,31.111111,0
2,italy,230000.0,39.0,0
3,spain,200000.0,30.0,1
4,italy,165555.555556,30.0,1
5,spain,31000.0,23.0,0
6,germany,165555.555556,34.0,1
7,spain,400000.0,31.111111,1
8,italy,200000.0,29.0,0
9,italy,340000.0,35.0,1


In [13]:
h_encoder.classes_

array(['no', 'yes'], dtype=object)

In [14]:
h_encoder.transform(['no'])

array([0])

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
df['country'].values.reshape(-1,1).shape       # Will not use this


(11, 1)

In [21]:
df[['country']].shape       # this is better

(11, 1)

In [24]:
country_encoder = OneHotEncoder(drop='first')
dumy_countries = country_encoder.fit_transform(df[['country']]).toarray()
dumy_countries

array([[0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 0.]])

In [25]:
# Drop the encoded column 
df.drop(columns=['country'],inplace=True)
df.head()

Unnamed: 0,salary,age,happy
0,49000.0,35.0,1
1,10000.0,31.111111,0
2,230000.0,39.0,0
3,200000.0,30.0,1
4,165555.555556,30.0,1


In [26]:
pd.DataFrame(dumy_countries)

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
5,0.0,1.0
6,0.0,0.0
7,0.0,1.0
8,1.0,0.0
9,1.0,0.0


In [29]:
# add the dummy columns
df= pd.concat([df,pd.DataFrame(dumy_countries)],axis=1)
df

Unnamed: 0,salary,age,happy,0,1,0.1,1.1
0,49000.0,35.0,1,0.0,0.0,0.0,0.0
1,10000.0,31.111111,0,0.0,1.0,0.0,1.0
2,230000.0,39.0,0,1.0,0.0,1.0,0.0
3,200000.0,30.0,1,0.0,1.0,0.0,1.0
4,165555.555556,30.0,1,1.0,0.0,1.0,0.0
5,31000.0,23.0,0,0.0,1.0,0.0,1.0
6,165555.555556,34.0,1,0.0,0.0,0.0,0.0
7,400000.0,31.111111,1,0.0,1.0,0.0,1.0
8,200000.0,29.0,0,1.0,0.0,1.0,0.0
9,340000.0,35.0,1,1.0,0.0,1.0,0.0
