# Data Preprocessing
- Dealing with duplicate values
- Dealing with missing values
- Scaling
    - Standard Scaler
    - Min Max Scaler
- Dealing with categorical values
    - One hot encoding
    - Ordinal encoding
    - Label encoding
- Splitting data into training and test sets

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation and data-conversion warnings raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [45]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [46]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        11 non-null     float64
 2   Salary     11 non-null     float64
 3   Purchased  12 non-null     object 
dtypes: float64(2), object(2)
memory usage: 512.0+ bytes


In [47]:
# Number of distinct non-null values in each column.
df.nunique()

Country       3
Age          10
Salary       10
Purchased     2
dtype: int64

In [48]:
# List the distinct labels of both categorical columns (NaN is excluded for Country).
print('Countries : ', df['Country'].dropna().unique())
print('Purchased : ', df['Purchased'].unique())

Countries :  ['France' 'Spain' 'Germany']
Purchased :  ['No' 'Yes']


# Dealing with duplicates

In [49]:
# Count rows that are exact copies of an earlier row (the first occurrence is not counted).
df.duplicated().sum()

1

In [50]:
# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# -----------------------------------------------------------------------
# Dealing with missing values
- If number of missing values in a column is large with respect to total records then we can drop the column
- If missing values are in numerical column then we can replace by mean or median
- If missing values are in categorical column then we can replace by mode

In [51]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

### Using Pandas

In [52]:
# Fill values used for imputation: mean for numeric columns, mode for the categorical one.
avg_age = df['Age'].mean()
avg_salary = df['Salary'].mean()
# .mode() returns a Series (several values on a tie); take the first entry.
freq_country = df['Country'].mode()[0]

# Fill all three columns in one call on the frame itself. The previous
# `df.Age.replace(np.nan, ..., inplace = True)` pattern is a chained in-place
# update on a column accessor: pandas 2.x warns about it and, with Copy-on-Write
# (the default from pandas 3), it no longer modifies `df` at all.
df = df.fillna({'Age': avg_age, 'Salary': avg_salary, 'Country': freq_country})

df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Using Scikit Learn

In [53]:
# Reload a fresh copy of the data (still containing NaNs) for the scikit-learn
# approach, removing duplicate rows in the same step.
df2 = pd.read_csv('Data.csv').drop_duplicates()
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [54]:
from sklearn.impute import SimpleImputer

# Mean-impute the numeric columns: learn the column means first, then fill the NaNs.
imputer = SimpleImputer(strategy = 'mean', missing_values = np.nan)
imputer.fit(df2[['Age', 'Salary']])
df2[['Age', 'Salary']] = imputer.transform(df2[['Age', 'Salary']])

df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [55]:
# Fill the categorical column with its most frequent value (the mode).
imputer = SimpleImputer(strategy = 'most_frequent', missing_values = np.nan)
imputer.fit(df2[['Country']])
df2[['Country']] = imputer.transform(df2[['Country']])

df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64400.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.4,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# ----------------------------------------------------------------------------
# Scaling
- Used to bring features to same scale
        1. Standard Scaler
        2. Min Max Scaler

### Standard Scaler
- X_scaled = (X-X_mean)/X_std
- Performs z score transformation
- Zero mean
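- Unit variance (population variance, i.e. `ddof=0`; pandas' default `.var()` uses `ddof=1` and will report n/(n-1) instead)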

In [56]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns of df in two explicit steps:
# learn each column's mean and standard deviation, then apply the z-score transform.
scaler = StandardScaler()
scaler.fit(df[['Age', 'Salary']])
df[['Age', 'Salary']] = scaler.transform(df[['Age', 'Salary']])

In [57]:
# Inspect the frame after Age and Salary have been standardised.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,No
1,Spain,-1.667438,-1.545645,Yes
2,Germany,-1.294131,-0.980165,No
3,Spain,-0.298646,-0.320439,No
4,Germany,-0.049774,0.0,Yes
5,France,-0.671953,-0.603178,Yes
6,Spain,0.0,-1.168658,No
7,France,0.945711,1.376001,Yes
8,Germany,1.194582,1.752987,No
9,France,-0.423081,0.245041,Yes


In [58]:
# Sanity check: the mean of a standardised column should be ~0 (up to floating-point error).
df.Age.mean()

1.6148698540002277e-16

In [59]:
# pandas' .var() defaults to the sample variance (ddof=1), whereas StandardScaler
# divides by the population standard deviation (ddof=0). The value shown is
# therefore n/(n-1) rather than exactly 1; df.Age.var(ddof=0) would give 1.0.
df.Age.var()

1.0999999999999996

### Min Max Scaler
- X_scaled = (X - X_min) / (X_max - X_min)
- Rescales each feature to the range 0 to 1

In [60]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric columns of df2 to [0, 1]: learn each column's min and max,
# then apply the transform.
scaler = MinMaxScaler()
scaler.fit(df2[['Age', 'Salary']])
df2[['Age', 'Salary']] = scaler.transform(df2[['Age', 'Salary']])

df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.607143,0.685714,No
1,Spain,0.0,0.0,Yes
2,Germany,0.107143,0.171429,No
3,Spain,0.392857,0.371429,No
4,Germany,0.464286,0.468571,Yes
5,France,0.285714,0.285714,Yes
6,Spain,0.478571,0.114286,No
7,France,0.75,0.885714,Yes
8,Germany,0.821429,1.0,No
9,France,0.357143,0.542857,Yes


# ----------------------------------------------------------------------------
# Dealing with categorical data
- One Hot Encoding
- Label Encoding
- Ordinal Encoding 
- A dummy variable is a variable that takes on a value of either 0 or 1
to indicate the presence or absence of a categorical effect.
Dummy variables are also called indicator variables

In [61]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Country: one 0/1 indicator column per category.
# The encoder returns a sparse matrix, so convert it to a dense array for display.
encoder = OneHotEncoder()
encoder.fit(df[['Country']])
encoder.transform(df[['Country']]).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [62]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a 1-D array of labels, so pass the Country Series itself.
# Passing the one-column frame df[['Country']] hands it a column vector, which
# scikit-learn only accepts after raising a DataConversionWarning.
encoder = LabelEncoder()
encoder.fit_transform(df['Country'])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 0])

In [63]:
from sklearn.preprocessing import OrdinalEncoder

# Map each Country category to a single numeric code. OrdinalEncoder works on
# 2-D feature input, hence the one-column frame.
# NOTE(review): no explicit `categories` order is given, so the codes do not
# express a meaningful ranking of the countries.
encoder = OrdinalEncoder()
encoder.fit(df[['Country']])
encoder.transform(df[['Country']])

array([[0.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [64]:
# Binary-encode the target with a plain pandas mapping: 'Yes' -> 1, 'No' -> 0.
df['Purchased'] = df['Purchased'].map({'Yes' : 1, 'No' : 0})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,0
1,Spain,-1.667438,-1.545645,1
2,Germany,-1.294131,-0.980165,0
3,Spain,-0.298646,-0.320439,0
4,Germany,-0.049774,0.0,1
5,France,-0.671953,-0.603178,1
6,Spain,0.0,-1.168658,0
7,France,0.945711,1.376001,1
8,Germany,1.194582,1.752987,0
9,France,-0.423081,0.245041,1


In [65]:
# Display the frame again: scaled Age/Salary and the 0/1-encoded Purchased column.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.447968,0.716274,0
1,Spain,-1.667438,-1.545645,1
2,Germany,-1.294131,-0.980165,0
3,Spain,-0.298646,-0.320439,0
4,Germany,-0.049774,0.0,1
5,France,-0.671953,-0.603178,1
6,Spain,0.0,-1.168658,0
7,France,0.945711,1.376001,1
8,Germany,1.194582,1.752987,0
9,France,-0.423081,0.245041,1


In [35]:
# Separate the frame into a feature matrix and a target column vector (NumPy arrays).
X = df[['Country', 'Age', 'Salary']].to_numpy()
Y = df[['Purchased']].to_numpy()

In [36]:
# Feature matrix: Country is still a string, so the array has dtype=object.
X

array([['France', 0.4479683869155007, 0.7162744555037791],
       ['Spain', -1.6674378846299187, -1.5456448776660496],
       ['Germany', -1.2941308955336681, -0.9801650443735924],
       ['Spain', -0.2986455912770002, -0.3204385721990591],
       ['Germany', -0.04977426521283322, 0.0],
       ['France', -0.6719525803732507, -0.6031784888452876],
       ['Spain', 0.0, -1.1686583221377447],
       ['France', 0.9457110390438347, 1.3760009276783125],
       ['Germany', 1.1945823651080016, 1.7529874832066172],
       ['France', -0.42308125430908367, 0.2450412610933981],
       ['France', 1.816760680268419, 0.5277811777396266]], dtype=object)

In [37]:
# Target as a column vector of 0/1 labels.
Y

array([[0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]], dtype=int64)

In [69]:
from sklearn.compose import ColumnTransformer

# Widen NumPy's print width so each encoded row fits on a single line.
np.set_printoptions(linewidth = 200)

# One-hot encode column 0 (Country) and pass the remaining columns through unchanged.
transformer = ColumnTransformer(
    transformers = [('Encoder', OneHotEncoder(), [0])],
    remainder = 'passthrough',
)
X = transformer.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 0.4479683869155007, 0.7162744555037791],
       [0.0, 0.0, 1.0, -1.6674378846299187, -1.5456448776660496],
       [0.0, 1.0, 0.0, -1.2941308955336681, -0.9801650443735924],
       [0.0, 0.0, 1.0, -0.2986455912770002, -0.3204385721990591],
       [0.0, 1.0, 0.0, -0.04977426521283322, 0.0],
       [1.0, 0.0, 0.0, -0.6719525803732507, -0.6031784888452876],
       [0.0, 0.0, 1.0, 0.0, -1.1686583221377447],
       [1.0, 0.0, 0.0, 0.9457110390438347, 1.3760009276783125],
       [0.0, 1.0, 0.0, 1.1945823651080016, 1.7529874832066172],
       [1.0, 0.0, 0.0, -0.42308125430908367, 0.2450412610933981],
       [1.0, 0.0, 0.0, 1.816760680268419, 0.5277811777396266]], dtype=object)

# Splitting the data into training and test sets

In [70]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Fixing random_state makes the shuffle
# reproducible, so re-running the notebook yields the same split every time.
# NOTE(review): Age/Salary were scaled on the full dataset before this split, so
# test-set statistics leak into the training features; in a real pipeline, split
# first and fit the scaler on the training portion only.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [71]:
# Training features (70% of the rows).
x_train

array([[0.0, 1.0, 0.0, -1.2941308955336681, -0.9801650443735924],
       [1.0, 0.0, 0.0, -0.6719525803732507, -0.6031784888452876],
       [0.0, 1.0, 0.0, -0.04977426521283322, 0.0],
       [0.0, 1.0, 0.0, 1.1945823651080016, 1.7529874832066172],
       [0.0, 0.0, 1.0, -1.6674378846299187, -1.5456448776660496],
       [0.0, 0.0, 1.0, 0.0, -1.1686583221377447],
       [0.0, 0.0, 1.0, -0.2986455912770002, -0.3204385721990591]], dtype=object)

In [72]:
# Training labels, row-aligned with x_train.
y_train

array([[0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0]], dtype=int64)

In [73]:
# Held-out test features (30% of the rows).
x_test

array([[1.0, 0.0, 0.0, 0.9457110390438347, 1.3760009276783125],
       [1.0, 0.0, 0.0, 0.4479683869155007, 0.7162744555037791],
       [1.0, 0.0, 0.0, -0.42308125430908367, 0.2450412610933981],
       [1.0, 0.0, 0.0, 1.816760680268419, 0.5277811777396266]], dtype=object)

In [74]:
# Held-out test labels, row-aligned with x_test.
y_test

array([[1],
       [0],
       [1],
       [1]], dtype=int64)