<a href="https://colab.research.google.com/github/avi78/AI_ML/blob/main/Data_Preprocessing_Tools_Dataset_1NT21CS024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the CSV and split it into features and target: every column except
# the last becomes X, the last column becomes y.
dataset = pd.read_csv('Data_set.csv')
# .to_numpy() is the pandas-recommended replacement for .values. It always
# returns a NumPy ndarray, whereas .values may hand back an extension array
# for extension-typed columns; later cells assign into slices of X in place.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Show the raw feature matrix: a country string followed by two numeric
# columns, some of which are nan at this point.
print(X)

[['Korea' 31.0 72000.0]
 ['India' 23.0 48000.0]
 ['Germany' 43.0 54000.0]
 ['Spain' 29.0 61000.0]
 ['Spain' 51.0 61000.0]
 ['Germany' 40.0 nan]
 ['Spain' nan 52000.0]
 ['France' 25.0 79000.0]
 ['Germany' 53.0 83000.0]
 ['France' 42.0 67000.0]
 ['France' 39.0 61000.0]
 ['Spain' nan 61000.0]
 ['Spain' 51.0 nan]
 ['Spain' 54.0 nan]
 ['Germany' 30.0 83000.0]
 ['India' 44.0 48000.0]
 ['Germany' 48.0 54000.0]
 ['Germany' 41.0 55000.0]
 ['India' 35.0 48000.0]]


In [4]:
# Show the raw target vector (string labels 'Yes' / 'No').
print(y)

['Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No'
 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes']


## Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer
# Replace nan entries in columns 1 and 2 with the mean of the respective column.
# NOTE(review): the means are computed over every row of X, i.e. before the
# train/test split further down, so rows that end up in the test set
# contribute to the fill values.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [6]:
# Show X after imputation: the former nan entries in columns 1 and 2 now hold
# the column means.
print(X)

[['Korea' 31.0 72000.0]
 ['India' 23.0 48000.0]
 ['Germany' 43.0 54000.0]
 ['Spain' 29.0 61000.0]
 ['Spain' 51.0 61000.0]
 ['Germany' 40.0 61687.5]
 ['Spain' 39.94117647058823 52000.0]
 ['France' 25.0 79000.0]
 ['Germany' 53.0 83000.0]
 ['France' 42.0 67000.0]
 ['France' 39.0 61000.0]
 ['Spain' 39.94117647058823 61000.0]
 ['Spain' 51.0 61687.5]
 ['Spain' 54.0 61687.5]
 ['Germany' 30.0 83000.0]
 ['India' 44.0 48000.0]
 ['Germany' 48.0 54000.0]
 ['Germany' 41.0 55000.0]
 ['India' 35.0 48000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 (the country name) and pass the remaining columns
# through unchanged; the indicator columns come first in the result.
# sparse_threshold=0 forces a dense result. With the default threshold (0.3),
# ColumnTransformer returns a SciPy sparse matrix once the combined output is
# sparse enough (e.g. with many more categories), and np.array() on a sparse
# matrix yields a 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [8]:
# Show X after one-hot encoding: five indicator columns followed by the two
# numeric columns.
print(X)

[[0.0 0.0 0.0 1.0 0.0 31.0 72000.0]
 [0.0 0.0 1.0 0.0 0.0 23.0 48000.0]
 [0.0 1.0 0.0 0.0 0.0 43.0 54000.0]
 [0.0 0.0 0.0 0.0 1.0 29.0 61000.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 61000.0]
 [0.0 1.0 0.0 0.0 0.0 40.0 61687.5]
 [0.0 0.0 0.0 0.0 1.0 39.94117647058823 52000.0]
 [1.0 0.0 0.0 0.0 0.0 25.0 79000.0]
 [0.0 1.0 0.0 0.0 0.0 53.0 83000.0]
 [1.0 0.0 0.0 0.0 0.0 42.0 67000.0]
 [1.0 0.0 0.0 0.0 0.0 39.0 61000.0]
 [0.0 0.0 0.0 0.0 1.0 39.94117647058823 61000.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 61687.5]
 [0.0 0.0 0.0 0.0 1.0 54.0 61687.5]
 [0.0 1.0 0.0 0.0 0.0 30.0 83000.0]
 [0.0 0.0 1.0 0.0 0.0 44.0 48000.0]
 [0.0 1.0 0.0 0.0 0.0 48.0 54000.0]
 [0.0 1.0 0.0 0.0 0.0 41.0 55000.0]
 [0.0 0.0 1.0 0.0 0.0 35.0 48000.0]]


### Encoding the Dependent Variable

In [9]:
from sklearn.preprocessing import LabelEncoder
# Encode the string target as integers; in the recorded run 'No' becomes 0
# and 'Yes' becomes 1. The fitted encoder stays available as `le`.
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
# Show the integer-encoded target vector.
print(y)

[1 1 1 1 0 0 0 1 0 0 1 0 0 1 0 1 1 1 1]


## Splitting the dataset into the Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [12]:
# Show the training features (15 rows in the recorded run).
print(X_train)

[[0.0 1.0 0.0 0.0 0.0 43.0 54000.0]
 [0.0 1.0 0.0 0.0 0.0 30.0 83000.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 61000.0]
 [0.0 1.0 0.0 0.0 0.0 48.0 54000.0]
 [1.0 0.0 0.0 0.0 0.0 25.0 79000.0]
 [0.0 0.0 1.0 0.0 0.0 23.0 48000.0]
 [0.0 0.0 0.0 0.0 1.0 54.0 61687.5]
 [0.0 0.0 0.0 1.0 0.0 31.0 72000.0]
 [0.0 0.0 1.0 0.0 0.0 35.0 48000.0]
 [0.0 1.0 0.0 0.0 0.0 41.0 55000.0]
 [1.0 0.0 0.0 0.0 0.0 42.0 67000.0]
 [0.0 1.0 0.0 0.0 0.0 53.0 83000.0]
 [0.0 0.0 0.0 0.0 1.0 51.0 61687.5]
 [0.0 0.0 0.0 0.0 1.0 39.94117647058823 61000.0]
 [0.0 1.0 0.0 0.0 0.0 40.0 61687.5]]


In [13]:
# Show the held-out test features (4 rows in the recorded run).
print(X_test)

[[0.0 0.0 0.0 0.0 1.0 29.0 61000.0]
 [0.0 0.0 1.0 0.0 0.0 44.0 48000.0]
 [0.0 0.0 0.0 0.0 1.0 39.94117647058823 52000.0]
 [1.0 0.0 0.0 0.0 0.0 39.0 61000.0]]


In [14]:
# Show the encoded labels that belong to the training rows.
print(y_train)

[1 0 0 1 1 1 1 1 1 1 0 0 0 0 0]


In [15]:
# Show the encoded labels that belong to the test rows.
print(y_test)

[1 1 0 1]


## Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so the test rows never influence it.
# NOTE(review): every column is standardized, including the one-hot indicator
# columns produced earlier — confirm that is intended.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [17]:
# Show the standardized training features.
print(X_train)

[[-0.39223227  1.22474487 -0.39223227 -0.26726124 -0.60302269  0.26246296
  -0.84297196]
 [-0.39223227  1.22474487 -0.39223227 -0.26726124 -0.60302269 -1.08230476
   1.77509356]
 [-0.39223227 -0.81649658 -0.39223227 -0.26726124  1.6583124   1.09001233
  -0.21102511]
 [-0.39223227  1.22474487 -0.39223227 -0.26726124 -0.60302269  0.77968132
  -0.84297196]
 [ 2.54950976 -0.81649658 -0.39223227 -0.26726124 -0.60302269 -1.59952311
   1.41398107]
 [-0.39223227 -0.81649658  2.54950976 -0.26726124 -0.60302269 -1.80641046
  -1.38464068]
 [-0.39223227 -0.81649658 -0.39223227 -0.26726124  1.6583124   1.40034334
  -0.1489589 ]
 [-0.39223227 -0.81649658 -0.39223227  3.74165739 -0.60302269 -0.97886109
   0.78203422]
 [-0.39223227 -0.81649658  2.54950976 -0.26726124 -0.60302269 -0.56508641
  -1.38464068]
 [-0.39223227  1.22474487 -0.39223227 -0.26726124 -0.60302269  0.05557562
  -0.75269383]
 [ 2.54950976 -0.81649658 -0.39223227 -0.26726124 -0.60302269  0.15901929
   0.33064362]
 [-0.39223227  1.2247

In [18]:
# Show the test features after scaling with the scaler fitted on X_train.
print(X_test)

[[-0.39223227 -0.81649658 -0.39223227 -0.26726124  1.6583124  -1.18574843
  -0.21102511]
 [-0.39223227 -0.81649658  2.54950976 -0.26726124 -0.60302269  0.36590663
  -1.38464068]
 [-0.39223227 -0.81649658 -0.39223227 -0.26726124  1.6583124  -0.05395297
  -1.0235282 ]
 [ 2.54950976 -0.81649658 -0.39223227 -0.26726124 -0.60302269 -0.15131172
  -0.21102511]]
