# Basic Machine Learning Data Preprocessing

In [1]:
#importing the libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the CSV file into a pandas DataFrame, then preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='sample.csv')
dataset.head(n=5)

Unnamed: 0,Names,Age,Salary,Interest
0,Ecil,23.0,10000.0,yes
1,Hel,21.0,8500.0,yes
2,Bhel,,9500.0,no
3,Ecil,24.0,,yes
4,Drdo,22.0,10500.0,no


In [3]:
# Split the dataset into X (independent variables) and y (dependent variable).
# X is every column except the last one; y is the last column.
# The target is selected with -1 rather than a hard-coded position so that it
# always agrees with the `:-1` slice used for X, whatever the column count.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Imputer transforms for completing missing values

In [4]:
# Imputer transform for complete missing values with NaN with empty cell 
from sklearn.preprocessing import Imputer

In [5]:
#using parameters missing_values = 'NaN' and Strategy = 'Mean'
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, 1:3])
imputer
X[:, 1:3] = imputer.transform(X[:, 1:3])



In [6]:
X

array([['Ecil', 23.0, 10000.0],
       ['Hel', 21.0, 8500.0],
       ['Bhel', 22.9375, 9500.0],
       ['Ecil', 24.0, 10411.764705882353],
       ['Drdo', 22.0, 10500.0],
       ['Drdo', 20.0, 10411.764705882353],
       ['Ecil', 22.9375, 8000.0],
       ['Hel', 22.0, 9000.0],
       ['Bhel', 19.0, 7500.0],
       ['Hel', 18.0, 7000.0],
       ['Ecil', 22.9375, 10411.764705882353],
       ['Drdo', 25.0, 11000.0],
       ['Hel', 21.0, 8500.0],
       ['Bhel', 22.9375, 9000.0],
       ['Bhel', 26.0, 13000.0],
       ['Drdo', 28.0, 16000.0],
       ['Ecil', 26.0, 15000.0],
       ['Ecil', 21.0, 8500.0],
       ['Ecil', 24.0, 10500.0],
       ['Drdo', 27.0, 15500.0]], dtype=object)

# Label Encoding

In [7]:
# Import LabelEncoder, which maps each distinct label to an integer code (codes follow the sorted order of the labels)
from sklearn.preprocessing import LabelEncoder

In [8]:
# Integer-encode the first column of X (the names) in place
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])
# Integer-encode the target labels
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

In [9]:
X

array([[2, 23.0, 10000.0],
       [3, 21.0, 8500.0],
       [0, 22.9375, 9500.0],
       [2, 24.0, 10411.764705882353],
       [1, 22.0, 10500.0],
       [1, 20.0, 10411.764705882353],
       [2, 22.9375, 8000.0],
       [3, 22.0, 9000.0],
       [0, 19.0, 7500.0],
       [3, 18.0, 7000.0],
       [2, 22.9375, 10411.764705882353],
       [1, 25.0, 11000.0],
       [3, 21.0, 8500.0],
       [0, 22.9375, 9000.0],
       [0, 26.0, 13000.0],
       [1, 28.0, 16000.0],
       [2, 26.0, 15000.0],
       [2, 21.0, 8500.0],
       [2, 24.0, 10500.0],
       [1, 27.0, 15500.0]], dtype=object)

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
onehotencode = OneHotEncoder(categorical_features= [0])
X = onehotencode.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so no test-set information leaks in.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [16]:
X_train

array([[-0.52223297, -0.63245553,  1.58113883, -0.52223297, -0.07666443,
        -1.02242689],
       [ 1.91485422, -0.63245553, -0.63245553, -0.52223297, -0.07666443,
        -0.61720191],
       [-0.52223297,  1.58113883, -0.63245553, -0.52223297, -0.45107213,
        -0.00936444],
       [ 1.91485422, -0.63245553, -0.63245553, -0.52223297, -0.07666443,
        -0.41458942],
       [-0.52223297,  1.58113883, -0.63245553, -0.52223297, -1.24980855,
        -0.04511959],
       [ 1.91485422, -0.63245553, -0.63245553, -0.52223297,  1.14640071,
         1.003698  ],
       [-0.52223297, -0.63245553, -0.63245553,  1.91485422, -2.04854497,
        -1.42765186],
       [-0.52223297, -0.63245553, -0.63245553,  1.91485422, -0.45107213,
        -0.61720191],
       [-0.52223297, -0.63245553,  1.58113883, -0.52223297,  1.14640071,
         1.81414796],
       [-0.52223297,  1.58113883, -0.63245553, -0.52223297,  0.7470325 ,
         0.19324805],
       [-0.52223297, -0.63245553,  1.58113883, -0.

In [17]:
X_test

array([[-0.52223297, -0.63245553,  1.58113883, -0.52223297,  0.34766429,
        -0.00936444],
       [-0.52223297, -0.63245553, -0.63245553,  1.91485422, -0.85044034,
        -0.8198144 ],
       [-0.52223297,  1.58113883, -0.63245553, -0.52223297,  1.54576892,
         2.01676045],
       [ 1.91485422, -0.63245553, -0.63245553, -0.52223297, -1.64917676,
        -1.22503938],
       [-0.52223297, -0.63245553,  1.58113883, -0.52223297, -0.07666443,
        -0.04511959],
       [-0.52223297, -0.63245553,  1.58113883, -0.52223297, -0.85044034,
        -0.8198144 ]])