# Data Preprocessing

## Import Packages

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

## Load Data

In [2]:
# Read the raw records from the CSV file (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer='DataSet.csv')

## Separating Features and Target

In [3]:
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target vector: the last column. Selecting it with -1 (instead of the
# hard-coded position 3) keeps it consistent with the X slice above even if
# feature columns are added or removed.
y = dataset.iloc[:, -1].values

## Preprocessing with Imputer

In [4]:
from sklearn.preprocessing import Imputer

## Handle Missing Value

## Mean Strategy

In [5]:
imputer = Imputer(missing_values="NaN",strategy='mean',axis=0)



## Fit into Imputer

In [6]:
# Learn the fill value for each of columns 1 and 2 of X.
imputer = imputer.fit(X[:, 1:3])

## Transform the Data

In [7]:
# Write the imputed values back into columns 1 and 2 of X.
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [8]:
# Display the feature matrix after mean imputation of columns 1 and 2.
X

array([['Karachi', 42.0, 78000.0],
       ['Lahore', 32.0, 48000.0],
       ['Karachi', 36.0, 60000.0],
       ['Multan', 41.0, 68000.0],
       ['Lahore', 42.0, 68777.77777777778],
       ['Multan', 43.0, 59000.0],
       ['Karachi', 40.44444444444444, 59000.0],
       ['Lahore', 44.0, 79000.0],
       ['Multan', 52.0, 99000.0],
       ['Lahore', 32.0, 69000.0]], dtype=object)

In [9]:
# Re-extract the feature columns from the DataFrame under a separate name,
# to be imputed independently of X.
XX = dataset.iloc[:, :-1].values

## Median Strategy

In [10]:
from sklearn.impute import SimpleImputer

# `Imputer` was removed in scikit-learn 0.22; `SimpleImputer` replaces it
# (column-wise imputation, missing marker given as np.nan).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Learn the median of columns 1 and 2, then fill their missing entries.
imputer = imputer.fit(XX[:, 1:3])
XX[:, 1:3] = imputer.transform(XX[:, 1:3])



In [11]:
# Display the feature matrix after median imputation of columns 1 and 2.
XX

array([['Karachi', 42.0, 78000.0],
       ['Lahore', 32.0, 48000.0],
       ['Karachi', 36.0, 60000.0],
       ['Multan', 41.0, 68000.0],
       ['Lahore', 42.0, 68000.0],
       ['Multan', 43.0, 59000.0],
       ['Karachi', 42.0, 59000.0],
       ['Lahore', 44.0, 79000.0],
       ['Multan', 52.0, 99000.0],
       ['Lahore', 32.0, 69000.0]], dtype=object)

## Most Frequent Strategy

In [12]:
from sklearn.impute import SimpleImputer

# Re-extract the feature columns under a separate name for this strategy.
XXX = dataset.iloc[:, :-1].values
# `Imputer` was removed in scikit-learn 0.22; `SimpleImputer` replaces it
# (column-wise imputation, missing marker given as np.nan).
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Learn the most frequent value of columns 1 and 2, then fill their missing
# entries with it.
imputer = imputer.fit(XXX[:, 1:3])
XXX[:, 1:3] = imputer.transform(XXX[:, 1:3])



In [13]:
# Display the feature matrix after most-frequent imputation of columns 1 and 2.
XXX

array([['Karachi', 42.0, 78000.0],
       ['Lahore', 32.0, 48000.0],
       ['Karachi', 36.0, 60000.0],
       ['Multan', 41.0, 68000.0],
       ['Lahore', 42.0, 59000.0],
       ['Multan', 43.0, 59000.0],
       ['Karachi', 32.0, 59000.0],
       ['Lahore', 44.0, 79000.0],
       ['Multan', 52.0, 99000.0],
       ['Lahore', 32.0, 69000.0]], dtype=object)

## Dummy Variables with OneHotEncoder

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Re-extract the feature columns from the DataFrame into X.
X = dataset.iloc[:, :-1].values

# Replace the values in column 0 with integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [15]:
# Display X with column 0 label-encoded; columns 1 and 2 were re-extracted
# from the DataFrame and have not been imputed in this cell sequence.
X

array([[0, 42.0, 78000.0],
       [1, 32.0, 48000.0],
       [0, 36.0, 60000.0],
       [2, 41.0, 68000.0],
       [1, 42.0, nan],
       [2, 43.0, 59000.0],
       [0, nan, 59000.0],
       [1, 44.0, 79000.0],
       [2, 52.0, 99000.0],
       [1, 32.0, 69000.0]], dtype=object)

In [16]:
from sklearn.impute import SimpleImputer

# Re-extract the feature columns from the DataFrame into X.
X = dataset.iloc[:, :-1].values
# `Imputer` was removed in scikit-learn 0.22; `SimpleImputer` replaces it
# (column-wise imputation, missing marker given as np.nan).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Learn the median of columns 1 and 2, then fill their missing entries.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])



In [17]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace the values in column 0 of X with integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [18]:
# Display X after median imputation (columns 1 and 2) and label encoding (column 0).
X

array([[0, 42.0, 78000.0],
       [1, 32.0, 48000.0],
       [0, 36.0, 60000.0],
       [2, 41.0, 68000.0],
       [1, 42.0, 68000.0],
       [2, 43.0, 59000.0],
       [0, 42.0, 59000.0],
       [1, 44.0, 79000.0],
       [2, 52.0, 99000.0],
       [1, 32.0, 69000.0]], dtype=object)

In [19]:
from sklearn.compose import ColumnTransformer

# `OneHotEncoder(categorical_features=[0])` was removed in scikit-learn 0.22.
# The supported way to one-hot encode a single column is a ColumnTransformer:
# column 0 goes through OneHotEncoder, the remaining columns pass through
# unchanged and are appended after the dummy columns.
# sparse_threshold=0 forces a dense result (the old code called .toarray()).
onehotencoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# X is an object array, so the stacked result is cast to float explicitly to
# obtain a numeric matrix as before.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

Note: if you used a LabelEncoder before the OneHotEncoder only to convert the categories to integers, that step is no longer needed — recent versions of scikit-learn's OneHotEncoder can encode string categories directly.


In [21]:
# Display X after one-hot encoding of column 0.
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 4.2e+01, 7.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.2e+01, 4.8e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+01, 6.0e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.1e+01, 6.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.2e+01, 6.8e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.3e+01, 5.9e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.2e+01, 5.9e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.4e+01, 7.9e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 5.2e+01, 9.9e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.2e+01, 6.9e+04]])

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Display the target values that ended up in the test split.
y_test

array(['No', 'No'], dtype=object)

## Feature Scaling (Standardization)

In [24]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# Fit the scaler on the training data only, then reuse those statistics for
# the test data. Calling fit_transform on the test set would scale it with its
# own mean/std, making train and test features inconsistent and leaking
# test-set information into preprocessing.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [25]:
# Display the standardized training features.
X_train

array([[-0.57735027,  1.        , -0.57735027,  0.49468474,  0.20628425],
       [-0.57735027,  1.        , -0.57735027, -1.7039141 ,  0.30942637],
       [-0.57735027,  1.        , -0.57735027, -1.7039141 , -1.85655824],
       [ 1.73205081, -1.        , -0.57735027,  0.49468474, -0.72199487],
       [-0.57735027,  1.        , -0.57735027,  0.93440451,  1.34084762],
       [-0.57735027, -1.        ,  1.73205081,  0.27482485,  0.20628425],
       [ 1.73205081, -1.        , -0.57735027,  0.49468474,  1.2377055 ],
       [-0.57735027, -1.        ,  1.73205081,  0.71454462, -0.72199487]])