In [None]:
# In this file we will doing data pre-processing with sci-kit learn (sklearn)

In [25]:
import numpy as np
import pandas as pd

In [26]:
data = pd.read_csv('pre-process_datasample.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [27]:
data.info

<bound method DataFrame.info of    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8      NaN  50.0  83000.0        No
9   France  37.0  67000.0       Yes>

In [4]:
#Sklearn always expects your data to be in the form of numbers esspecially when you are going to create the ML model
# Rules when working with sklearn for creating ML model
# 1. Sklearn expects your data to be complete
# 2. Sklearn expects your data to be numeric
# 3. Sklearn expects your data to be in the form numpy array


# Sklearn do help in handling missing data, though one of the limitation sklearn has is you need to ensure the column
# which requires Imputation(Missing value analysis) must be a numeric column.
#
# Country column cant be handled by Sklearn.

In [28]:
#Handle missing data of country column using pandas
data[data.Country.isnull()]

Unnamed: 0,Country,Age,Salary,Purchased
8,,50.0,83000.0,No


In [29]:
data.Country.fillna(data.Country.mode()[0], inplace=True)

In [30]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [31]:
data.Purchased.replace(['No','Yes'],[0,1], inplace=True)

In [32]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,,1
5,France,35.0,58000.0,1
6,Spain,,52000.0,0
7,France,48.0,79000.0,1
8,France,50.0,83000.0,0
9,France,37.0,67000.0,1


In [33]:
#Sklearn expects your data to be in the form numpy array
# We need to convert DF into numpy array


data1 = data.values
data1

array([['France', 44.0, 72000.0, 0],
       ['Spain', 27.0, 48000.0, 1],
       ['Germany', 30.0, 54000.0, 0],
       ['Spain', 38.0, 61000.0, 0],
       ['Germany', 40.0, nan, 1],
       ['France', 35.0, 58000.0, 1],
       ['Spain', nan, 52000.0, 0],
       ['France', 48.0, 79000.0, 1],
       ['France', 50.0, 83000.0, 0],
       ['France', 37.0, 67000.0, 1]], dtype=object)

In [34]:
type(data1)

numpy.ndarray

In [35]:
#Dealing Missing values for Age column and Salary column
from sklearn.impute import SimpleImputer 

# Step1: Initialize the object
missingData = SimpleImputer(missing_values=np.nan, strategy='mean')

#Step2: Perform Calculation on the columns which needs to be processed
missingData.fit(data1[:,[1,2]])

# Step3: Apply the calc over the numpy array (Replace NaN with mean values of respective cols)
data1[:,[1,2]] = missingData.transform(data1[:,[1,2]])

data1

array([['France', 44.0, 72000.0, 0],
       ['Spain', 27.0, 48000.0, 1],
       ['Germany', 30.0, 54000.0, 0],
       ['Spain', 38.0, 61000.0, 0],
       ['Germany', 40.0, 63777.77777777778, 1],
       ['France', 35.0, 58000.0, 1],
       ['Spain', 38.77777777777778, 52000.0, 0],
       ['France', 48.0, 79000.0, 1],
       ['France', 50.0, 83000.0, 0],
       ['France', 37.0, 67000.0, 1]], dtype=object)

In [18]:
# here we are got some sklearn error in "catigories = [0]" 
# after found solution we will install this.
#pip install -U scikit-learn==0.22

Collecting scikit-learn==0.22
  Downloading scikit_learn-0.22-cp37-cp37m-win_amd64.whl (6.2 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'c:\\users\\anoop\\anaconda3\\lib\\site-packages\\~klearn\\decomposition\\_cdnmf_fast.cp37-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [36]:
import sklearn
print(sklearn.__version__)

0.22.1


In [37]:
# Handle Categorical Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [38]:
le = LabelEncoder()
data1[:,0] = le.fit_transform(data1[:,0])
data1

array([[0, 44.0, 72000.0, 0],
       [2, 27.0, 48000.0, 1],
       [1, 30.0, 54000.0, 0],
       [2, 38.0, 61000.0, 0],
       [1, 40.0, 63777.77777777778, 1],
       [0, 35.0, 58000.0, 1],
       [2, 38.77777777777778, 52000.0, 0],
       [0, 48.0, 79000.0, 1],
       [0, 50.0, 83000.0, 0],
       [0, 37.0, 67000.0, 1]], dtype=object)

In [40]:
ct = ColumnTransformer([('data1', OneHotEncoder(), [0])], remainder='passthrough')
data1 = ct.fit_transform(data1)
data1

array([[1.0, 0.0, 0.0, 44.0, 72000.0, 0],
       [0.0, 0.0, 1.0, 27.0, 48000.0, 1],
       [0.0, 1.0, 0.0, 30.0, 54000.0, 0],
       [0.0, 0.0, 1.0, 38.0, 61000.0, 0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778, 1],
       [1.0, 0.0, 0.0, 35.0, 58000.0, 1],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0, 0],
       [1.0, 0.0, 0.0, 48.0, 79000.0, 1],
       [1.0, 0.0, 0.0, 50.0, 83000.0, 0],
       [1.0, 0.0, 0.0, 37.0, 67000.0, 1]], dtype=object)