# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# change path to datset location

In [2]:
import os

os.chdir("C:\\Users\\bhaskar\\python practice\\datasets")

# load dataset

In [3]:
df = pd.read_csv("Dataset.csv")

# view data as a data frame

In [4]:
df   

Unnamed: 0,Country,Age,Salary,Purchased
0,India,38.0,68000.0,No
1,France,43.0,45000.0,Yes
2,Germany,30.0,54000.0,No
3,France,48.0,65000.0,No
4,Germany,40.0,,Yes
5,India,35.0,58000.0,Yes
6,Germany,,53000.0,No
7,France,49.0,79000.0,Yes
8,India,50.0,88000.0,No
9,France,37.0,77000.0,yes


In [6]:
df.head()     #display first five rows

Unnamed: 0,Country,Age,Salary,Purchased
0,India,38.0,68000.0,No
1,France,43.0,45000.0,Yes
2,Germany,30.0,54000.0,No
3,France,48.0,65000.0,No
4,Germany,40.0,,Yes


# Extracting dependent and independent variables

# To extract an independent variable, we will use iloc[ ] method of Pandas library. 
# It is used to extract the required rows and columns from the dataset.
# In the below code, the first colon(:) is used to take all the rows, and the second colon(:) is for all the columns.
# Here we have used :-1, because we don't want to take the last column as it contains the dependent variable. 
# So by doing this, we will get the matrix of features.

In [7]:
x= df.iloc[:,:-1].values 

In [8]:
x

array([['India', 38.0, 68000.0],
       ['France', 43.0, 45000.0],
       ['Germany', 30.0, 54000.0],
       ['France', 48.0, 65000.0],
       ['Germany', 40.0, nan],
       ['India', 35.0, 58000.0],
       ['Germany', nan, 53000.0],
       ['France', 49.0, 79000.0],
       ['India', 50.0, 88000.0],
       ['France', 37.0, 77000.0]], dtype=object)

# Extracting dependent variable

In [9]:
y= df.iloc[:,3].values 

In [10]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'yes'],
      dtype=object)

# Handling Missing data

# There are mainly two ways to handle missing data, which are:

# By deleting the particular row: 
## The first way is used to commonly deal with null values. 
## In this way, we just delete the specific row or column which consists of null values. 
## But this way is not so efficient and removing data may lead to loss of information which will not give the accurate output.

# By calculating the mean: 
## In this way, we will calculate the mean of that column or row which contains
## any missing value and will put it on the place of missing value. 
# This strategy is useful for the features which have numeric data such as age, salary, year, etc. 


In [11]:
#handling missing data (Replacing missing data with the mean value)  
from sklearn.preprocessing import Imputer  
imputer= Imputer(missing_values ='NaN', strategy='mean', axis = 0)  



In [12]:
#Fitting imputer object to the independent variables x.   
imputerimputer= imputer.fit(x[:, 1:3])  
#Replacing missing data with the calculated mean value  
x[:, 1:3]= imputer.transform(x[:, 1:3])  

In [13]:
x

array([['India', 38.0, 68000.0],
       ['France', 43.0, 45000.0],
       ['Germany', 30.0, 54000.0],
       ['France', 48.0, 65000.0],
       ['Germany', 40.0, 65222.22222222222],
       ['India', 35.0, 58000.0],
       ['Germany', 41.111111111111114, 53000.0],
       ['France', 49.0, 79000.0],
       ['India', 50.0, 88000.0],
       ['France', 37.0, 77000.0]], dtype=object)

# Encoding Categorical data

## Since machine learning model completely works on mathematics and numbers, 
## but if our dataset would have a categorical variable, then it may create trouble while building the model. 
## So it is necessary to encode these categorical variables into numbers.

In [14]:
#Catgorical data  
#for Country Variable  
from sklearn.preprocessing import LabelEncoder  
label_encoder_x= LabelEncoder()  
x[:, 0]= label_encoder_x.fit_transform(x[:, 0])  

In [15]:
x

array([[2, 38.0, 68000.0],
       [0, 43.0, 45000.0],
       [1, 30.0, 54000.0],
       [0, 48.0, 65000.0],
       [1, 40.0, 65222.22222222222],
       [2, 35.0, 58000.0],
       [1, 41.111111111111114, 53000.0],
       [0, 49.0, 79000.0],
       [2, 50.0, 88000.0],
       [0, 37.0, 77000.0]], dtype=object)

# Dummy Variables:


# There  are three country variables, and as we can see in the above output, these variables are encoded into 0, 1, and 2.
# By these values, the machine learning model may assume that there is some correlation between these variables which will produce the wrong output. So to remove this issue, we will use dummy encoding.


# In our dataset, we have 3 categories so it will produce three columns having 0 and 1 values.
# For Dummy Encoding, we will use OneHotEncoder class of preprocessing library.

In [16]:
#for Country Variable  
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  
label_encoder_x= LabelEncoder()  
x[:, 0]= label_encoder_x.fit_transform(x[:, 0])  
#Encoding for dummy variables  
onehot_encoder= OneHotEncoder(categorical_features= [0])    
x= onehot_encoder.fit_transform(x).toarray()  

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
x

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.30000000e+01,
        4.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        6.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.52222222e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.11111111e+01,
        5.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.90000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 5.00000000e+01,
        8.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        7.70000000e+04]])

# For Purchased Variable:

In [18]:
labelencoder_y= LabelEncoder()  
y= labelencoder_y.fit_transform(y)  

In [19]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 2])

## For the second categorical variable, we will only use labelencoder object of LableEncoder class. 
## Here we are not using OneHotEncoder class because the purchased variable has only two categories yes or no, 
## and which are automatically encoded into 0 and 1.

# Splitting the Dataset into the Training set and Test set

In [20]:
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, random_state=0)  

In [21]:
x

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.30000000e+01,
        4.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        6.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.52222222e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.11111111e+01,
        5.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.90000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 5.00000000e+01,
        8.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        7.70000000e+04]])

In [22]:
x_train

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.52222222e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        7.70000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.30000000e+01,
        4.50000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.11111111e+01,
        5.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.90000000e+01,
        7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        6.50000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])

In [23]:
x_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 5.0e+01, 8.8e+04]])

In [24]:
y_train

array([1, 2, 1, 0, 1, 0, 0, 1])

In [25]:
y_test

array([0, 0])

#  Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler 
st_x= StandardScaler()  
x_train= st_x.fit_transform(x_train)  
x_test= st_x.transform(x_test)  

In [27]:
x_train

array([[-1.        ,  1.73205081, -0.57735027, -0.29460737,  0.1339619 ],
       [ 1.        , -0.57735027, -0.57735027, -0.93095928,  1.22626663],
       [ 1.        , -0.57735027, -0.57735027,  0.34174455, -1.74150472],
       [-1.        ,  1.73205081, -0.57735027, -0.05892147, -0.99956188],
       [ 1.        , -0.57735027, -0.57735027,  1.61444837,  1.41175234],
       [ 1.        , -0.57735027, -0.57735027,  1.40233107,  0.11335238],
       [-1.        , -0.57735027,  1.73205081, -0.71884198,  0.39158094],
       [-1.        , -0.57735027,  1.73205081, -1.35519389, -0.5358476 ]])

In [28]:
x_test

array([[-1.        ,  1.73205081, -0.57735027, -2.41578041, -0.90681902],
       [-1.        , -0.57735027,  1.73205081,  1.82656568,  2.24643804]])