In [1]:
# Importing the libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset

In [7]:
# Read the CSV file into a DataFrame that the following cells work on
dataset = pd.read_csv(filepath_or_buffer='data.csv')

In [8]:
# Display the frame loaded from 'data.csv' as this cell's output.
# In the recorded output one Salary value and one Age value are missing.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
from sklearn.impute import SimpleImputer  # fills gaps left by missing data

# Configure the imputer to look for NaN entries (missing_values=np.nan) and
# overwrite each one with the mean of its column (strategy='mean').
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Positional columns 1 and 2 (Age and Salary in the displayed table) are
# fitted and transformed in a single call, and the result is written back
# into the same slice of `dataset`.
dataset.iloc[:, 1:3] = imputer.fit_transform(dataset.iloc[:, 1:3])


In [11]:
# Display the dataset again after imputation. In the recorded output the two
# previously missing cells now hold the mean of their column.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding turns the Country column into one 0/1 indicator column per
# distinct country (three for this dataset).
# [0] is the positional index of the column the encoder is applied to;
# remainder='passthrough' keeps every other column, appended after the
# indicator columns.
# sparse_threshold=0 forces a dense array: OneHotEncoder emits a sparse matrix,
# and with many categories the stacked result would otherwise stay sparse,
# which cannot hold the string column that is passed through and is not what
# pd.DataFrame(...) expects here.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
data = pd.DataFrame(ct.fit_transform(dataset))


In [13]:
# Display the encoded frame. In the recorded output columns 0-2 are the 0/1
# country indicators and columns 3-5 carry Age, Salary and Purchased.
data

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,63777.8,Yes
5,1,0,0,35.0,58000.0,Yes
6,0,0,1,38.7778,52000.0,No
7,1,0,0,48.0,79000.0,Yes
8,0,1,0,50.0,83000.0,No
9,1,0,0,37.0,67000.0,Yes


In [14]:
from sklearn.preprocessing import LabelEncoder  # maps class labels to integers

# The last column of `data` holds the labels to encode. Learn the distinct
# labels first, then overwrite that column with their integer codes.
le = LabelEncoder()
le.fit(data.iloc[:, -1])
data.iloc[:, -1] = le.transform(data.iloc[:, -1])


In [15]:
# Display the frame after label encoding. In the recorded output the last
# column now holds 0 where it previously showed 'No' and 1 for 'Yes'.
data

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63777.8,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,38.7778,52000.0,0
7,1,0,0,48.0,79000.0,1
8,0,1,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1


In [18]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation rescales every column to the range 0..1 so that all
# features share the same scale.
# NOTE(review): the scaler is fitted on all rows and all columns of `data`
# (the encoded label in the last column included) before the train/test split
# further down - confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(data)
data = pd.DataFrame(scaler.transform(data))

In [16]:
# Display `data`.
# NOTE(review): this cell carries execution count 16 but sits after the
# scaling cell (count 18); the recorded output still shows unscaled Age and
# Salary values. Re-run the notebook top to bottom to refresh it.
data

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63777.8,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,38.7778,52000.0,0
7,1,0,0,48.0,79000.0,1
8,0,1,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1


In [19]:
# Split the frame into NumPy arrays: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector y.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# One print call with sep="\n" emits exactly the same text as printing the
# four items one after another.
print("Independent Variable\n", X, "\nDependent Variable\n", y, sep="\n")

Independent Variable

[[1.         0.         0.         0.73913043 0.68571429]
 [0.         0.         1.         0.         0.        ]
 [0.         1.         0.         0.13043478 0.17142857]
 [0.         0.         1.         0.47826087 0.37142857]
 [0.         1.         0.         0.56521739 0.45079365]
 [1.         0.         0.         0.34782609 0.28571429]
 [0.         0.         1.         0.51207729 0.11428571]
 [1.         0.         0.         0.91304348 0.88571429]
 [0.         1.         0.         1.         1.        ]
 [1.         0.         0.         0.43478261 0.54285714]]

Dependent Variable

[0. 1. 0. 0. 1. 1. 0. 1. 0. 1.]


In [20]:
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing and keeps 80% for training.
# random_state pins the shuffle so that every Restart & Run All yields the same
# split; without it the rows landing in each set change on every run.
# NOTE(review): any output recorded before this seed was added came from an
# unseeded shuffle and will differ after a re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# Show both halves of the split. sep="\n" reproduces the line breaks that
# separate print calls would emit, so the text output is unchanged.
print(
    "X_train\n", X_train,
    "\nX_test\n", X_test,
    "y_train\n", y_train,
    "\ny_test\n", y_test,
    sep="\n",
)

X_train

[[0.         1.         0.         0.13043478 0.17142857]
 [0.         1.         0.         0.56521739 0.45079365]
 [1.         0.         0.         0.43478261 0.54285714]
 [0.         0.         1.         0.         0.        ]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         1.         0.         1.         1.        ]
 [0.         0.         1.         0.47826087 0.37142857]
 [1.         0.         0.         0.34782609 0.28571429]]

X_test

[[1.         0.         0.         0.91304348 0.88571429]
 [0.         0.         1.         0.51207729 0.11428571]]
y_train

[0. 1. 1. 1. 0. 0. 0. 1.]

y_test

[1. 0.]
