In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn 


# DATA PREPROCESSING

In [6]:
# import dataset
df = pd.read_csv('Data.csv')
# assigning independent variables to X
X = df.iloc[:, :-1].values
# assigning dependent variables to y
y = df.iloc[:, -1].values

In [7]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [9]:
# taking care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [10]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Categorical data is when there are some data in a dataset which are not
# many so one can easily change them into some numbers for a machine learning 
# to work on it perfectly
# example in the above there are three countries in the independent variable
# They are Germany, France and Spain: 0, 1, 3
# that is the only way of dealing with categorical values
# we could use oneHotEncoder or labelcoder
# onehotencoder usual create new columns out of the dataset given

In [11]:
# encoding independent
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [12]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


# Encoding the Dependent Variable

In [13]:
# In dealing with dependent variable with deals with yes or no
# use labelencoder since it comes with text
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [14]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the dataset into the Training set and Test set

Feature scaling is always applied after splitting the dataset
because we want to prevent information leakage across the entire dataset

In [16]:
# splitting data in order to have a train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Note the splitting carefully, the dataset is divided into two parts;

## training and testing

training dataset is divided into two x and y(that is x for independent variables and its corresponding dependent variables)

In the same vein, the test set is also divided into two parts
the X_test(independent variables) and its corresponding dependent variables

In [17]:
# independent variables
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [25]:
#dependent variables
print(y_train)

[0 1 0 0 1 1 0 1]


In [26]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [27]:
print(y_test)

[0 1]


# Feature Scaling

## This makes all features scale across board. That is to make sure some features are not dominated than others when using a machine learning model
Feature scaling is not used all the time with regard to machine learning models

## The most common feature scaling techniques are
1. Standardisation: this is x-mean(x) divided by standard deviation of x.

2. Normalisation: x-min(x) divided by max(x)-min(x)

## Most recommended is standardisation

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
6     NaN
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64

In [43]:
# Lets apply standardisation
# This is cannot be applied on dummy values
# this changes brings about a range from -3 to 3
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


## Note: fit = apply
## Transform = take the action or transform
## Both can be used at the same time
## that is fit_transform


In [55]:
# Only use the transform method to change the X-test in order
# to make it ready for testing the reason the fit method is 
# taken out
X_test[:, 3:] = sc.transform(X_test[:, 3:])
X_test


array([[0.0, 1.0, 0.0, -13.11078717201166, -9.309124480659081],
       [1.0, 0.0, 0.0, -13.064139941690962, -9.309124480651798]],
      dtype=object)

In [32]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes
