# Import Libraries & Dataset

In [181]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [182]:
# Load the raw dataset into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv('Data.csv')

# Split the frame by position with .iloc: every column except the last is a
# feature (independent variable), the last column is the target (dependent
# variable).
# .to_numpy() is used rather than .values because it always hands back a plain
# NumPy ndarray, whereas .values can return a pandas extension array for some
# column dtypes. The cells below rely on ndarray slicing and assignment.
x = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [183]:
# Preview the features: one string column followed by two numeric columns,
# with nan marking the missing entries.
print (x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [184]:
# Preview the target labels ('Yes' / 'No').
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Missing Data

In [185]:
# Handle missing data by replacing each NaN with the mean of its column.
# NOTE(review): the means are learned from every row of x, i.e. before the
# train/test split further down, so the test rows influence the imputed
# values. Consider fitting the imputer on the training rows only.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Only columns 1 and 2 are passed in: column 0 holds strings, for which a mean
# cannot be computed. fit_transform first learns each column's mean (fit) and
# then returns the data with every NaN replaced by it (transform); the result
# is written back into the same columns of x.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Encoding Categorical Data

In [186]:
# One-hot encode the string column (index 0) so that a model does not read an
# artificial numeric order into the categories.
# NOTE: this cell overwrites x, so re-running it without first re-running the
# cells above would encode the already-encoded array a second time.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# transformers: (name, transformer, column indexes) triples to apply.
# remainder='passthrough': keep the columns that are not transformed instead
#   of dropping them; they are placed after the encoded columns.
# sparse_threshold=0: always return a dense array. With the default (0.3), a
#   column with many categories can make the combined result a SciPy sparse
#   matrix, which np.array() would wrap in a 0-d object array rather than
#   convert to a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))

In [187]:
# Preview x after encoding: the string column is now three 0/1 indicator
# columns, placed ahead of the two numeric columns.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [188]:
# Encode the target labels as integers.
from sklearn.preprocessing import LabelEncoder

# Learn the set of classes first, then map every label to its integer code
# (for this data 'No' becomes 0 and 'Yes' becomes 1).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [189]:
# Preview the encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the dataset into training and test sets

In [190]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible.
# Feature scaling is applied AFTER this split on purpose: the test set stands
# in for brand-new data, so computing scaling statistics from it beforehand
# would leak information into training.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [191]:
# Preview the training features (8 of the 10 rows).
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [192]:
# Preview the held-out test features (the remaining 2 rows).
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [193]:
# Preview the training labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [194]:
# Preview the test labels.
print(y_test)

[0 1]


# Feature Scaling

In [195]:
# Feature scaling keeps features with large magnitudes from dominating the
# others; it matters for some models, not all.
#   standardization: (x - mean(x)) / std(x)            -> mostly within [-3, 3]
#   normalization:   (x - min(x)) / (max(x) - min(x))  -> within [0, 1]
# Standardization works in most cases and is what is used here. The original
# note recommends normalization when features are roughly normally
# distributed -- NOTE(review): guidance on this varies, verify before relying
# on it.
from sklearn.preprocessing import StandardScaler

# Only columns 3 onward are scaled: columns 0-2 are the one-hot dummy
# variables and must stay as 0/1 indicators.
# The scaler is fitted on the training rows only (fitting is what computes the
# mean and std). The test rows are never fitted on; they are only transformed,
# using those same training statistics.
sc = StandardScaler().fit(x_train[:, 3:])
x_train[:, 3:] = sc.transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [196]:
# Preview the training features after scaling: the three indicator columns
# are unchanged and the last two columns are standardized.
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [197]:
# Preview the test features after scaling with the statistics learned from
# the training set.
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
