##### Load libraries

In [1]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Handling missing data 
from sklearn.impute import SimpleImputer

# Encoding categorical Data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler

##### Load data

In [2]:
# Keep the frame exactly as read from disk in country_df_original and do all
# further work on an independent (deep) copy, so the raw data stays available.
country_df_original = pd.read_csv('country.csv')
country_df = country_df_original.copy(deep=True)
print(country_df)

Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [3]:
# Column dtypes and non-null counts, followed by the (rows, columns) shape.
country_df.info()
print(f'Dataset shape: {country_df.shape}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes
Dataset shape: (10, 4)


In [4]:
# List the column labels; 'Purchased' is the column used as the target below.
country_df.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [5]:
# Feature frame: every column except the 'Purchased' target.
# The column is named via `columns=` rather than a positional axis argument:
# passing axis positionally to DataFrame.drop was deprecated in pandas 1.3
# and raises a TypeError from pandas 2.0 onwards.
x_df = country_df.drop(columns="Purchased")
x_df

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [6]:
# Target column as a pandas Series (string labels, see the output below).
y_df = country_df["Purchased"]
y_df

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [7]:
# Array versions of the same split, selected by position:
# every column but the last is a feature, the last column is the target.
x = country_df.iloc[:,:-1].values
y = country_df.iloc[:,-1].values

In [8]:
# Replace missing values in columns 1 and 2 of x (Age, Salary) with the column mean.
# In x[:, 1:3] the part before the comma selects every row; 1:3 selects
# columns 1 and 2 (the stop index 3 is excluded).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
# Tip: append .round(2) to the transformed array to keep two decimal places.

In [9]:
# Show the feature array after mean-imputing columns 1 and 2.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [10]:
# Same mean imputation, this time applied to the DataFrame columns.
# The imputer is re-fitted for each column in turn, so after the loop it
# holds the statistics of the last column processed ('Salary').
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
for column in ['Age', 'Salary']:
    x_df[column] = imputer.fit_transform(x_df[[column]])
x_df

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


##### Applying one hot encoder

In [11]:
# One-hot encode column 0 (Country) and pass the remaining columns through unchanged.
# NOTE: this cell overwrites x, so re-running it would encode the new column 0 again;
# run it once per kernel session.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

In [12]:
# Show the feature array after encoding: the one-hot columns come first,
# followed by the passthrough columns.
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

##### Applying label encoder

In [13]:
# Convert the string labels in y to integer class codes.
# le keeps the fitted mapping, so it can translate codes back to labels later.
le = LabelEncoder() 
y = le.fit_transform(y)

In [14]:
# Show the integer-encoded target.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

##### Split the data into train & test

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [16]:
# Training features as produced by the split.
x_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [17]:
# Test features as produced by the split.
x_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [18]:
# Training labels (integer class codes).
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [19]:
# Test labels (integer class codes).
y_test

array([0, 1])

##### Feature Scaling

In [20]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so no test information reaches the fit.
# NOTE(review): every column is scaled, including the one-hot country
# columns - confirm that scaling the dummy columns is intended.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [22]:
# Training features after standardization.
x_train

array([[-0.77459667, -0.57735027,  1.29099445, -0.19159184, -1.07812594],
       [-0.77459667,  1.73205081, -0.77459667, -0.01411729, -0.07013168],
       [ 1.29099445, -0.57735027, -0.77459667,  0.56670851,  0.63356243],
       [-0.77459667, -0.57735027,  1.29099445, -0.30453019, -0.30786617],
       [-0.77459667, -0.57735027,  1.29099445, -1.90180114, -1.42046362],
       [ 1.29099445, -0.57735027, -0.77459667,  1.14753431,  1.23265336],
       [-0.77459667,  1.73205081, -0.77459667,  1.43794721,  1.57499104],
       [ 1.29099445, -0.57735027, -0.77459667, -0.74014954, -0.56461943]])

In [21]:
# Test features transformed with the scaler that was fitted on the training rows.
x_test

array([[-0.77459667,  1.73205081, -0.77459667, -1.46618179, -0.9069571 ],
       [ 1.29099445, -0.57735027, -0.77459667, -0.44973664,  0.20564034]])

##### Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training data and predict the test rows.
# NOTE(review): y holds 0/1 class codes from LabelEncoder, so the predictions
# are unbounded continuous values rather than class labels; consider a
# classifier (e.g. LogisticRegression) if class predictions are wanted.
regressor = LinearRegression().fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [27]:
# Predicted values for the test rows (continuous regression outputs).
y_pred

array([1.45562581, 1.06043181])

In [25]:
# Sanity check: (rows, columns) of the training features; rows must match y_train.
x_train.shape

(8, 5)

In [26]:
# Sanity check: number of training labels; must equal the row count of x_train.
y_train.shape

(8,)