## Importing libraries

In [45]:
import pandas as pd
import numpy as np


## Reading the data

In [46]:
# Load the car-sale records from the project-relative CSV file and preview
# the first ten rows.
df = pd.read_csv(filepath_or_buffer='data/country_car_sale.csv')
df.head(n=10)


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Adding the missing values

There are msising values in both Age and Salary column

In [47]:
# Count the NaN entries in each column (axis=0 sums down the rows, giving
# one total per column).
missing_values = df.isna().sum(axis=0)
missing_values


Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [48]:
# Preview the two numeric columns by position (1 and 2: Age and Salary in
# this dataset) — the ones that contain missing values.
df.iloc[:, [1, 2]]


Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [49]:
from sklearn.impute import SimpleImputer

# Mean imputation: every NaN is replaced by the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means and fill the gaps in one step, then round every
# value in the two columns to the nearest whole number before writing the
# result back into positions 1 and 2 (Age and Salary in this dataset).
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3]).round()
df.head(10)


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Encoding categorical data

We have city which is string data, so using OneHotEncoder to transform the data

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [55]:
# One-hot encode the categorical Country column. Every other column
# (Age, Salary, Purchased) is passed through unchanged and appended after
# the encoded indicator columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Country'])],
    remainder='passthrough',
    # Force a dense result regardless of how many categories the encoder
    # produces; the pd.DataFrame(...) wrapper below relies on a dense array.
    sparse_threshold=0,
)
df_transformed = pd.DataFrame(ct.fit_transform(df))

# The last column is the passed-through Purchased target; everything before
# it is a feature. Because the string target travelled through the
# transformer together with the numeric columns, the stacked array is object
# dtype, so cast the feature matrix back to float. The cast also makes X an
# independent copy rather than a slice of df_transformed.
X = df_transformed.iloc[:, :-1].astype(float)
X.head()


Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63778.0


## Encoding dependent variable

In [56]:
from sklearn.preprocessing import LabelEncoder


In [64]:
# Encode the target as integers. The target is the last column of
# df_transformed (the passed-through Purchased labels); in this dataset
# 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
y = le.fit_transform(df_transformed[df_transformed.columns[-1]])
y


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the data

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


## Feature scaling

In [68]:
from sklearn.preprocessing import StandardScaler


In [73]:
# Standardise only the numeric columns (positions 3 onward: Age and Salary).
# The one-hot Country indicators in positions 0-2 are left as 0/1.
sc = StandardScaler()

# Work on explicit copies so the in-place assignments below modify these two
# frames only. NOTE(review): presumably this also avoids pandas
# chained-assignment warnings on the split outputs; harmless otherwise.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the mean and standard deviation from the training split only ...
X_train.iloc[:, 3:] = sc.fit_transform(X_train.iloc[:, 3:])
# ... and apply those same training statistics to the test split. Re-fitting
# on the test split (fit_transform) would scale it with its own mean/std,
# putting train and test features on different scales.
X_test.iloc[:, 3:] = sc.transform(X_test.iloc[:, 3:])
