<img src="https://miro.medium.com/max/1200/1*XgcF3ayEH2Q8JEbZx8D09Q.png" width="800px" height="400px" />

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

import warnings
# Blanket filter: silences every warning category for the rest of the kernel session,
# which also hides deprecation notices raised by the libraries imported above.
# NOTE(review): consider narrowing this to specific categories once the noisy source is known.
warnings.filterwarnings("ignore")

%matplotlib inline

## Reading the Dataset

In [23]:
# Load the raw sample into a DataFrame; the path is relative to the notebook's working directory.
dataset = pd.read_csv('sample_data.csv')
# Preview the first rows to check that the columns parsed as expected.
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Nigeria,18.0,15000.0,No


In [24]:
# (rows, columns) of the raw frame, as a baseline for the cleaning steps that follow.
dataset.shape

(29, 4)

In [25]:
# Summary statistics for the numeric columns (describe() skips non-numeric ones by default).
# A `count` lower than the number of rows indicates missing values in that column.
dataset.describe()

Unnamed: 0,Age,Salary
count,27.0,28.0
mean,36.925926,53642.857143
std,8.757089,19216.532785
min,18.0,15000.0
25%,30.0,44750.0
50%,37.0,53000.0
75%,44.0,67000.0
max,50.0,83000.0


In [28]:
# Boolean mask over rows: True where a row exactly repeats an earlier one
# (with the default keep='first', the first occurrence itself stays False).
dataset.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17     True
18    False
19    False
20    False
21    False
22     True
23     True
24    False
25    False
26     True
27    False
28    False
dtype: bool

In [29]:
# Number of duplicated rows. Series.sum() counts the True values in vectorised code
# instead of iterating the mask element by element with the builtin sum();
# int() keeps the displayed result a plain Python integer.
int(dataset.duplicated().sum())

4

In [31]:
# Drop rows that exactly repeat an earlier row (the first occurrence is kept).
# The result is rebound instead of using inplace=True, which keeps the data flow
# explicit and chainable; the original index labels are retained.
dataset = dataset.drop_duplicates()

In [32]:
# Re-check the shape to confirm the duplicate rows are gone.
dataset.shape

(25, 4)

In [33]:
# Recompute the summary statistics on the de-duplicated data.
dataset.describe()

Unnamed: 0,Age,Salary
count,23.0,24.0
mean,36.782609,51541.666667
std,8.852101,19352.517344
min,18.0,15000.0
25%,30.0,43750.0
50%,37.0,51000.0
75%,44.0,62500.0
max,50.0,83000.0


In [35]:
# Element-wise null mask: True marks a missing value in that cell.
dataset.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,True,False
6,False,False,False,False
7,False,True,False,False
8,False,False,False,False
9,False,False,False,False


In [37]:
# Number of missing values per column (the boolean null mask summed down each column).
dataset.isnull().sum()

Country      1
Age          2
Salary       1
Purchased    1
dtype: int64

In [38]:
# Distinct values in Country; unique() reports NaN as its own entry when present.
dataset["Country"].unique()

array(['France', 'Spain', 'Germany', 'Nigeria', nan], dtype=object)

In [39]:
# Count of distinct countries. nunique() excludes NaN by default,
# so it can be one less than the number of entries unique() returns.
dataset["Country"].nunique()

4

In [41]:
# Drop every row whose Country or Purchased value is missing: these are categorical
# fields, so incomplete rows are removed rather than imputed. how='any' drops a row
# when either of the listed columns is null.
# The result is rebound instead of mutating the frame with inplace=True.
dataset = dataset.dropna(how='any', subset=['Country', 'Purchased'])

In [42]:
# Confirm that NaN no longer appears among the Country values.
dataset["Country"].unique()

array(['France', 'Spain', 'Germany', 'Nigeria'], dtype=object)

In [43]:
# Remaining missing values per column after dropping the incomplete categorical rows.
dataset.isnull().sum()

Country      0
Age          2
Salary       1
Purchased    0
dtype: int64

In [44]:
# One-hot encode Country: one indicator column per distinct country, replacing the
# original column. dtype=int pins the indicators to 0/1 integers; without it, newer
# pandas releases (2.0+) emit True/False booleans instead.
# NOTE(review): this cell consumes the Country column, so running it a second time
# without re-running the cells above raises a KeyError.
dataset = pd.get_dummies(dataset, columns=["Country"], dtype=int)
dataset

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Nigeria,Country_Spain
0,44.0,72000.0,No,1,0,0,0
1,27.0,48000.0,Yes,0,0,0,1
2,30.0,54000.0,No,0,1,0,0
3,38.0,61000.0,No,0,0,0,1
4,18.0,15000.0,No,0,0,1,0
5,40.0,,Yes,0,1,0,0
6,35.0,58000.0,Yes,1,0,0,0
7,,52000.0,No,0,0,0,1
8,48.0,79000.0,Yes,1,0,0,0
9,50.0,83000.0,No,0,1,0,0


In [46]:
# Target: the Purchased labels as a NumPy array.
y = dataset["Purchased"].to_numpy()
# Features: every remaining column of the encoded frame.
X = dataset.drop("Purchased", axis=1)

In [47]:
# Mean imputation for the numeric features: learn the column means of Age and Salary.
# NOTE(review): the means are learned from every row of X; if a train/test split is
# added later, fit on the training portion only.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(X[["Age", "Salary"]])
# fit() returns the estimator itself, so displaying it shows the fitted imputer.
imputer

SimpleImputer()

In [48]:
# Replace the missing Age/Salary entries with the column means learned by the imputer
# and write the filled values back into the same two columns.
X[["Age", "Salary"]] = imputer.transform(X[["Age", "Salary"]])

In [49]:
# Inspect the feature matrix after imputation.
X

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Nigeria,Country_Spain
0,44.0,72000.0,1,0,0,0
1,27.0,48000.0,0,0,0,1
2,30.0,54000.0,0,1,0,0
3,38.0,61000.0,0,0,0,1
4,18.0,15000.0,0,0,1,0
5,40.0,50636.363636,0,1,0,0
6,35.0,58000.0,1,0,0,0
7,35.952381,52000.0,0,0,0,1
8,48.0,79000.0,1,0,0,0
9,50.0,83000.0,0,1,0,0


In [50]:
# Verify that no missing values remain in any feature column.
X.isnull().sum()

Age                0
Salary             0
Country_France     0
Country_Germany    0
Country_Nigeria    0
Country_Spain      0
dtype: int64

In [51]:
# Raw target labels before encoding.
y

array(['No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
       'Yes', 'No', 'Yes', 'No'], dtype=object)

In [52]:
# Encode the target labels as integers. LabelEncoder assigns codes in sorted label
# order, so 'No' -> 0 and 'Yes' -> 1.
# The encoder is fitted on the source column rather than on `y` itself so that the
# cell is safe to re-run: with `y = le.fit_transform(y)` a second run would fit on the
# already-encoded integers and le.classes_ would lose the original label names.
le = LabelEncoder()
y = le.fit_transform(dataset["Purchased"])
print(y)

[0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0]


In [54]:
# Encoded target as a NumPy array (rich display of the values printed earlier).
y

array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0])