In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw data from the CSV file (relative path, resolved against the working directory).
csv_path = "preprocessdata.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Display the DataFrame as loaded, to inspect it before any cleaning.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Report, for each column, whether it contains at least one missing value.
dataset.isna().any()

Country       True
Age           True
Salary        True
Purchased    False
dtype: bool

In [5]:
# Fill the missing values in the Age column with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on dataset['Age']: the in-place form operates on an
# intermediate object (chained assignment), raises a FutureWarning on
# pandas 2.x and leaves `dataset` unchanged once Copy-on-Write is enabled.
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
# Fill the missing values in the Salary column with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on dataset['Salary']: the in-place form operates on
# an intermediate object (chained assignment), raises a FutureWarning on
# pandas 2.x and leaves `dataset` unchanged once Copy-on-Write is enabled.
dataset['Salary'] = dataset['Salary'].fillna(dataset['Salary'].median())
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
# Compute the mode of the Country column, ignoring missing values.
# mode() always returns a Series; it holds several entries when values tie.
dataset['Country'].mode(dropna=True)

0     France
1    Germany
2      Spain
dtype: object

In [8]:
# Fill the missing values in the categorical Country column with its mode.
# mode() returns the modes in sorted order, so [0] picks the first of any tied values.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on dataset['Country']: the in-place form operates on
# an intermediate object (chained assignment), raises a FutureWarning on
# pandas 2.x and leaves `dataset` unchanged once Copy-on-Write is enabled.
dataset['Country'] = dataset['Country'].fillna(dataset['Country'].mode()[0])
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [9]:
# Confirm that no column still contains missing values after the fills above.
dataset.isna().any()

Country      False
Age          False
Salary       False
Purchased    False
dtype: bool

In [10]:
# Convert the textual Country column to integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels - confirm that using it on a feature column is intended here.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(dataset['Country'])
# Replace the Country strings with the codes learned by the encoder.
dataset['Country'] = le.transform(dataset['Country'])

In [11]:
# Display the DataFrame after the Country column has been label-encoded.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,61000.0,Yes
5,0,35.0,58000.0,Yes
6,2,38.777778,52000.0,No
7,0,48.0,79000.0,Yes
8,1,50.0,83000.0,No
9,0,37.0,67000.0,Yes


In [18]:
# Separate the dataset into inputs (first three columns) and output (fourth column),
# both as NumPy arrays.
x = dataset.iloc[:, :3].to_numpy()
y = dataset.iloc[:, 3].to_numpy()

In [19]:
# One-hot encode the first column of x (the categorical one) and place the
# resulting indicator columns in front of the remaining columns.
# NOTE(review): x is rebuilt from its own first column, so running this cell a
# second time would one-hot encode an indicator column again.
from sklearn.preprocessing import OneHotEncoder
one = OneHotEncoder()
# Keep the column two-dimensional for the encoder, then densify its sparse result.
z = one.fit_transform(x[:, [0]]).toarray()
# Drop the original first column and prepend the indicator columns.
x = np.hstack((z, x[:, 1:]))

In [20]:
# Display the feature matrix after one-hot encoding its first column.
x

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [29]:
# Split the data into training and test sets: 20% is held out for testing and
# random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [30]:
# Feature scaling: standardize each column. This rescales values; it does not
# remove outliers. The scaler is fitted on the training set only and the same
# fitted scaler is then applied to both the training and the test set.
# NOTE(review): every column is scaled, including the one-hot indicator columns.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [31]:
# Display the training features after scaling.
x_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757, -0.13108063],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.49810638],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.49431914],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.07486114],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.75648039],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.13108063],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  1.02242888],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.44567413]])

In [32]:
# Display the test features after applying the scaler fitted on the training set.
x_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.86513213],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.1759384 ]])