#Data.csv

# Step 1: Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 2: Importing dataset

In [5]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Step 3: Handling the missing data

In [4]:
df.nunique()


Country      3
Age          9
Salary       9
Purchased    2
dtype: int64

In [6]:
df.Salary.unique()

array([72000., 48000., 54000., 61000.,    nan, 58000., 52000., 79000.,
       83000., 67000.])

In [7]:
df.Age.unique()

array([44., 27., 30., 38., 40., 35., nan, 48., 50., 37.])

In [8]:
df['Age'].replace(np.nan , df['Age'].mean(), inplace = True)

In [9]:
df['Salary'].fillna(df['Salary'].mean(),inplace = True)


In [10]:
df 

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Step 4: Encoding categorical data

In [11]:
df.Country.unique() 

array(['France', 'Spain', 'Germany'], dtype=object)

In [12]:
labelen = LabelEncoder() 
df['Country'] = labelen.fit_transform(df['Country'])

# Step 5: Creating a dummy variable

In [13]:
df = pd.get_dummies(df)
df

Unnamed: 0,Country,Age,Salary,Purchased_No,Purchased_Yes
0,0,44.0,72000.0,1,0
1,2,27.0,48000.0,0,1
2,1,30.0,54000.0,1,0
3,2,38.0,61000.0,1,0
4,1,40.0,63777.777778,0,1
5,0,35.0,58000.0,0,1
6,2,38.777778,52000.0,1,0
7,0,48.0,79000.0,0,1
8,1,50.0,83000.0,1,0
9,0,37.0,67000.0,0,1


# Step 6: Splitting the datasets into training sets and Test sets

In [14]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [15]:
Xtrain,Xtest, ytrain,ytest = train_test_split(X,y , test_size= 0.2, random_state=2)

# Step 7: Feature Scaling

In [16]:
ss = StandardScaler()
Xtrain = ss.fit_transform(Xtrain)
Xtest = ss.transform(Xtest)

In [17]:
Xtrain


array([[-0.90453403, -0.80667524, -0.72132045, -1.29099445],
       [-0.90453403,  0.61764507,  0.58171004,  0.77459667],
       [-0.90453403,  1.25067632,  1.23322529, -1.29099445],
       [ 0.30151134, -1.5979643 , -1.09361488,  0.77459667],
       [ 1.50755672, -0.3319018 , -0.44209963,  0.77459667],
       [ 1.50755672, -0.20881239, -1.27976209,  0.77459667],
       [-0.90453403, -0.49015961,  0.11634201, -1.29099445],
       [ 0.30151134,  1.56719195,  1.60551972,  0.77459667]])

In [18]:
Xtest

array([[ 0.30151134, -0.01538618, -0.18356184, -1.29099445],
       [ 1.50755672, -2.07273774, -1.65205652, -1.29099445]])