# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Data.csv")

In [3]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
df.shape

(10, 4)

In [5]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
df.tail()

Unnamed: 0,Country,Age,Salary,Purchased
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


# Finding null values in data

In [9]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [10]:
df['Age'].isna().sum()

1

# Handle Null Values

In [11]:
df.Age = df.Age.fillna(df.Age.median())

In [12]:
df['Age'].isna().sum()

0

In [13]:
df.Salary = df.Salary.fillna(df.Salary.median())

In [14]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
df.isnull().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

In [16]:
df.loc[df.Age>40]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No


In [17]:
df.loc[df.Salary>60000]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Rows & Columns

In [18]:
df.iloc[:,:]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
x = df.iloc[:,-1]

In [20]:
x

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [21]:
type(x)

pandas.core.series.Series

In [22]:
x = df.iloc[:,-1:]

In [23]:
x

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


In [24]:
type(x)

pandas.core.frame.DataFrame

# Find different Categories with count

In [25]:
df.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [26]:
df.Country.value_counts()

France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [27]:
df.Purchased.value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

# Label Encoding

In [28]:
from sklearn.preprocessing import LabelEncoder
from collections import Counter as count
le=LabelEncoder()
print('Before LE', count(df['Country']))
df['Country']=le.fit_transform(df['Country'])
print('After LE', count(df['Country']))

Before LE Counter({'France': 4, 'Spain': 3, 'Germany': 3})
After LE Counter({0: 4, 2: 3, 1: 3})


In [29]:
print('Before LE', count(df['Purchased']))
df['Purchased']=le.fit_transform(df['Purchased'])
print('After LE', count(df['Purchased']))

Before LE Counter({'No': 5, 'Yes': 5})
After LE Counter({0: 5, 1: 5})


In [30]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,61000.0,1
5,0,35.0,58000.0,1
6,2,38.0,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [31]:
df.shape

(10, 4)

# One Hot Encoding

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('on', OneHotEncoder(),[0])],remainder='passthrough')

In [33]:
df = ct.fit_transform(df)

In [34]:
df

array([[1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04, 0.0e+00],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04, 1.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04, 0.0e+00],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04, 0.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+01, 6.1e+04, 1.0e+00],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04, 1.0e+00],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 5.2e+04, 0.0e+00],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.8e+01, 7.9e+04, 1.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04, 0.0e+00],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04, 1.0e+00]])

In [35]:
df.shape

(10, 6)

# Independent & Dependent

In [40]:
#independent feature
x = pd.DataFrame(df[:,0:5],columns=['France','Germany','Spain','Age','Salary'])

In [41]:
x

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,61000.0
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.0,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [42]:
#Dependent variable
y = pd.DataFrame(df[:,5:],columns=['Purchased'])

In [43]:
y

Unnamed: 0,Purchased
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0
5,1.0
6,0.0
7,1.0
8,0.0
9,1.0


# Train, Test & Split

In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [45]:
df.shape

(10, 6)

In [46]:
x_train.shape

(8, 5)

In [47]:
x_test.shape

(2, 5)

In [48]:
y_train.shape

(8, 1)

In [49]:
y_test.shape

(2, 1)

# Feature Scaling

In [55]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
fs = ss.fit_transform(df)

In [56]:
fs

array([[ 1.22474487, -0.65465367, -0.65465367,  0.76973439,  0.77256767,
        -1.        ],
       [-0.81649658, -0.65465367,  1.52752523, -1.69922498, -1.40879986,
         1.        ],
       [-0.81649658,  1.52752523, -0.65465367, -1.26352627, -0.86345798,
        -1.        ],
       [-0.81649658, -0.65465367,  1.52752523, -0.10166303, -0.22722578,
        -1.        ],
       [-0.81649658,  1.52752523, -0.65465367,  0.18880278, -0.22722578,
         1.        ],
       [ 1.22474487, -0.65465367, -0.65465367, -0.53736175, -0.49989673,
         1.        ],
       [-0.81649658, -0.65465367,  1.52752523, -0.10166303, -1.04523861,
        -1.        ],
       [ 1.22474487, -0.65465367, -0.65465367,  1.35066601,  1.40879986,
         1.        ],
       [-0.81649658,  1.52752523, -0.65465367,  1.64113182,  1.77236112,
        -1.        ],
       [ 1.22474487, -0.65465367, -0.65465367, -0.24689594,  0.3181161 ,
         1.        ]])