## Step 1: Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Step 2: Import the data

In [2]:
df=pd.read_csv('Data.csv')
print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


## Step 3: Check with missing data

In [3]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

## Step 4: Divide the data set into and output i.e feature matrix 

In [4]:
X=df.iloc[:,:-1].values
Y=df.iloc[:,-1].values

In [5]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Step 5: Replace the missing data

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
imputer=SimpleImputer(missing_values=np.nan, strategy='mean')
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [9]:
imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

In [10]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Step 6: converting categorical data to numeric values

## step a: Output Column Vector

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
Y=np.array(le.fit_transform(Y))

In [14]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


## step b: Feature Matrix

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder='passthrough')
X=np.array(ct.fit_transform(X))

In [18]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [20]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


## step c: Building a classifier model using Logistic Regression

In [21]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2)
print(Xtrain)

[[1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 0.0 1.0 38.0 61000.0]]


In [22]:
# Step 1: Training the classification model
from sklearn.linear_model import LogisticRegression
LoR=LogisticRegression()
LoR.fit(Xtrain,Ytrain)