# Logistic Regression
Predict the Purchased value (Dependent Variable) using the Age and Salary (Independent Variable).

In [1]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Import Dataset
dataset = pd.read_csv(r"Data Sets/User_Data.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [5]:
dataset.isnull()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
395,False,False,False,False,False
396,False,False,False,False,False
397,False,False,False,False,False
398,False,False,False,False,False


In [6]:
dataset.isnull().count()

User ID            400
Gender             400
Age                400
EstimatedSalary    400
Purchased          400
dtype: int64

In [7]:
# Extracting Independent and Dependent Variables
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

In [8]:
print(x[:10])

[[    19  19000]
 [    35  20000]
 [    26  43000]
 [    27  57000]
 [    19  76000]
 [    27  58000]
 [    27  84000]
 [    32 150000]
 [    25  33000]
 [    35  65000]]


In [9]:
print(y[:10])

[0 0 0 0 0 0 0 1 0 0]


In [10]:
# Split dataset into train-test ratio
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

In [11]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [12]:
print(x_train[:10])

[[ 1.92295008  2.14601566]
 [ 2.02016082  0.3787193 ]
 [-1.3822153  -0.4324987 ]
 [-1.18779381 -1.01194013]
 [ 1.92295008 -0.92502392]
 [ 0.36757813  0.29180308]
 [ 0.17315664  0.14694273]
 [ 2.02016082  1.74040666]
 [ 0.75642112 -0.83810771]
 [ 0.27036739 -0.28763835]]


In [13]:
print(x_test[:10])

[[-0.79895082  0.49460758]
 [-0.02126485 -0.57735906]
 [-0.31289709  0.14694273]
 [-0.79895082  0.26283101]
 [-0.31289709 -0.57735906]
 [-1.09058306 -1.44652121]
 [-0.70174008 -1.59138156]
 [-0.21568634  2.14601566]
 [-1.96547978 -0.05586178]
 [ 0.85363187 -0.78016356]]


In [14]:
# Fitting Logical Regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

In [15]:
# Predicting the test set result
y_pred = classifier.predict(x_test)

In [16]:
print(y_pred[:10])

[0 0 0 0 0 0 0 1 0 0]


In [17]:
# Testing Data Evaluation
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,roc_curve

In [18]:
# Accuracy Score
acc_score = accuracy_score(y_test, y_pred)
print("Accuracy Score: ",acc_score)

Accuracy Score:  0.925


In [19]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[57  1]
 [ 5 17]]


In [20]:
clf_report = classification_report(y_test, y_pred)
print("Classification Report:\n",clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95        58
           1       0.94      0.77      0.85        22

    accuracy                           0.93        80
   macro avg       0.93      0.88      0.90        80
weighted avg       0.93      0.93      0.92        80



In [21]:
# Training Data Evaluation
acc_Score = accuracy_score(y_train, classifier.predict(x_train))
print("Accuracy Score: ",acc_Score)

Accuracy Score:  0.821875


In [22]:
# Creating the Confusion Matrix
cm = confusion_matrix(y_train, classifier.predict(x_train))
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[181  18]
 [ 39  82]]


In [23]:
# Classification Report
clf_report = classification_report(y_train, classifier.predict(x_train))
print("Classification Report:\n",clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       199
           1       0.82      0.68      0.74       121

    accuracy                           0.82       320
   macro avg       0.82      0.79      0.80       320
weighted avg       0.82      0.82      0.82       320

