## Importing the libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

## Loading the dataset

In [2]:
dataset = pd.read_csv('Social_Network_Ads.csv')

In [3]:
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


## Define X by selecting only the age and EstimatedSalary, and y with purchased column

In [6]:
X = dataset[['Age','EstimatedSalary']]
y = dataset['Purchased']

In [7]:
X.head()

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000


In [8]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

## Print count of each label in Purchased column

In [9]:
dataset['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

## Print Correlation of each feature in the dataset

In [10]:
dataset.corr()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,1.0,-0.000721,0.071097,0.00712
Age,-0.000721,1.0,0.155238,0.622454
EstimatedSalary,0.071097,0.155238,1.0,0.362083
Purchased,0.00712,0.622454,0.362083,1.0


# First: Logistic Regression model

## Split the dataset into Training set and Test set with test_size = 0.25 and random_state = 0

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0)

In [12]:
y_train

250    0
63     1
312    0
159    1
283    1
      ..
323    1
192    0
117    0
47     0
172    0
Name: Purchased, Length: 300, dtype: int64

## Train the model with random_state = 0

In [14]:
from sklearn import linear_model

In [15]:
logr = linear_model.LogisticRegression(random_state=0)
# training the model
logr.fit(X_train,y_train)

LogisticRegression(random_state=0)

## Print the prediction results

In [16]:
# testing the model 
logr = logr.predict(X_test)
logr

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

## Create dataframe with the Actual Purchased and Predict Purchased

In [17]:
Ac_pre_purchased=pd.DataFrame(y_test)
Ac_pre_purchased['Predict_Purchased'] = logr
Ac_pre_purchased

Unnamed: 0,Purchased,Predict_Purchased
132,0,0
309,0,0
341,0,0
196,0,0
246,0,0
...,...,...
146,1,0
135,0,0
390,1,0
264,1,0


In [18]:
(test_dataset['Actual_Purchased'] == test_dataset['Predict_Purchased']).value_counts()

NameError: name 'test_dataset' is not defined

## Print Confusion Matrix and classification_report

In [19]:
print(confusion_matrix(y_test,logr))

[[68  0]
 [32  0]]


In [20]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
print(classification_report(y_test,logr))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        68
           1       0.00      0.00      0.00        32

    accuracy                           0.68       100
   macro avg       0.34      0.50      0.40       100
weighted avg       0.46      0.68      0.55       100



In [21]:
plot_confusion_matrix(classifier, X=X_test, y_true=y_test, cmap='Blues');

NameError: name 'classifier' is not defined

## Use StandardScaler() to improved performance and re-train your model

In [31]:
from sklearn.preprocessing import StandardScaler

In [37]:
scaler = StandardScaler()
scaler.fit_transform(X_train, y_train)

array([[ 0.58164944, -0.88670699],
       [-0.60673761,  1.46173768],
       [-0.01254409, -0.5677824 ],
       [-0.60673761,  1.89663484],
       [ 1.37390747, -1.40858358],
       [ 1.47293972,  0.99784738],
       [ 0.08648817, -0.79972756],
       [-0.01254409, -0.24885782],
       [-0.21060859, -0.5677824 ],
       [-0.21060859, -0.19087153],
       [-0.30964085, -1.29261101],
       [-0.30964085, -0.5677824 ],
       [ 0.38358493,  0.09905991],
       [ 0.8787462 , -0.59677555],
       [ 2.06713324, -1.17663843],
       [ 1.07681071, -0.13288524],
       [ 0.68068169,  1.78066227],
       [-0.70576986,  0.56295021],
       [ 0.77971394,  0.35999821],
       [ 0.8787462 , -0.53878926],
       [-1.20093113, -1.58254245],
       [ 2.1661655 ,  0.93986109],
       [-0.01254409,  1.22979253],
       [ 0.18552042,  1.08482681],
       [ 0.38358493, -0.48080297],
       [-0.30964085, -0.30684411],
       [ 0.97777845, -0.8287207 ],
       [ 0.97777845,  1.8676417 ],
       [-0.01254409,

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_test,logr))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        68
           1       0.00      0.00      0.00        32

    accuracy                           0.68       100
   macro avg       0.34      0.50      0.40       100
weighted avg       0.46      0.68      0.55       100



## Try to Predicting a new result - e.g: person with Age = 30 and Salary = 90,000

In [27]:
pred1 = scaler.transform([[30, 90000]])
pred1

array([[-0.80480212,  0.59194336]])

## Try to Predicting a new result - e.g: person with Age = 40 and Salary = 90,000

In [28]:
pred1 = scaler.transform([[40, 90000]])
pred1

array([[0.18552042, 0.59194336]])

# Second: k-nearest neighbors model

In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
comp_result = pd.DataFrame(y_test)
comp_result['pred']= pred
comp_result
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.91      0.85      0.88        68
           1       0.72      0.81      0.76        32

    accuracy                           0.84       100
   macro avg       0.81      0.83      0.82       100
weighted avg       0.85      0.84      0.84       100

