In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


### Preparing Dataset for Logistic Regression

In [4]:
def gender(sex):
    if sex == 'Male':
        return 0
    else:
        return 1

In [5]:
df['Gender'] = df['Gender'].apply(gender)

In [6]:
def age_years(age):
    if age < 18:
        return 0
    elif age < 60:
        return 1
    else:
        return 2

In [7]:
df['Age'] = df['Age'].apply(age_years)

In [8]:
def salary(amount):
    if amount < 10000:
        return 0
    elif amount < 20000:
        return 1
    elif amount < 30000:
        return 3
    elif amount < 40000:
        return 4
    elif amount < 50000:
        return 5
    else:
        return 6

In [9]:
df['EstimatedSalary'] = df['EstimatedSalary'].apply(salary)

In [10]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,0,1,1,0
1,15810944,0,1,3,0
2,15668575,1,1,5,0
3,15603246,1,1,6,0
4,15804002,0,1,6,0
...,...,...,...,...,...
395,15691863,1,1,5,1
396,15706071,0,1,3,1
397,15654296,1,1,3,1
398,15755018,0,1,4,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   User ID          400 non-null    int64
 1   Gender           400 non-null    int64
 2   Age              400 non-null    int64
 3   EstimatedSalary  400 non-null    int64
 4   Purchased        400 non-null    int64
dtypes: int64(5)
memory usage: 15.8 KB


### Now lets split the data for training

In [12]:
y = df['Purchased'] # dependent 
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [13]:
x = df.drop(['User ID', 'Purchased'], axis=1) # independent
x

Unnamed: 0,Gender,Age,EstimatedSalary
0,0,1,1
1,0,1,3
2,1,1,5
3,1,1,6
4,0,1,6
...,...,...,...
395,1,1,5
396,0,1,3
397,1,1,3
398,0,1,4


In [14]:
x_train, x_test , y_train , y_test = test = train_test_split(x,y, test_size= 0.2)

In [15]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(320, 3)
(80, 3)
(320,)
(80,)


## Logistic Regression

In [16]:
model = LogisticRegression()

In [17]:
model.fit(x,y)

LogisticRegression()

In [18]:
model.score(x_train, y_train)

0.65

In [19]:
model.score(x_test, y_test)

0.7

In [20]:
y_predict = model.predict(x_test)

In [21]:
confusion_matrix(y_test, y_predict)

array([[56,  0],
       [24,  0]], dtype=int64)

### Preparing Dataset for Other Predictions

In [22]:
df2 = pd.read_csv('Social_Network_Ads.csv')
df2

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [23]:
df2['Gender'] = df2['Gender'].apply(gender)

In [24]:
df2

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,0,19,19000,0
1,15810944,0,35,20000,0
2,15668575,1,26,43000,0
3,15603246,1,27,57000,0
4,15804002,0,19,76000,0
...,...,...,...,...,...
395,15691863,1,46,41000,1
396,15706071,0,51,23000,1
397,15654296,1,50,20000,1
398,15755018,0,36,33000,0


In [25]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   User ID          400 non-null    int64
 1   Gender           400 non-null    int64
 2   Age              400 non-null    int64
 3   EstimatedSalary  400 non-null    int64
 4   Purchased        400 non-null    int64
dtypes: int64(5)
memory usage: 15.8 KB


In [26]:
df2.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

### Now lets split the data for training

In [27]:
y = df2['Purchased'] # dependent 
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [28]:
x = df2.drop(['User ID', 'Purchased'], axis=1) # independent
x

Unnamed: 0,Gender,Age,EstimatedSalary
0,0,19,19000
1,0,35,20000
2,1,26,43000
3,1,27,57000
4,0,19,76000
...,...,...,...
395,1,46,41000
396,0,51,23000
397,1,50,20000
398,0,36,33000


In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [30]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(320, 3)
(80, 3)
(320,)
(80,)


## Decision Tree Classifier

In [31]:
model2 = DecisionTreeClassifier()

In [32]:
model2.fit(x,y)

DecisionTreeClassifier()

In [33]:
model2.score(x_train , y_train)

0.996875

In [34]:
model2.score(x_test, y_test)

1.0

In [35]:
y_predict = model2.predict(x_test)

In [36]:
confusion_matrix(y_test, y_predict)

array([[53,  0],
       [ 0, 27]], dtype=int64)

## Random Forest Classifier

In [37]:
model3 = RandomForestClassifier()

In [38]:
model3.fit(x,y)

RandomForestClassifier()

In [39]:
y_predict = model3.predict(x_test)

In [40]:
model3.score(x_test, y_test)

1.0

In [41]:
accuracy_score(y_test, y_predict)

1.0

In [42]:
confusion_matrix(y_test, y_predict)

array([[53,  0],
       [ 0, 27]], dtype=int64)

## KNeighbours Classifier

In [43]:
model4 = KNeighborsClassifier(n_neighbors= 5)

In [44]:
model4.fit(x,y)

KNeighborsClassifier()

In [45]:
model4.score(x_train, y_train)

0.865625

In [46]:
model4.score(x_test, y_test)

0.9

In [47]:
y_predict = model4.predict(x_test)

In [48]:
confusion_matrix(y_test, y_predict)

array([[49,  4],
       [ 4, 23]], dtype=int64)

## Observation

### Prediction Accuracy

 - Logistic Regression - 60 to 70 % (fluctuates with each run)

 - DecisionTree Classifier - 98 to 100 % (fluctuates with each run)

 - RandomForest Classifier - 100% (no fluctuations)
 
 - KNeighbours Classifier - 85 to 95 % (fluctuates with each run)

From this observation Random forest has the highest accuracy and Logistic Regression has the least accuracy.