In [None]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Read CSV  data file into DataFrame
data_df = pd.read_csv("https://raw.githubusercontent.com/abdullabasim/dataset/main/Social_Network_Ads.csv")

In [None]:
data_df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [None]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [None]:
data_df.drop("User ID",axis=1,inplace=True)

In [None]:
data_df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [None]:
data_info=pd.get_dummies(data_df,columns=["Gender"],drop_first=True)

data_info

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# create X (features) and y (response)
X = data_info[["Age","EstimatedSalary","Gender_Male"]]
y = data_info['Purchased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=2)



In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(y_train)
print(X_test)

32     0
225    0
157    0
356    1
25     1
      ..
299    1
22     1
72     0
15     0
168    1
Name: Purchased, Length: 399, dtype: int64
[[-0.82841113  0.38986618 -0.98260737]]


In [None]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"],"solver":['newton-cg', 'lbfgs', 'liblinear']}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy : 0.8571794871794871


In [None]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C= 10.0, penalty= 'l2',random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
logreg = LogisticRegression(C= 10.0, penalty= 'l2',random_state = 0)
_scoring = ['accuracy', 'precision', 'recall', 'f1']
results = cross_validate(estimator=logreg,
                               X=X_train,
                               y=y_train,
                               cv=5,
                               scoring=_scoring,
                               return_train_score=True)
      
print("Training Accuracy scores : " ,results['train_accuracy'])
print("Mean Training Accuracy : " ,results['train_accuracy'].mean()*100)  
print("Training Precision scores : " ,results['train_precision'])
print("Mean Training Precision :" ,results['train_precision'].mean())
print("Training Recall scores :" ,results['train_recall'])
print("Mean Training Recall :" ,results['train_recall'].mean())
print("Training F1 scores :" ,results['train_f1'])
print("Mean Training F1 Score :" ,results['train_f1'].mean())
print("Validation Accuracy scores :" ,results['test_accuracy'])
print("Mean Validation Accuracy:" ,results['test_accuracy'].mean()*100)
print("Validation Precision scores : " ,results['test_precision'])
print("Validation Recall scores :" ,results['test_recall'])
print("Mean Validation Recall :" ,results['test_recall'].mean())
print("Validation F1 scores :" ,results['test_f1'])
print("Mean Validation F1 Score :" ,results['test_f1'].mean())
  

Training Accuracy scores :  [0.87147335 0.84012539 0.85266458 0.84952978 0.8625    ]
Mean Training Accuracy :  85.52586206896551
Training Precision scores :  [0.8627451  0.81818182 0.84536082 0.83       0.85858586]
Mean Training Precision : 0.8429747199098323
Training Recall scores : [0.76521739 0.71052632 0.71929825 0.72807018 0.73913043]
Mean Training Recall : 0.7324485125858123
Training F1 scores : [0.81105991 0.76056338 0.77725118 0.77570093 0.79439252]
Mean Training F1 Score : 0.7837935861787679
Validation Accuracy scores : [0.8        0.9125     0.875      0.8625     0.82278481]
Mean Validation Accuracy: 85.45569620253166
Validation Precision scores :  [0.8        0.92307692 0.85185185 0.875      0.76923077]
Validation Recall scores : [0.57142857 0.82758621 0.79310345 0.72413793 0.71428571]
Mean Validation Recall : 0.7261083743842365
Validation F1 scores : [0.66666667 0.87272727 0.82142857 0.79245283 0.74074074]
Mean Validation F1 Score : 0.7788032163503862
