In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, r2_score, roc_auc_score, accuracy_score, auc

In [2]:
df = pd.read_csv('Social_Network_Ads copy.csv')

In [3]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [6]:
df = df.drop(['User ID'],1)

In [7]:
df.shape

(400, 4)

In [8]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [9]:
df['Gender'] = (df['Gender'].astype('category')).cat.codes

In [10]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [11]:
X = df.drop(['Purchased'],1)
y = df['Purchased']

In [12]:
scalar = MinMaxScaler()

In [13]:
X = pd.DataFrame(scalar.fit_transform(X),columns = list(X))

In [14]:
log_reg = LogisticRegression()

In [15]:
from sklearn.model_selection import train_test_split as tts

In [16]:
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.3, random_state = 42)

In [17]:
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)



In [18]:
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [19]:
accuracy_score(y_test, y_pred)

0.85

In [20]:
confusion_matrix(y_test, y_pred)

array([[72,  1],
       [17, 30]], dtype=int64)

In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89        73
           1       0.97      0.64      0.77        47

    accuracy                           0.85       120
   macro avg       0.89      0.81      0.83       120
weighted avg       0.87      0.85      0.84       120



In [22]:
log_reg.intercept_

array([-3.41879114])

In [23]:
log_reg.coef_

array([[-0.07513077,  3.91878291,  2.15175677]])

In [24]:
import statsmodels.api as sm

In [25]:
X = sm.add_constant(X)

  return ptp(axis=axis, out=out, **kwargs)


In [26]:
model = sm.Logit(y,X).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.344804
         Iterations 8


0,1,2,3
Dep. Variable:,Purchased,No. Observations:,400.0
Model:,Logit,Df Residuals:,396.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 08 Dec 2019",Pseudo R-squ.:,0.4711
Time:,23:00:03,Log-Likelihood:,-137.92
converged:,True,LL-Null:,-260.79
,,LLR p-value:,5.488e-53

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-7.9716,0.853,-9.340,0.000,-9.644,-6.299
Gender,0.3338,0.305,1.094,0.274,-0.264,0.932
Age,9.9527,1.108,8.984,0.000,7.781,12.124
EstimatedSalary,4.9196,0.739,6.659,0.000,3.471,6.368


In [27]:
roc_auc_score(y_test, y_pred)

0.8122996211017196

In [28]:
param = {
    'C': [0.001,0.01,0.1,1,10,100],
    'penalty': ['l1','l2']
}

In [29]:
param['C']

[0.001, 0.01, 0.1, 1, 10, 100]

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
model = GridSearchCV(cv = 10 , estimator=log_reg, param_grid= param)

In [32]:
model = model.fit(X,y)







In [33]:
model.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
model.best_score_

0.8275

In [35]:
model.best_params_

{'C': 1, 'penalty': 'l1'}