In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')

In [3]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
smote = SMOTE(random_state=42)

In [8]:
X = df.drop(['Purchased'],1)
y = df['Purchased']

In [9]:
X.drop(['User ID'],1, inplace=True)

In [10]:
X.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000


In [11]:
# Label Encoding on Gender and then Standard Scalar

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [13]:
scalar = StandardScaler()
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])
X['Gender'] = X['Gender'].astype(str)

In [14]:
num = X.select_dtypes(include=np.number)
not_num = X.select_dtypes(exclude=np.number)

In [15]:
Scaled_num_cols = pd.DataFrame(scalar.fit_transform(num), columns=list(num))

In [16]:
X = pd.concat([Scaled_num_cols,not_num],1)

In [17]:
X['Gender'] = X['Gender'].astype(int)

In [18]:
X.dtypes

Age                float64
EstimatedSalary    float64
Gender               int32
dtype: object

In [19]:
smote

SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
      out_step='deprecated', random_state=42, ratio=None,
      sampling_strategy='auto', svm_estimator='deprecated')

In [20]:
y.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [21]:
X_res, y_res = smote.fit_sample(X,y.ravel())

In [22]:
X_res.shape

(514, 3)

In [23]:
y_res.shape

(514,)

In [24]:
# Logistic Regression on Original dataset and Scaled dataset

In [25]:
print(X.shape)
print(y.shape)

(400, 3)
(400,)


In [26]:
y.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

### Logistic Regression on Original X and y

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix, classification_report, r2_score, roc_auc_score, accuracy_score, auc
log_reg = LogisticRegression()

In [28]:
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.3, random_state = 42)

In [29]:
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)



In [30]:
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [31]:
accuracy_score(y_test, y_pred)

0.8583333333333333

In [32]:
confusion_matrix(y_test, y_pred)

array([[71,  2],
       [15, 32]], dtype=int64)

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89        73
           1       0.94      0.68      0.79        47

    accuracy                           0.86       120
   macro avg       0.88      0.83      0.84       120
weighted avg       0.87      0.86      0.85       120



### Logistic Regression on Scaled X and y

In [34]:
print(X_res.shape)
print(y_res.shape)

(514, 3)
(514,)


In [35]:
y_res

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

In [36]:
X_train_res, X_test_res, y_train_res, y_test_res = tts(X_res, y_res, test_size = 0.3, random_state = 42)

In [37]:
log_reg.fit(X_train_res, y_train_res)
y_pred_res = log_reg.predict(X_test_res)



In [38]:
accuracy_score(y_test_res, y_pred_res)

0.832258064516129

In [39]:
confusion_matrix(y_test, y_pred)

array([[71,  2],
       [15, 32]], dtype=int64)

In [40]:
print(classification_report(y_test_res, y_pred_res))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83        75
           1       0.86      0.80      0.83        80

    accuracy                           0.83       155
   macro avg       0.83      0.83      0.83       155
weighted avg       0.83      0.83      0.83       155



### Logistic Regression using GridSearch CV

In [41]:
import statsmodels.api as sm

In [42]:
X_cv = sm.add_constant(X)

  return ptp(axis=axis, out=out, **kwargs)


In [43]:
model = sm.Logit(y,X).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.390262
         Iterations 7


0,1,2,3
Dep. Variable:,Purchased,No. Observations:,400.0
Model:,Logit,Df Residuals:,397.0
Method:,MLE,Df Model:,2.0
Date:,"Sun, 08 Dec 2019",Pseudo R-squ.:,0.4014
Time:,23:07:56,Log-Likelihood:,-156.1
converged:,True,LL-Null:,-260.79
,,LLR p-value:,3.446e-46

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Age,2.1095,0.231,9.138,0.000,1.657,2.562
EstimatedSalary,1.0864,0.176,6.182,0.000,0.742,1.431
Gender,-0.9111,0.205,-4.443,0.000,-1.313,-0.509


In [44]:
param = {
    'C': [0.001,0.01,0.1,1,10,100],
    'penalty': ['l1','l2']
}

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
model = GridSearchCV(cv = 10 , estimator=log_reg, param_grid= param)

In [47]:
model = model.fit(X,y)







In [48]:
model.best_estimator_

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
model.best_score_

0.835

In [50]:
model.best_params_

{'C': 0.1, 'penalty': 'l1'}