# CMPE428 Assignment 3
Building Logistic Regression Classifiers


Imports

In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 
import step

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

Import CSV

In [2]:
df = pd.read_csv('stdData.csv')

In [3]:
df.head()

Unnamed: 0,Label,V1,V2,V3,V4,V5,V6,V7,V8
0,positive,1.2214,128.1012,80.035036,35.431417,180.956968,42.944951,1.320305,-0.779046
1,negative,2.609743,85.891549,58.543681,14.454311,52.545356,33.42607,-0.786571,-1.656509
2,negative,2.682163,99.782456,68.000884,26.339627,71.578043,37.542894,0.534953,-3.39499
3,negative,3.196969,115.189168,65.307845,-0.539337,0.269863,20.857287,0.562433,-0.534598
4,positive,4.790932,144.487763,80.80022,18.937774,-0.03357,31.346055,0.789162,-4.069492


## Task 1

### Split Dataset with Equal Positives and Negatives

We can see the amount of negatives and positives with this command

In [4]:
df.Label.value_counts()

negative    195
positive    105
Name: Label, dtype: int64

#### Replacing Categorical with Binary

In [5]:
df["Label"] = df["Label"].replace({"positive":1,"negative":0})

In [6]:
df.head()

Unnamed: 0,Label,V1,V2,V3,V4,V5,V6,V7,V8
0,1,1.2214,128.1012,80.035036,35.431417,180.956968,42.944951,1.320305,-0.779046
1,0,2.609743,85.891549,58.543681,14.454311,52.545356,33.42607,-0.786571,-1.656509
2,0,2.682163,99.782456,68.000884,26.339627,71.578043,37.542894,0.534953,-3.39499
3,0,3.196969,115.189168,65.307845,-0.539337,0.269863,20.857287,0.562433,-0.534598
4,1,4.790932,144.487763,80.80022,18.937774,-0.03357,31.346055,0.789162,-4.069492


#### Splitting Dataset Into 2, with Equal Amounts of Negative and Positive

We use stratify on y and split our dataframe into 2, so that we get equal distribution of negatives and positives.

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.5, stratify = y)

In [37]:
y_train.value_counts()

0    97
1    53
Name: Label, dtype: int64

In [38]:
y_test.value_counts()

0    98
1    52
Name: Label, dtype: int64

We can see that numbers of 0 and 1 is equal across test and train.

### Logistic Regression

We will use sklearn's logistic in order to get our accuracy, f1, recall and precision scores.

In [69]:
regression = LogisticRegression(solver = "liblinear")

model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Computing Scores

Scores are calculated and inserted into the dataframe to be easily compared with the other model's scores later on.

In [99]:
scores = {}
scores['Accuracy'] = accuracy_score(y_test, y_pred)
scores['F1'] = f1_score(y_test, y_pred)
scores['Recall'] = recall_score(y_test, y_pred)
scores['Precision'] = precision_score(y_test, y_pred)

scores = pd.DataFrame.from_dict(scores, orient='index', columns=['First DF'])
scores = scores.transpose()
scores

Unnamed: 0,Accuracy,F1,Recall,Precision
First DF,0.573333,0.288889,0.276596,0.302326


## Task 2

#### Identify Weak Variables by P-Value

Since sklearn doesn't have a way to check P-Value, I will use statsmodels to check P Values.

It can be seen that highest P values are V4,V5,V6 and V7 therefore it is the weakest data.

In [73]:
model = sm.Logit(y_train, X_train).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.544965
         Iterations 6


0,1,2,3
Dep. Variable:,Label,No. Observations:,150.0
Model:,Logit,Df Residuals:,142.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 15 Dec 2020",Pseudo R-squ.:,0.1609
Time:,21:12:05,Log-Likelihood:,-81.745
converged:,True,LL-Null:,-97.423
Covariance Type:,nonrobust,LLR p-value:,5.344e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
V1,0.1589,0.067,2.378,0.017,0.028,0.290
V2,0.0244,0.007,3.341,0.001,0.010,0.039
V3,-0.0653,0.015,-4.367,0.000,-0.095,-0.036
V4,-0.0164,0.016,-1.021,0.307,-0.048,0.015
V5,-0.0026,0.002,-1.080,0.280,-0.007,0.002
V6,0.0284,0.028,1.008,0.313,-0.027,0.083
V7,0.0525,0.193,0.272,0.785,-0.325,0.430
V8,0.0452,0.059,0.765,0.444,-0.071,0.161


#### Using Backward Elimination


In [74]:
step.forwardSelection(X_train, y_train)

Character Variables (Dummies Generated, First Dummies Dropped): []
Optimization terminated successfully.
         Current function value: 0.649486
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.615793
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.517354
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.649468
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.648133
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.649366
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.622093
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.649381
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.648314
         Iterations 4
Optimization te

(['intercept', 'V2', 'V5', 'V1'],

After the backward elimination, we came to conclusion that our final variables will be V2, V5 and V1.

In [103]:
new_x = X[['V1','V2','V5']]
new_x.head()

Unnamed: 0,V1,V2,V5
0,1.2214,128.1012,180.956968
1,2.609743,85.891549,52.545356
2,2.682163,99.782456,71.578043
3,3.196969,115.189168,0.269863
4,4.790932,144.487763,-0.03357


#### Generating New Model

We normally split data by 0.8 for train and 0.2 for test, but I will assume that it is required that we still have equal amounts of negatives and positives like in the Task 1.

In [104]:
X_train, X_test, y_train, y_test = train_test_split(new_x, y, test_size = 0.5, stratify = y)

In [84]:
new_model = regression.fit(X_train, y_train)
y_pred_new = model.predict(X_test)

#### New Scores

Here we calculate scores and append the new scores into our dataframe. 
0th row is first and 1st row is second logistic model results.

In [100]:
new_scores = {}
new_scores['Accuracy'] = accuracy_score(y_pred_new, y_pred)
new_scores['F1'] = f1_score(y_pred_new, y_pred)
new_scores['Recall'] = recall_score(y_pred_new, y_pred)
new_scores['Precision'] = precision_score(y_pred_new, y_pred)

all_scores = scores.append(new_scores, ignore_index=True)
all_scores

Unnamed: 0,Accuracy,F1,Recall,Precision
0,0.573333,0.288889,0.276596,0.302326
1,0.6,0.318182,0.311111,0.325581


In conclusion, we can see our accuracy, F1, Recall and Precision has increasd slightly after doing a backward step elimination. This shows that our new model has increased performance and accuracy now.