# Logistic Regression Test

In [10]:
import pandas as pd
import numpy as np

#Get the data
#The data is available here - https://www.kaggle.com/mohansacharya/graduate-admissions

data = pd.read_csv("C:/Users/amontagut/Desktop/Python/Datasets/Graduate_Admission_Kaggle/Admission_Predict_Ver1.1.csv")

data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [11]:
#Data Cleaning

#Remove Serial No. column since it's the same as the index
data = data.drop(columns="Serial No.")


data

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


In [13]:
data.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [15]:
#Bin rows based on chance of admittance. >0.5 = 1, <0.5 =0
for x in data.index:
    if data["Chance of Admit "].loc[x] >= 0.5:
        data["Chance of Admit "].loc[x] =1
    else:
        data["Chance of Admit "].loc[x]=0




In [27]:
#Normalize Continuous Xs
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA']] = scaler.fit_transform(data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA']])

data

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1.819238,1.778865,0.775582,1.137360,1.098944,1.776806,1,1.0
1,0.667148,-0.031601,0.775582,0.632315,1.098944,0.485859,1,1.0
2,-0.041830,-0.525364,-0.099793,-0.377773,0.017306,-0.954043,1,1.0
3,0.489904,0.462163,-0.099793,0.127271,-1.064332,0.154847,1,1.0
4,-0.219074,-0.689952,-0.975168,-1.387862,-0.523513,-0.606480,0,1.0
...,...,...,...,...,...,...,...,...
495,1.376126,0.132987,1.650957,1.137360,0.558125,0.734118,1,1.0
496,1.819238,1.614278,1.650957,1.642404,1.639763,2.140919,1,1.0
497,1.198882,2.108041,1.650957,1.137360,1.639763,1.627851,1,1.0
498,-0.396319,-0.689952,0.775582,0.632315,1.639763,-0.242367,0,1.0


In [29]:
#Make sure everything worked
data["Chance of Admit "].value_counts()

#Looks like there are a lot more people who were admitted than not. Let's see if that affects our classifier.

1.0    463
0.0     37
Name: Chance of Admit , dtype: int64

# Build the Model

Train-test Split

In [30]:
data_copy = data

training_data = data_copy.sample(frac=0.70, random_state=0) #70/30 train/test split
testing_data = data_copy.drop(training_data.index)



In [44]:
training_X = training_data[training_data.columns[:-1]]
training_y = training_data["Chance of Admit "]

testing_X = testing_data[testing_data.columns[:-1]]
testing_y = testing_data["Chance of Admit "]

In [47]:
#Now let's build our model

from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression()
log_reg_model.fit(training_X, training_y)

training_results = log_reg_model.predict(training_X)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
###Assess performance on training data





array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [78]:
from sklearn.metrics import accuracy_score
#Get accuracy
training_accuracy = accuracy_score(training_results, training_y)

print("Training accuracy: ", training_accuracy)

#Confusion Matrix

#confusion_matrix(training_results, training_y)


#ROC
########TBD


Training accuracy:  0.9542857142857143


NameError: name 'fpr' is not defined

<Figure size 432x288 with 0 Axes>

In [82]:
#Test data set

testing_results = log_reg_model.predict(testing_X)

#Get accuracy
testing_accuracy = accuracy_score(testing_results, testing_y)

print("Testing accuracy: ", testing_accuracy)

#Confusion Matrix
confusion_matrix(testing_results, testing_y)



Testing accuracy:  0.9333333333333333


array([[  6,   1],
       [  9, 134]], dtype=int64)