# Project 3

In this project, you will perform a logistic regression on admissions data

In [82]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np

In [83]:
df = pd.read_csv("../assets/admissions.csv")
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.0,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
admit       400 non-null int64
gre         398 non-null float64
gpa         398 non-null float64
prestige    399 non-null float64
dtypes: float64(3), int64(1)
memory usage: 12.6 KB


In [85]:
df = df.dropna()

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 399
Data columns (total 4 columns):
admit       397 non-null int64
gre         397 non-null float64
gpa         397 non-null float64
prestige    397 non-null float64
dtypes: float64(3), int64(1)
memory usage: 15.5 KB


## Part 1. Frequency Tables

#### 1. Let's create a frequency table of our variables.  Look at the documentation for pd.crosstab

In [87]:
prestige = pd.crosstab(index=df['prestige'], columns='count')
prestige
#pd.crosstab(index=df['prestige'], columns='count').sum()

col_0,count
prestige,Unnamed: 1_level_1
1.0,61
2.0,148
3.0,121
4.0,67


In [88]:
print (prestige.sum(), "\n")   # sum the counts
print (prestige.shape, "\n")   # check number of rows and co
prestige.iloc[2:5]             # slice rows 3-4

col_0
count    397
dtype: int64 

(4, 1) 



col_0,count
prestige,Unnamed: 1_level_1
3.0,121
4.0,67


In [89]:
prestige/prestige.sum()

col_0,count
prestige,Unnamed: 1_level_1
1.0,0.153652
2.0,0.372796
3.0,0.304786
4.0,0.168766


In [90]:
admit = pd.crosstab(index=df['admit'], columns="count")
admit

col_0,count
admit,Unnamed: 1_level_1
0,271
1,126


In [91]:
comb = pd.crosstab(index=df['admit'], columns=df['prestige'])
comb.columns = ['prestige 1','prestige 2','prestige 3','prestige 4']
comb.index= ['admit 0','admit 1']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4
admit 0,28,95,93,55
admit 1,33,53,28,12


In [92]:
#you can get the marginal counts (totals for each row and column) by including the argument margins=True:

comb = pd.crosstab(index=df['admit'], columns=df['prestige'], margins=True)
comb.columns = ['prestige 1','prestige 2','prestige 3','prestige 4', 'rowtotal']
comb.index= ['admit 0','admit 1', 'coltotal']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,28,95,93,55,271
admit 1,33,53,28,12,126
coltotal,61,148,121,67,397


In [93]:
comb.sum()-comb.loc["coltotal"]

prestige 1     61
prestige 2    148
prestige 3    121
prestige 4     67
rowtotal      397
dtype: int64

In [94]:
#to get the total proportion of counts in each cell, divide the table by the grand total:

comb/comb.loc["coltotal","rowtotal"]

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.070529,0.239295,0.234257,0.138539,0.68262
admit 1,0.083123,0.133501,0.070529,0.030227,0.31738
coltotal,0.153652,0.372796,0.304786,0.168766,1.0


In [95]:
#to get the proportion of counts along each column (in this case, the admittance rate within each prestige class) divide by the column totals:

comb/comb.loc["coltotal"] 

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.459016,0.641892,0.768595,0.820896,0.68262
admit 1,0.540984,0.358108,0.231405,0.179104,0.31738
coltotal,1.0,1.0,1.0,1.0,1.0


In [96]:
#to get the proportion of counts along each row divide by the row totals. 
#the division operator functions on a row-by-row basis when used on DataFrames by default. 
#in this case we want to divide each column by the rowtotals column. 
#to get division to work on a column by column basis, use df.div() with the axis set to 0 (or "index"):

comb.div(comb["rowtotal"], axis=0)

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.103321,0.350554,0.343173,0.202952,1.0
admit 1,0.261905,0.420635,0.222222,0.095238,1.0
coltotal,0.153652,0.372796,0.304786,0.168766,1.0


In [97]:
#alternatively, you can transpose the table with df.T to swap rows and columns and perform row by row division as normal:

comb.T/comb["rowtotal"]

Unnamed: 0,admit 0,admit 1,coltotal
prestige 1,0.103321,0.261905,0.153652
prestige 2,0.350554,0.420635,0.372796
prestige 3,0.343173,0.222222,0.304786
prestige 4,0.202952,0.095238,0.168766
rowtotal,1.0,1.0,1.0


In [98]:
#the crosstab() function lets you create tables out of more than two categories. 
#higher dimensional tables can be a little confusing to look at, 
#but they can also yield finer-grained insight into interactions between multiple variables:

admit_gre_gpa_prestige = pd.crosstab(index=df['admit'], 
                             columns=[df['prestige'],
                                      df['gpa'],
                                      df['gre']],
                             margins=True)   # Include row and column totals

admit_gre_gpa_prestige

prestige,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,All
gpa,2.42,2.55,2.81,2.9,2.91,2.96,2.98,2.98,3.02,3.02,...,3.74,3.77,3.78,3.87,3.88,3.92,3.94,3.95,4.0,Unnamed: 21_level_1
gre,680.0,480.0,760.0,340.0,480.0,420.0,460.0,560.0,420.0,480.0,...,740.0,580.0,540.0,780.0,500.0,420.0,620.0,500.0,800.0,Unnamed: 21_level_2
admit,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,0,1,0,1,0,1,1,0,1,0,...,1,1,1,1,1,1,1,1,1,271
1,1,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,126
All,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,397


In [99]:
#notice that by passing a second or third variable to the columns argument, 
#the resulting table has columns categorized by both gre, gpa and prestige. 
#the outermost index (prestige) returns sections of the table instead of individual columns:

admit_gre_gpa_prestige[2]        # Get the subtable under prestige 2

gpa,2.42,2.52,2.62,2.62,2.63,2.67,2.69,2.70,2.71,2.73,...,3.95,3.98,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00
gre,560.0,560.0,480.0,520.0,460.0,480.0,420.0,540.0,500.0,520.0,...,660.0,680.0,480.0,520.0,580.0,620.0,660.0,700.0,780.0,800.0
admit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1,1,0,1,1,0,1,1,1,1,...,0,0,1,0,1,1,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,...,1,1,0,1,0,0,1,0,1,1
All,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Part 2. Return of dummy variables

In [100]:
#notes from previous lesson on dummy variables... 

In [101]:
# set a seed for reproducibility
np.random.seed(12345)

# create a Series of booleans in which roughly half are True
nums = np.random.rand(len(data))
mask_large = nums > 0.5

# initially set Size to small, then change roughly half to be large
data['size'] = 'small'
data.loc[mask_large, 'size'] = 'large'
data.head()

# create a new series called new_ser
data['new_ser'] = data['size'].map({'small':0, 'large':1})
data.head()

Unnamed: 0,admit,gre,gpa,Prestige2,Prestige3,Prestige4,size,new_ser
0,0,380.0,3.61,0,1,0,large,1
1,1,660.0,3.67,0,1,0,small,0
2,1,800.0,4.0,0,0,0,small,0
3,1,640.0,3.19,0,0,1,small,0
4,0,520.0,2.93,0,0,1,large,1


In [102]:
# set a seed for reproducibility
np.random.seed(123456)

# assign roughly one third of observations to each group
nums = np.random.rand(len(data))
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban = nums > 0.66
data['area'] = 'rural'
data.loc[mask_suburban, 'area'] = 'suburban'
data.loc[mask_urban, 'area'] = 'urban'
data.head()

Unnamed: 0,admit,gre,gpa,Prestige2,Prestige3,Prestige4,size,new_ser,area
0,0,380.0,3.61,0,1,0,large,1,rural
1,1,660.0,3.67,0,1,0,small,0,urban
2,1,800.0,4.0,0,0,0,small,0,rural
3,1,640.0,3.19,0,0,1,small,0,urban
4,0,520.0,2.93,0,0,1,large,1,suburban


In [103]:
# create four dummy variables using get_dummies, then exclude the first dummy column
dummies = pd.get_dummies(df['prestige'], prefix='prestige', drop_first=True)

# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
df1 = pd.concat([df, dummies], axis=1)
df1.head()

Unnamed: 0,admit,gre,gpa,prestige,prestige_2.0,prestige_3.0,prestige_4.0
0,0,380.0,3.61,3.0,0,1,0
1,1,660.0,3.67,3.0,0,1,0
2,1,800.0,4.0,1.0,0,0,0
3,1,640.0,3.19,4.0,0,0,1
4,0,520.0,2.93,4.0,0,0,1


#### 2.1 Create class or dummy variables for prestige 

In [104]:
prestige_dummies = pd.get_dummies(df['prestige'])
prestige_dummies.head()

Unnamed: 0,1.0,2.0,3.0,4.0
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1


In [105]:
prestige_dummies.rename(columns={1.0: 'Prestige1', 2.0: 'Prestige2', 3.0: 'Prestige3', 4.0: 'Prestige4'}, inplace=True)
prestige_dummies.head()

Unnamed: 0,Prestige1,Prestige2,Prestige3,Prestige4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1


#### 2.2 When modeling our class variables, how many do we need? 



Answer: 3 dummies are needed, always 1 less than the number of class variables

## Part 3. Hand calculating odds ratios

Develop your intuition about expected outcomes by hand calculating odds ratios.

In [106]:
cols_to_keep = ['admit', 'gre', 'gpa']
handcalc = df[cols_to_keep].join(prestige_dummies)
handcalc.head()

Unnamed: 0,admit,gre,gpa,Prestige1,Prestige2,Prestige3,Prestige4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.0,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1


In [107]:
#discovery calcs:

len(handcalc['admit'])
len(handcalc[handcalc['admit']==0])
handcalc['admit'].sum()
len(handcalc[handcalc['Prestige1']==0])
handcalc['Prestige1'].sum()
handcalc['Prestige1'].value_counts()

0    336
1     61
Name: Prestige1, dtype: int64

In [108]:
#it's unclear why the below code returns the columns in the incorrect order, i.e. column 'prestige 4' is really 'prestige 1'?

comb = pd.crosstab(index=handcalc['admit'], columns=[handcalc['Prestige1'],handcalc['Prestige2'],handcalc['Prestige3'],handcalc['Prestige4']], margins=True)
comb.columns = ['prestige 1','prestige 2','prestige 3','prestige 4', 'rowtotal']
comb.index = ['admit 0','admit 1', 'coltotal']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,55,93,95,28,271
admit 1,12,28,53,33,126
coltotal,67,121,148,61,397


In [109]:
pd.crosstab(df['admit'], df['prestige'], rownames=['admit'])

prestige,1.0,2.0,3.0,4.0
admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,28,95,93,55
1,33,53,28,12


In [110]:
comb = pd.crosstab(index=df['admit'], columns=df['prestige'])
comb.columns = ['prestige 1','prestige 2','prestige 3','prestige 4']
comb.index = ['admit 0','admit 1']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4
admit 0,28,95,93,55
admit 1,33,53,28,12


In [111]:
handcalc

Unnamed: 0,admit,gre,gpa,Prestige1,Prestige2,Prestige3,Prestige4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.00,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1
5,1,760.0,3.00,0,1,0,0
6,1,560.0,2.98,1,0,0,0
7,0,400.0,3.08,0,1,0,0
8,1,540.0,3.39,0,0,1,0
9,0,700.0,3.92,0,1,0,0


In [112]:
handcalc.describe()

Unnamed: 0,admit,gre,gpa,Prestige1,Prestige2,Prestige3,Prestige4
count,397.0,397.0,397.0,397.0,397.0,397.0,397.0
mean,0.31738,587.858942,3.392242,0.153652,0.372796,0.304786,0.168766
std,0.466044,115.717787,0.380208,0.36107,0.484159,0.460898,0.375017
min,0.0,220.0,2.26,0.0,0.0,0.0,0.0
25%,0.0,520.0,3.13,0.0,0.0,0.0,0.0
50%,0.0,580.0,3.4,0.0,0.0,0.0,0.0
75%,1.0,660.0,3.67,0.0,1.0,1.0,0.0
max,1.0,800.0,4.0,1.0,1.0,1.0,1.0


In [113]:
prestige_1 = pd.crosstab(index=handcalc['Prestige1'], columns='count')
prestige_1

col_0,count
Prestige1,Unnamed: 1_level_1
0,336
1,61


In [114]:
admit = pd.crosstab(index=handcalc['admit'], columns='count')
admit

col_0,count
admit,Unnamed: 1_level_1
0,271
1,126


In [115]:
# crosstab 'prestige 1' admission, indexed by 'admit'
# frequency table cutting prestige and whether or not someone was admitted
comb1 = pd.crosstab(index=handcalc['admit'], columns=handcalc['Prestige1'])
comb1.columns = ['not prestige 1', 'prestige 1'] #what determines the order for the column names?
comb1.index = ['admit 0','admit 1']
comb1

Unnamed: 0,not prestige 1,prestige 1
admit 0,243,28
admit 1,93,33


In [116]:
# crosstab 'prestige 1' admission, indexed by 'prestige 1'
# frequency table cutting prestige and whether or not someone was admitted
comb2 = pd.crosstab(index=handcalc['Prestige1'], columns=handcalc['admit'])
comb2.columns = ['admit 0','admit 1'] 
comb2.index = ['not prestige 1', 'prestige 1'] #what determines the order for the column names?
comb2

Unnamed: 0,admit 0,admit 1
not prestige 1,243,93
prestige 1,28,33


In [117]:
comb3 = pd.crosstab(handcalc['admit'], handcalc['Prestige1'], rownames=['admit'], colnames=['Prestige1'])
comb3

Prestige1,0,1
admit,Unnamed: 1_level_1,Unnamed: 2_level_1
0,243,28
1,93,33


In [118]:
comb4 = pd.crosstab(handcalc['Prestige1'], handcalc['admit'], rownames=['Prestige1'], colnames=['admit'])
comb4

admit,0,1
Prestige1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,243,93
1,28,33


#### 3.1 Use the cross tab above to calculate the odds of being admitted to grad school if you attended a #1 ranked college

In [119]:
comb4.iloc[1][1] / (comb4.iloc[1].sum() - comb4.iloc[1][1])

1.1785714285714286

odds ratio:  33:28

In [120]:
comb1

Unnamed: 0,not prestige 1,prestige 1
admit 0,243,28
admit 1,93,33


In [121]:
comb1['prestige 1']

admit 0    28
admit 1    33
Name: prestige 1, dtype: int64

In [122]:
comb1['not prestige 1']

admit 0    243
admit 1     93
Name: not prestige 1, dtype: int64

In [123]:
comb1['prestige 1']['admit 1'==1].sum() 
comb1['prestige 1']['admit 1'==0].sum() 

#why do both the above result in 28?

28

#### 3.2 Now calculate the odds of admission if you did not attend a #1 ranked college

In [124]:
comb4

admit,0,1
Prestige1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,243,93
1,28,33


In [125]:
comb4.iloc[0][1] 
comb4.loc[0][1]
#why does iloc and loc retrun the same result?

93

In [126]:
93+243

336

In [127]:
comb4.iloc[0][1] / (comb4.iloc[0].sum() - comb4.iloc[0][1])

0.38271604938271603

#### 3.3 Calculate the odds ratio

odds ratio:  93:243

#### 3.4 Write this finding in a sentenance: 

Answer: 

#### 3.5 Print the cross tab for prestige_4

In [128]:
comb5 = pd.crosstab(handcalc['Prestige4'], handcalc['admit'], rownames=['Prestige4'], colnames=['admit'])
comb5

admit,0,1
Prestige4,Unnamed: 1_level_1,Unnamed: 2_level_1
0,216,114
1,55,12


#### 3.6 Calculate the Odds Ratio 

12:55

In [129]:
12/(67-12)

0.21818181818181817

#### 3.7 Write this finding in a sentence

Answer:

## Part 4. Analysis

In [130]:
prestige_dummies.iloc[:, 1:] #the first section in [] is rows, and the second section is columns

Unnamed: 0,Prestige2,Prestige3,Prestige4
0,0,1,0
1,0,1,0
2,0,0,0
3,0,0,1
4,0,0,1
5,1,0,0
6,0,0,0
7,1,0,0
8,0,1,0
9,1,0,0


In [131]:
# create a clean data frame for the regression
cols_to_keep = ['admit', 'gre', 'gpa']

# Dropping one of the dummy columns
data = df[cols_to_keep].join(prestige_dummies.iloc[:, 1:]) #how does this drop work? the first section in [] is rows, and the second section is columns
data.head()

Unnamed: 0,admit,gre,gpa,Prestige2,Prestige3,Prestige4
0,0,380.0,3.61,0,1,0
1,1,660.0,3.67,0,1,0
2,1,800.0,4.0,0,0,0
3,1,640.0,3.19,0,0,1
4,0,520.0,2.93,0,0,1


#### if using statsmodel

We will add a constant term for our Logistic Regression. 

The statsmodels function requires that intercepts/constants are specified explicitly.

In [132]:
#have not pulled in stats model for the regression...

# manually add the intercept
#data['intercept'] = 1.0

#### 4.1 Create the X and Y variables

In [133]:
feature_cols = ['gre', 'gpa', 'Prestige2', 'Prestige3', 'Prestige4']
X = data[feature_cols] #create X (we are passing a list of arrays, so we don't need to use double [[]] to ensure it reads as a df)
y = data['admit']  #create y

#### 4.2 Fit the model - 

 - Load sklearn's logistic regression
 - Create the regression object
 - Fit the model

In [134]:
#fitting a logistic regression model and storing the class predictions

from sklearn.linear_model import LogisticRegression #load sklearn's logistic regression

logreg = LogisticRegression() #create regression object

logreg.fit(X, y) #fit
pred = logreg.predict(X) #predict

logreg.score(X, y) #this returns the accuracy

0.7052896725440806

#### 4.3 Print the coefficients

In [135]:
print (logreg.coef_)
print (logreg.intercept_)
print (df.admit.mean())

[[ 0.00178497  0.23229458 -0.60347467 -1.17214957 -1.37729795]]
[-1.81701706]
0.31738035264483627


In [136]:
admit_perc = 126 / (271+126)
admit_perc

0.31738035264483627

- if you throw zero 0 for all the y pred's, you would be right 68% of the time, that is not very good
- if you throw 1 for all the y pred's, you would be right 32% of the time, that is not very good

In [137]:
print (pred) #this is the predicton

[0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [138]:
from sklearn.metrics import precision_score as ps
from sklearn.metrics import recall_score as rc
from sklearn.metrics import confusion_matrix as cm

In [139]:
ps(y, pred) #this gives the precision

#true postive / (true postive + false positive)

#the group you identified correctly divided by the total group you identified

#this number should be as high as possible

#for cases like high versus low ride day --- need clarity on this ???

0.6216216216216216

In [140]:
rc(y, pred) #this gives the recall

#true postive / (true postive + false negative) 

#the group you identified correctly divided by the group total

#this number should be as high as possible

#for cases like cancer, you want to have a better recall and minimize the chance for false negatives

0.18253968253968253

In [141]:
cm(y, pred) #this gives the confusion matrix, which is easier to read with labels

array([[257,  14],
       [103,  23]], dtype=int64)

In [142]:
23 / (14+23) 

#precision
#23 is the true positive predicted correctly 
#14 is the false positive, precicted as admitted but they are not actually admitted

0.6216216216216216

In [143]:
23 / (23+103)

#recall
#23 is the true positive predicted correctly 
#103 is the false negative, precicted as not admitted but they are actually admitted

0.18253968253968253

#### 4.4 Calculate the odds ratios of the coeffiencents

hint 1: np.exp(X)

#### (from original project)

hint 2: conf['OR'] = params
        
           conf.columns = ['2.5%', '97.5%', 'OR']

- odds = probability / (1 - probability) i.e. one specific outcome/the rest of the other outcomes
- probability = odds / (1 + odds) i.e. one specific outcome/all outcomes

- logistic regression, compresses the linear reression to fit between 0 and 1 ???
- the np.exp(X) reverts it back ???

In [144]:
logreg.coef_ #this is a list of a list, which is why you need to index into it

array([[ 0.00178497,  0.23229458, -0.60347467, -1.17214957, -1.37729795]])

In [145]:
#logodds = logreg.intercept_ + logreg.coef_[0] * ???
#logodds

In [146]:
#this gives the odds ratio
params = logreg.coef_[0]
np.exp(params)

array([1.00178657, 1.26149128, 0.546908  , 0.3097005 , 0.25225925])

In [147]:
# Convert log odds to odds.
odds = np.exp(params)
odds

array([1.00178657, 1.26149128, 0.546908  , 0.3097005 , 0.25225925])

In [148]:
# Convert odds to probability.
prob = odds/(1 + odds)
prob

array([0.50044624, 0.5578139 , 0.35354915, 0.23646666, 0.20144331])

#### 4.5 Interpret the OR of Prestige_2

http://www.biochemia-medica.com/content/odds-ratio-calculation-usage-and-interpretation

Answer: 

- ppl who went to prestige 2 school, are 54% more likely to get admitted than prestige 1 students 
- bc prestige 1 is the dummy variable, or your base

#### 4.6 Interpret the OR of GPA

Answer: 

- for one unit increase in gpage you are 1.26149128 times to be admitted

## Bonus

Plot the probability of being admitted into graduate school, stratified by GPA and GRE score.



**(from original project - not part of current project)**

## Part 5: Predicted probablities

As a way of evaluating our classifier, we're going to recreate the dataset with every logical combination of input values. This will allow us to see how the predicted probability of admission increases/decreases across different variables. First we're going to generate the combinations using a helper function called cartesian (above).

We're going to use np.linspace to create a range of values for "gre" and "gpa". This creates a range of linearly spaced values from a specified min and maximum value--in our case just the min/max observed values.

In [149]:
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.
    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray
        Array to place the cartesian product in.
    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays.
    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """

    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    n = np.prod([x.size for x in arrays])
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    m = n / arrays[0].size
    out[:,0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        cartesian(arrays[1:], out=out[0:m,1:])
        for j in xrange(1, arrays[0].size):
            out[j*m:(j+1)*m,1:] = out[0:m,1:]
    return out

In [152]:
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max 
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
print (gres)
# array([ 220.        ,  284.44444444,  348.88888889,  413.33333333,
#         477.77777778,  542.22222222,  606.66666667,  671.11111111,
#         735.55555556,  800.        ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print (gpas)
# array([ 2.26      ,  2.45333333,  2.64666667,  2.84      ,  3.03333333,
#         3.22666667,  3.42      ,  3.61333333,  3.80666667,  4.        ])


# enumerate all possibilities
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))

[220.         284.44444444 348.88888889 413.33333333 477.77777778
 542.22222222 606.66666667 671.11111111 735.55555556 800.        ]
[2.26       2.45333333 2.64666667 2.84       3.03333333 3.22666667
 3.42       3.61333333 3.80666667 4.        ]


TypeError: slice indices must be integers or None or have an __index__ method

#### 5.1 Recreate the dummy variables

In [None]:
# recreate the dummy variables

# keep only what we need for making predictions

#### 5.2 Make predictions on the enumerated dataset

#### 5.3 Interpret findings for the last 4 observations