# Project 3

In this project, you will perform a logistic regression on admissions data

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np

  from pandas.core import datetools


In [3]:
# Read the raw admissions records, then build a working frame without any
# row that has a missing value; the untouched data stays available as df_raw.
df_raw = pd.read_csv("../assets/admissions.csv")
df = df_raw.dropna(axis=0, how="any")
df.head(5)

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.0,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0


## Part 1. Frequency Tables

#### 1. Let's create a frequency table of our variables

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
# Note: this is a numeric summary, not a frequency table; the actual
# frequency tables are built with pd.crosstab in the cells that follow.
df.describe()

Unnamed: 0,admit,gre,gpa,prestige
count,397.0,397.0,397.0,397.0
mean,0.31738,587.858942,3.392242,2.488665
std,0.466044,115.717787,0.380208,0.947083
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.4,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [69]:
# frequency table for prestige and whether or not someone was admitted

#check this link: http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-19_17.html

In [53]:
# One-way frequency table: number of applicants at each prestige level.
prestige = pd.crosstab(df['prestige'], "count")
prestige

col_0,count
prestige,Unnamed: 1_level_1
1.0,61
2.0,148
3.0,121
4.0,67


In [39]:
print (prestige.sum(), "\n")   # sum the counts (should equal the number of rows in df)
print (prestige.shape, "\n")   # check number of rows and columns
prestige.iloc[1:5]             # positional slice starting at row 1; the table has only 4 rows, so this returns rows 1-3

col_0
count    397
dtype: int64 

(4, 1) 



col_0,count
prestige,Unnamed: 1_level_1
2.0,148
3.0,121
4.0,67


In [38]:
# Turn the counts into proportions by dividing by the column total.
prestige.div(prestige.sum())

col_0,count
prestige,Unnamed: 1_level_1
1.0,0.153652
2.0,0.372796
3.0,0.304786
4.0,0.168766


In [52]:
# One-way frequency table of the admit column.
admit = pd.crosstab(df['admit'], "count")
admit

col_0,count
admit,Unnamed: 1_level_1
0,271
1,126


In [50]:
# Two-way table: admit outcome (rows) against prestige level (columns),
# relabelled so the table is readable on its own.
comb = pd.crosstab(df['admit'], df['prestige'])
comb.index = ['admit 0', 'admit 1']
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4
admit 0,28,95,93,55
admit 1,33,53,28,12


In [68]:
# margins=True appends the marginal counts: one total per row and per column.
# The extra row/column get explicit labels so they can be looked up by name later.
comb = pd.crosstab(df['admit'], df['prestige'], margins=True)
comb.index = ['admit 0', 'admit 1', 'coltotal']
comb.columns = ['prestige 1', 'prestige 2', 'prestige 3', 'prestige 4', 'rowtotal']
comb

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,28,95,93,55,271
admit 1,33,53,28,12,126
coltotal,61,148,121,67,397


In [67]:
# Divide every cell by the grand total (the bottom-right cell) to get each
# cell's share of all applicants.
comb.div(comb.at["coltotal", "rowtotal"])

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.070529,0.239295,0.234257,0.138539,0.68262
admit 1,0.083123,0.133501,0.070529,0.030227,0.31738
coltotal,0.153652,0.372796,0.304786,0.168766,1.0


In [61]:
# Divide each column by its own total (the "coltotal" row). Every column then
# sums to 1, giving the admit / not-admit split within each prestige class.
comb.div(comb.loc["coltotal"], axis="columns")

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.459016,0.641892,0.768595,0.820896,0.68262
admit 1,0.540984,0.358108,0.231405,0.179104,0.31738
coltotal,1.0,1.0,1.0,1.0,1.0


In [66]:
# Row proportions: divide each row by its own total (the "rowtotal" column).
# The plain "/" operator would match the Series labels against the column
# labels, so use DataFrame.div with axis="index" (same as axis=0) to match
# them against the row labels instead.
comb.div(comb["rowtotal"], axis="index")

Unnamed: 0,prestige 1,prestige 2,prestige 3,prestige 4,rowtotal
admit 0,0.103321,0.350554,0.343173,0.202952,1.0
admit 1,0.261905,0.420635,0.222222,0.095238,1.0
coltotal,0.153652,0.372796,0.304786,0.168766,1.0


In [70]:
# Alternative: transpose the table first so the row labels become column
# labels; the ordinary division operator then lines up with the row totals.
# The result is the transposed version of the row-proportion table.
comb.transpose() / comb["rowtotal"]

Unnamed: 0,admit 0,admit 1,coltotal
prestige 1,0.103321,0.261905,0.153652
prestige 2,0.350554,0.420635,0.372796
prestige 3,0.343173,0.222222,0.304786
prestige 4,0.202952,0.095238,0.168766
rowtotal,1.0,1.0,1.0


In [81]:
# crosstab() also accepts a list of variables for one axis, producing a table
# with multi-level column labels. Such tables are harder to read, but they can
# expose interactions between several variables at once.
admit_gre_gpa_prestige = pd.crosstab(
    df['admit'],
    [df['prestige'], df['gpa'], df['gre']],   # column levels, outermost first
    margins=True,                             # include row and column totals
)

admit_gre_gpa_prestige

prestige,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,All
gpa,2.42,2.55,2.81,2.9,2.91,2.96,2.98,2.98,3.02,3.02,...,3.74,3.77,3.78,3.87,3.88,3.92,3.94,3.95,4.0,Unnamed: 21_level_1
gre,680.0,480.0,760.0,340.0,480.0,420.0,460.0,560.0,420.0,480.0,...,740.0,580.0,540.0,780.0,500.0,420.0,620.0,500.0,800.0,Unnamed: 21_level_2
admit,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,0,1,0,1,0,1,1,0,1,0,...,1,1,1,1,1,1,1,1,1,271
1,1,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,126
All,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,397


In [82]:
#notice that by passing several variables to the columns argument,
#the resulting table has columns categorized by prestige, gpa and gre.
#indexing with a value of the outermost column level (prestige) returns a section of the table instead of an individual column:

admit_gre_gpa_prestige[2]        # Get the subtable under prestige 2

gpa,2.42,2.52,2.62,2.62,2.63,2.67,2.69,2.70,2.71,2.73,...,3.95,3.98,4.00,4.00,4.00,4.00,4.00,4.00,4.00,4.00
gre,560.0,560.0,480.0,520.0,460.0,480.0,420.0,540.0,500.0,520.0,...,660.0,680.0,480.0,520.0,580.0,620.0,660.0,700.0,780.0,800.0
admit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1,1,0,1,1,0,1,1,1,1,...,0,0,1,0,1,1,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,...,1,1,0,1,0,0,1,0,1,1
All,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Part 2. Return of dummy variables

#### 2.1 Create class or dummy variables for prestige 

In [74]:
#notes from previous lesson on dummy variables... 

In [None]:
# NOTE(review): pasted notes from a previous lesson. `data` is not defined
# anywhere before this cell in this notebook, so it fails on a fresh
# Restart & Run All -- confirm whether it should run here at all.

# set a seed for reproducibility
np.random.seed(12345)

# create a boolean array (one entry per row of data) in which roughly half are True
nums = np.random.rand(len(data))
mask_large = nums > 0.5

# initially set Size to small, then change roughly half to be large
data['Size'] = 'small'
data.loc[mask_large, 'Size'] = 'large'
data.head()

In [None]:
# create a new Series called IsLarge that encodes Size numerically
# ('small' -> 0, 'large' -> 1); requires the 'Size' column to exist on data
data['IsLarge'] = data['Size'].map({'small':0, 'large':1})
data.head()

In [None]:
# NOTE(review): pasted notes from a previous lesson; relies on a `data` frame
# that is not defined before this cell in this notebook.

# set a seed for reproducibility
np.random.seed(123456)

# assign roughly one third of observations to each group
# (rows whose draw is at or below 0.33 match neither mask and keep the default 'rural')
nums = np.random.rand(len(data))
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban = nums > 0.66
data['Area'] = 'rural'
data.loc[mask_suburban, 'Area'] = 'suburban'
data.loc[mask_urban, 'Area'] = 'urban'
data.head()

In [80]:
# Dummy-encode prestige. drop_first=True omits the column for the first level,
# so that level becomes the baseline the other dummies are compared against.
dummies = pd.get_dummies(df['prestige'], prefix='prestige', drop_first=True)

# Attach the dummy columns next to the original columns
# (axis="columns" is the same as axis=1; axis=0 would stack rows instead).
df1 = pd.concat([df, dummies], axis="columns")
df1.head()

Unnamed: 0,admit,gre,gpa,prestige,prestige_2.0,prestige_3.0,prestige_4.0
0,0,380.0,3.61,3.0,0,1,0
1,1,660.0,3.67,3.0,0,1,0
2,1,800.0,4.0,1.0,0,0,0
3,1,640.0,3.19,4.0,0,0,1
4,0,520.0,2.93,4.0,0,0,1


#### 2.2 When modeling our class variables, how many do we need? 



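Answer: 3. Prestige has four levels, so three dummy variables are enough; the omitted level serves as the baseline category.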

## Part 3. Hand calculating odds ratios

Develop your intuition about expected outcomes by hand calculating odds ratios.

In [None]:
# One 0/1 indicator column per prestige level. prestige is stored as a float,
# so cast it to int first to get the column names prestige_1 ... prestige_4.
# (dummy_ranks was previously used without ever being defined.)
dummy_ranks = pd.get_dummies(df['prestige'].astype(int), prefix='prestige').astype(int)

# Keep the outcome and the two continuous predictors, plus every prestige
# dummy (prestige_1 onward) for the hand calculations.
# .loc replaces the removed .ix indexer; print() is required on Python 3.
cols_to_keep = ['admit', 'gre', 'gpa']
handCalc = df[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_1':])
print(handCalc.head())

In [None]:
#crosstab prestige 1 admission 
# frequency table cutting prestige and whether or not someone was admitted


#### 3.1 Use the cross tab above to calculate the odds of being admitted to grad school if you attended a #1 ranked college

#### 3.2 Now calculate the odds of admission if you did not attend a #1 ranked college

#### 3.3 Calculate the odds ratio

#### 3.4 Write this finding in a sentence:

Answer: 

#### 3.5 Print the cross tab for prestige_4

#### 3.6 Calculate the OR 

#### 3.7 Write this finding in a sentence

Answer:

## Part 4. Analysis

In [None]:
# create a clean data frame for the regression
# One 0/1 indicator column per prestige level. prestige is stored as a float,
# so cast it to int first to get the column names prestige_1 ... prestige_4.
# (dummy_ranks was previously used without ever being defined.)
dummy_ranks = pd.get_dummies(df['prestige'].astype(int), prefix='prestige').astype(int)

# prestige_1 is left out on purpose: it is the baseline category, so only the
# dummies from prestige_2 onward enter the model.
# .loc replaces the removed .ix indexer; print() is required on Python 3.
cols_to_keep = ['admit', 'gre', 'gpa']
data = df[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_2':])
print(data.head())

We're going to add a constant term for our Logistic Regression. The statsmodels function we're going to be using requires that intercepts/constants are specified explicitly.

In [None]:
# manually add the intercept: a constant column of 1.0 on every row of data
# (this mutates data in place; re-running the cell just rewrites the same value)
data['intercept'] = 1.0

#### 4.1 Set the covariates to a variable called train_cols

#### 4.2 Fit the model

#### 4.3 Print the summary results

#### 4.4 Calculate the odds ratios of the coefficients and their 95% confidence intervals

hint 1: np.exp(X)

hint 2: conf['OR'] = params
        
           conf.columns = ['2.5%', '97.5%', 'OR']

#### 4.5 Interpret the OR of Prestige_2

Answer: 

#### 4.6 Interpret the OR of GPA

Answer: 

## Part 5: Predicted probabilities


As a way of evaluating our classifier, we're going to recreate the dataset with every logical combination of input values. This will allow us to see how the predicted probability of admission increases or decreases across different variables. First we're going to generate the combinations using a helper function called cartesian (defined in the next code cell).

We're going to use np.linspace to create a range of values for "gre" and "gpa". This creates a range of linearly spaced values from a specified min and maximum value--in our case just the min/max observed values.

In [None]:
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray, optional
        Array to place the cartesian product in. When omitted, a new array
        is allocated with the dtype of the first input array.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays. M is the product of the input sizes; it is 0
        (an empty result) when any input array is empty.

    Notes
    -----
    The output dtype is taken from the first array only, so values of the
    later arrays are cast to that dtype when they are written into the result.

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """

    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    # Total number of combinations, kept as a plain int so it can be used
    # for shapes and slice bounds.
    n = int(np.prod([x.size for x in arrays]))
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    # An empty input means an empty product; returning here also avoids
    # dividing by a zero size below.
    if n == 0:
        return out

    # Each value of the first array heads a block of m consecutive rows.
    # Floor division is required: "/" yields a float on Python 3, which is
    # rejected both as a repeat count and as a slice bound.
    m = n // arrays[0].size
    out[:, 0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        # Fill the remaining columns of the first block recursively (out[0:m, 1:]
        # is a view, so the recursion writes straight into out) ...
        cartesian(arrays[1:], out=out[0:m, 1:])
        # ... then copy that block under every other value of the first array.
        for j in range(1, arrays[0].size):
            out[j*m:(j+1)*m, 1:] = out[0:m, 1:]
    return out

In [None]:
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max
# (print() calls replace the Python 2 print statements, which are a
# SyntaxError on Python 3)
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
print(gres)
# array([ 220.        ,  284.44444444,  348.88888889,  413.33333333,
#         477.77777778,  542.22222222,  606.66666667,  671.11111111,
#         735.55555556,  800.        ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print(gpas)
# array([ 2.26      ,  2.45333333,  2.64666667,  2.84      ,  3.03333333,
#         3.22666667,  3.42      ,  3.61333333,  3.80666667,  4.        ])


# enumerate all possibilities: every gre value x every gpa value x each
# prestige level (1-4) x a constant 1.0 column for the intercept
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))

#### 5.1 Recreate the dummy variables

In [None]:
# recreate the dummy variables

# keep only what we need for making predictions


#### 5.2 Make predictions on the enumerated dataset

#### 5.3 Interpret findings for the last 4 observations

Answer: 

## Bonus

Plot the probability of being admitted into graduate school, stratified by GPA and GRE score.