# DS-SF-33 | Unit Project 3: Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [31]:
import os
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
from sklearn import linear_model

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)



In [2]:
print (os.path.join('..', '..', 'dataset', 'dataset-ucla-admissions.csv'))

../../dataset/dataset-ucla-admissions.csv


In [3]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'dataset-ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether an applicant was admitted.

In [4]:
# TODO
pd.crosstab(df.prestige,df.admit, margins = True)

admit,0,1,All
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,28,33,61
2.0,95,53,148
3.0,93,28,121
4.0,55,12,67
All,271,126,397


In [5]:
pd.crosstab(df.prestige,df.admit,normalize=True, margins = True)

admit,0,1,All
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.070529,0.083123,0.153652
2.0,0.239295,0.133501,0.372796
3.0,0.234257,0.070529,0.304786
4.0,0.138539,0.030227,0.168766
All,0.68262,0.31738,1.0


In [6]:
pd.crosstab(df.prestige,df.admit,normalize=True).sum()

admit
0    0.68262
1    0.31738
dtype: float64

In [7]:
pd.crosstab(df.prestige,df.admit,normalize='index')

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.459016,0.540984
2.0,0.641892,0.358108
3.0,0.768595,0.231405
4.0,0.820896,0.179104


In [8]:
pd.crosstab(df.prestige,df.admit,normalize='columns')

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.103321,0.261905
2.0,0.350554,0.420635
3.0,0.343173,0.222222
4.0,0.202952,0.095238


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [9]:
# Let's rescast prestige as int....
df.prestige = df.prestige.astype(int)
df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [10]:
one_hot = pd.get_dummies(df.prestige, prefix = 'prestige')
one_hot

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
395,0,1,0,0
396,0,0,1,0
397,0,1,0,0
398,0,1,0,0


In [11]:
df = df.join(other = one_hot)
df

Unnamed: 0,admit,gre,gpa,prestige,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,3,0,0,1,0
1,1,660.0,3.67,3,0,0,1,0
2,1,800.0,4.00,1,1,0,0,0
3,1,640.0,3.19,4,0,0,0,1
4,0,520.0,2.93,4,0,0,0,1
...,...,...,...,...,...,...,...,...
395,0,620.0,4.00,2,0,1,0,0
396,0,560.0,3.04,3,0,0,1,0
397,0,460.0,2.63,2,0,1,0,0
398,0,700.0,3.65,2,0,1,0,0


> ### Question 3.  How many of these binary variables do we need for modeling?

Answer: we created 4 but we only needed 3

> ### Question 4.  Why are we doing this?

Answer: we are doing this because of multicolinearity meaning 2 or more variables are related. they are linearly dependent on each other.

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [12]:
# TODO
df.drop('prestige', inplace = True, axis = 1)
df

Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.00,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0,1,0,0
396,0,560.0,3.04,0,0,1,0
397,0,460.0,2.63,0,1,0,0
398,0,700.0,3.65,0,1,0,0


## Part C.  Hand calculating odds ratios

Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether an applicant was admitted.

In [13]:
# TODO
df2 = df[['admit', 'prestige_1']]
#[df.prestige_ == 1]
pd.crosstab(df.prestige_1, df2.admit, normalize = 'index')

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.723214,0.276786
1,0.459016,0.540984


In [47]:
df3 = df[df.prestige_1 == 1]
df3.admit.value_counts()

1    33
0    28
Name: admit, dtype: int64

> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [15]:
# TODO
#odds = p(admitted)/ (1-p(admitted))
tst = pd.crosstab(df2.prestige_1, df2.admit, normalize = "index")
p = tst.loc[1,1]
p


0.54098360655737709

In [27]:
odds_1 = p / (1-p)
odds_1 = 33 / 28 
odds_1

1.1785714285714286

> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [111]:
# TODO
p = tst.loc[0,1]
p

0.2767857142857143

In [28]:
odds_0 = 93/243
odds_0

0.38271604938271603

> ### Question 9.  Finally, what's the odds ratio?

In [29]:
# TODO
odds_1/odds_0

3.079493087557604

> ### Question 10.  Write this finding in a sentence.

Answer: A your prestige 1 goes up your odds ratio will be 3.079 meaning when you from prestige 1 to 2 your chances of being admitted will be higher.

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentence.

In [42]:
# TODO
tst = pd.crosstab(df2.prestige_1, df2.admit, normalize = "index")
p = tst.loc[1,1]
p


0.54098360655737709

Answer:The odds of being admitted to UCLA from the least prestige school is 33/ 126 which is 26.2 percent

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model predicting admission into UCLA using `gre`, `gpa`, and the `prestige` of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [16]:
# TODO
import statsmodels.api as sm
formula = "admit ~ gre + gpa + prestige_2 + prestige_3 + prestige_4"
#model = smf.glm(formula = formula, data = df, family = sm.families.Binomial())
#result = model.fit()
model = sm.Logit(df.admit, df[['gre','prestige_2', 'prestige_3', 'prestige_4']])
model.fit()
result = model.fit()
print (result.summary())



Optimization terminated successfully.
         Current function value: 0.589704
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.589704
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  397
Model:                          Logit   Df Residuals:                      393
Method:                           MLE   Df Model:                            3
Date:                Mon, 24 Apr 2017   Pseudo R-squ.:                 0.05629
Time:                        16:52:42   Log-Likelihood:                -234.11
converged:                       True   LL-Null:                       -248.08
                                        LLR p-value:                 3.759e-06
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
gre   

> ### Question 13.  Print the model's summary results.

In [17]:
# TODO
formula = "admit ~ gre + gpa + prestige_2 + prestige_3 + prestige_4"
model = smf.glm(formula = formula, data = df, family = sm.families.Binomial())
result = model.fit()
print (result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  admit   No. Observations:                  397
Model:                            GLM   Df Residuals:                      391
Model Family:                Binomial   Df Model:                            5
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -227.82
Date:                Mon, 24 Apr 2017   Deviance:                       455.64
Time:                        16:52:50   Pearson chi2:                     394.
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -3.8769      1.142     -3.393      0.001        -6.116    -1.638
gre            0.0022      0.001      2.028      0.0

> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [18]:
# TODO
print ('coefficients')
print (result.params)
print(np.exp(result.params))
print()

coefficients
Intercept    -3.876854
gre           0.002218
gpa           0.779337
prestige_2   -0.680137
prestige_3   -1.338677
prestige_4   -1.553411
dtype: float64
Intercept     0.020716
gre           1.002221
gpa           2.180027
prestige_2    0.506548
prestige_3    0.262192
prestige_4    0.211525
dtype: float64



> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

Answer: the odds ratio of being admitted at prestige = 2 will go down to 50%

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer: The odds ratio for gpa will go down 2.18 percent as your gpa goes down.

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [50]:
# TODO
dict = {'gre' : 800, 'gpa': 4, 'prestige_2': 1, 'prestige_3':0, 'prestige_4' :0}
test = pd.DataFrame.from_dict(dict, orient = 'index').T
test
result.predict(test)

array([ 0.58299512])

In [56]:
dict = {'gre' : 800, 'gpa': 4, 'prestige_2': 0, 'prestige_3': 1, 'prestige_4' :0}
test = pd.DataFrame.from_dict(dict, orient = 'index').T
test
result.predict(test)

array([ 0.41983282])

In [54]:
dict = {'gre' : 800, 'gpa': 4, 'prestige_2': 0, 'prestige_3':0, 'prestige_4' :1}
test = pd.DataFrame.from_dict(dict, orient = 'index').T
test
result.predict(test)

array([ 0.36860803])

Answer: Prestige 2 has the highest percentage of admittence

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [34]:
# TODO
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression(C= 10**2)
df.head()


Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.0,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1


In [35]:
# features given by X
# features given by y
X = df[['gre', 'gpa', 'prestige_2', 'prestige_3', 'prestige_4']]
y = df.admit
lm.fit(X,y)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [36]:
# TODO
dat = pd.DataFrame(data = lm.coef_, columns = X.columns)
dat = dat.T
dat.columns = ['coef']
dat['odds_ratio'] = dat.coef.apply(np.exp)
dat

Unnamed: 0,coef,odds_ratio
gre,0.002158,1.002161
gpa,0.673155,1.960413
prestige_2,-0.628822,0.533219
prestige_3,-1.252227,0.285867
prestige_4,-1.568792,0.208297


Answer:

> ### Question 20.  Again, assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [37]:
# TODO
sample = pd.DataFrame.from_dict({'gre' : [800,800,800,800], \
                                 'gpa' : [4,4,4,4], \
                                 'prestige_2' : [0,1,0,0],
                                 'prestige_3' : [0,0,1,0],
                                 'prestige_4' : [0,0,0,1]})
test = sample[['gre', 'gpa', 'prestige_2', 'prestige_3', 'prestige_4']]

predictions = lm.predict_proba(test)
print(predictions)

[[ 0.28814605  0.71185395]
 [ 0.43153702  0.56846298]
 [ 0.58608936  0.41391064]
 [ 0.66024514  0.33975486]]


Answer: