<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Logistic Regression Lab


---

In [310]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [311]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score

## Load the college admissions data

---

To illustrate how to apply logistic regression, we will be using some basic college admissions data. The data only has a few columns:
- `admit`: a binary 1-0 variable indicating whether a student was admitted
- `gre`: the student's GRE score
- `gpa`: the student's GPA
- `prestige`: a rating for the "prestige" of the college

In [312]:
admissions = pd.read_csv(
    '../../../../resource-datasets/admissions/admissions.csv')

In [313]:
admissions.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.0,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0


## Clean the data

In [314]:
admissions.shape

(400, 4)

In [315]:
pd.set_option('display.max_rows',500, 'display.max_columns',10)

In [316]:
admissions.dtypes

admit         int64
gre         float64
gpa         float64
prestige    float64
dtype: object

In [317]:
admissions.isnull().sum()

admit       0
gre         2
gpa         2
prestige    1
dtype: int64

In [318]:
admissions.dropna(inplace=True)

In [319]:
admissions.isnull().sum()

admit       0
gre         0
gpa         0
prestige    0
dtype: int64

In [320]:
admissions.dtypes

admit         int64
gre         float64
gpa         float64
prestige    float64
dtype: object

### What are the probabilities of admittance by prestige?

In [321]:
prestige_prob = admissions.groupby('prestige').sum()
prestige_prob.drop(columns=['gre','gpa'],inplace=True)

In [322]:
admissions.groupby('prestige')['admit'].count()

prestige
1.0     61
2.0    148
3.0    121
4.0     67
Name: admit, dtype: int64

In [323]:
prestige_prob['prestige_count'] = admissions.groupby('prestige')['admit'].count()
prestige_prob

Unnamed: 0_level_0,admit,prestige_count
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,33,61
2.0,53,148
3.0,28,121
4.0,12,67


In [324]:
prestige_prob['P(admit|prestige)'] = prestige_prob.apply(lambda x: x['admit']/x['prestige_count'],axis=1)
prestige_prob

Unnamed: 0_level_0,admit,prestige_count,P(admit|prestige)
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,33,61,0.540984
2.0,53,148,0.358108
3.0,28,121,0.231405
4.0,12,67,0.179104


### Write a function to calculate odds and calculate the odds of admittance by prestige.

In [325]:
def odds_calc(prob):
    odds = prob / (1-prob)
    return odds

In [326]:
prestige_prob['Odds(admit|prestige)'] = prestige_prob['P(admit|prestige)'].map(odds_calc)
prestige_prob

Unnamed: 0_level_0,admit,prestige_count,P(admit|prestige),Odds(admit|prestige)
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,33,61,0.540984,1.178571
2.0,53,148,0.358108,0.557895
3.0,28,121,0.231405,0.301075
4.0,12,67,0.179104,0.218182


### Split into predictor data and target variable, dummify the prestige columns and standardize your predictors

In [327]:
from sklearn.preprocessing import StandardScaler

In [328]:
y = admissions.pop('admit')
X = admissions

In [329]:
X_dum = pd.get_dummies(X, columns=['prestige'],drop_first=True)
X_dum.head()

Unnamed: 0,gre,gpa,prestige_2.0,prestige_3.0,prestige_4.0
0,380.0,3.61,0,1,0
1,660.0,3.67,0,1,0
2,800.0,4.0,0,0,0
3,640.0,3.19,0,0,1
4,520.0,2.93,0,0,1


In [330]:
scaler = StandardScaler()

Xs = scaler.fit_transform(X_dum)
Xs = pd.DataFrame(Xs,columns=X_dum.columns)

Xs.head()

Unnamed: 0,gre,gpa,prestige_2.0,prestige_3.0,prestige_4.0
0,-1.798524,0.573457,-0.770959,1.510295,-0.450589
1,0.624209,0.731464,-0.770959,1.510295,-0.450589
2,1.835576,1.600504,-0.770959,-0.662122,-0.450589
3,0.451157,-0.532595,-0.770959,-0.662122,2.219318
4,-0.587158,-1.217294,-0.770959,-0.662122,2.219318


### Take only the dummified prestige columns as predictors and fit a logistic regression model

- How do you interpret the model coefficients in this case?
- What are the predicted probabilities for the different observations? Compare to the admission probabilities for different prestige levels calculated above. 
- Make sure to set the regularization strength to a very large value. 
- Use `solver='newton-cg'`.

In [331]:
prestige = Xs[['prestige_2.0','prestige_3.0','prestige_4.0']]

In [332]:
model = LogisticRegression(C=10**10,solver='newton-cg',max_iter=10000)
model.fit(prestige,y)

LogisticRegression(C=10000000000, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='warn', n_jobs=None,
                   penalty='l2', random_state=None, solver='newton-cg',
                   tol=0.0001, verbose=0, warm_start=False)

In [333]:
print('Model score:',model.score(prestige,y))
print('Model intercept:',model.intercept_)
print('Model coef:',model.coef_)

Model score: 0.6952141057934509
Model intercept: [-0.81510947]
Model coef: [[-0.36164002 -0.62819313 -0.6317558 ]]


In [334]:
model.predict_proba(prestige)

array([[0.76859502, 0.23140498],
       [0.76859502, 0.23140498],
       [0.45901645, 0.54098355],
       [0.8208955 , 0.1791045 ],
       [0.8208955 , 0.1791045 ],
       [0.6418919 , 0.3581081 ],
       [0.45901645, 0.54098355],
       [0.6418919 , 0.3581081 ],
       [0.76859502, 0.23140498],
       [0.6418919 , 0.3581081 ],
       [0.8208955 , 0.1791045 ],
       [0.45901645, 0.54098355],
       [0.45901645, 0.54098355],
       [0.6418919 , 0.3581081 ],
       [0.45901645, 0.54098355],
       [0.76859502, 0.23140498],
       [0.8208955 , 0.1791045 ],
       [0.76859502, 0.23140498],
       [0.6418919 , 0.3581081 ],
       [0.45901645, 0.54098355],
       [0.76859502, 0.23140498],
       [0.6418919 , 0.3581081 ],
       [0.8208955 , 0.1791045 ],
       [0.8208955 , 0.1791045 ],
       [0.6418919 , 0.3581081 ],
       [0.45901645, 0.54098355],
       [0.45901645, 0.54098355],
       [0.8208955 , 0.1791045 ],
       [0.6418919 , 0.3581081 ],
       [0.45901645, 0.54098355],
       [0.

In [335]:
model.classes_

array([0, 1])

In [336]:
model.predict_proba(prestige)[:,1]     # this will give just the Probs for Class = 1

array([0.23140498, 0.23140498, 0.54098355, 0.1791045 , 0.1791045 ,
       0.3581081 , 0.54098355, 0.3581081 , 0.23140498, 0.3581081 ,
       0.1791045 , 0.54098355, 0.54098355, 0.3581081 , 0.54098355,
       0.23140498, 0.1791045 , 0.23140498, 0.3581081 , 0.54098355,
       0.23140498, 0.3581081 , 0.1791045 , 0.1791045 , 0.3581081 ,
       0.54098355, 0.54098355, 0.1791045 , 0.3581081 , 0.54098355,
       0.1791045 , 0.23140498, 0.23140498, 0.23140498, 0.54098355,
       0.3581081 , 0.54098355, 0.23140498, 0.3581081 , 0.23140498,
       0.3581081 , 0.3581081 , 0.3581081 , 0.23140498, 0.3581081 ,
       0.23140498, 0.3581081 , 0.1791045 , 0.1791045 , 0.23140498,
       0.23140498, 0.1791045 , 0.1791045 , 0.3581081 , 0.23140498,
       0.23140498, 0.23140498, 0.23140498, 0.3581081 , 0.1791045 ,
       0.3581081 , 0.1791045 , 0.23140498, 0.23140498, 0.23140498,
       0.3581081 , 0.1791045 , 0.54098355, 0.54098355, 0.54098355,
       0.23140498, 0.1791045 , 0.1791045 , 0.3581081 , 0.17910

In [337]:
pd.Series(model.predict_proba(prestige)[:,1]).unique()

array([0.23140498, 0.54098355, 0.1791045 , 0.3581081 ])

In [338]:
pred_probs = pd.concat([X['prestige'],pd.DataFrame(model.predict_proba(prestige)[:,1])],axis=1)
pred_probs.columns = ['prestige','model.predict_proba']
pred_probs

Unnamed: 0,prestige,model.predict_proba
0,3.0,0.231405
1,3.0,0.231405
2,1.0,0.540984
3,4.0,0.179104
4,4.0,0.179104
5,2.0,0.358108
6,1.0,0.540984
7,2.0,0.358108
8,3.0,0.231405
9,2.0,0.358108


In [339]:
prestige_prob = pd.concat([prestige_prob,pred_probs.groupby('prestige').mean()],axis=1)
prestige_prob

Unnamed: 0_level_0,admit,prestige_count,P(admit|prestige),Odds(admit|prestige),model.predict_proba
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,33,61,0.540984,1.178571,0.433526
2.0,53,148,0.358108,0.557895,0.339236
3.0,28,121,0.231405,0.301075,0.272525
4.0,12,67,0.179104,0.218182,0.244416


### Use LogisticRegressionCV to tune your model

Test for both penalties. Which one leads to better results? Compare the model coefficients for the best respective models.

In [340]:
L1_Logistic = LogisticRegressionCV(Cs=100, penalty='l1', solver='liblinear', cv=5)
L1_Logistic.fit(Xs,y)
print(L1_Logistic.C_)
print(L1_Logistic.score(Xs,y))

[4.03701726]
0.707808564231738


In [341]:
L2_Logistic = LogisticRegressionCV(Cs=100, penalty='l2', solver='newton-cg', cv=5)
L2_Logistic.fit(Xs,y)
print(L2_Logistic.C_)
print(L2_Logistic.score(Xs,y))

[1.59228279]
0.707808564231738
