In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np

# UCLA Admissions 
#### Data Source: UCLA's Logistic Regression in R tutorial <br/>
http://www.ats.ucla.edu/stat/r/dae/logit.htm

### The Data

Variable | Summary | Description | Type of Variable | Variable
---| ---| --- | ---
admit | admitted to UCLA or not | 1 admitted, 0 not admitted | binary | $Y$, Predictor, Target, Response, Dependent Variable
GRE | Graduate Record Examinations - standarized test |integers - range from 200 - 800 | discrete* | $X_0$, Predictors, Features, Independent Variable
GPA | Grade Point Average - summation rank of course grades  |floats with precision to the hundredths - range from 0.00 to 4.00 | continous | $X_1$, Predictors, Features, Independent Variable
prestige | rank of undergraduates' university | integers 4 to 1 (highest)| ordinal | $X_2$, $X_3$, $X_4$ Predictors, Features, Independent Variable

*although GRE score is technically discrete it will be treated as continuous

coefficients of model out put are the predictors <br/>
$y^*$ = $B_0$ + $B_1*GRE$ + $B_2*GPA$ + $B_3*prestige$ <br/>

then for logistic regression log of the coefficients <br/>
$p$ = $e^y$ / $e^y$ + 1

then 

In [120]:
df_raw = pd.read_csv("../assets/admissions.csv")
df = df_raw.dropna() 
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.0,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0


# Logistic Regression

#### Logistic Regression is used for making binary predictions 0 or 1, successs or fail, on or off. <br/>

### For prestige - make dummy variables 

Two way Frequency Table on Prestige and Admit

In [121]:
admit_prestige = pd.crosstab(index = df.admit, columns = df.prestige, margins = True)

admit_prestige.index = ["rejected", "accepted", "Total"]

admit_prestige

prestige,1.0,2.0,3.0,4.0,All
rejected,28,95,93,55,271
accepted,33,53,28,12,126
Total,61,148,121,67,397


## Dummy Variables

In [122]:
# use pandas api to create dummy variables
dummies = pd.get_dummies(df['prestige'], prefix = 'prestige')
df = df.join(dummies)
df.head()

Unnamed: 0,admit,gre,gpa,prestige,prestige_1.0,prestige_2.0,prestige_3.0,prestige_4.0
0,0,380.0,3.61,3.0,0.0,0.0,1.0,0.0
1,1,660.0,3.67,3.0,0.0,0.0,1.0,0.0
2,1,800.0,4.0,1.0,1.0,0.0,0.0,0.0
3,1,640.0,3.19,4.0,0.0,0.0,0.0,1.0
4,0,520.0,2.93,4.0,0.0,0.0,0.0,1.0


In [123]:
keepCols = ['admit', 'gre', 'gpa', 'prestige_1.0', 'prestige_2.0', 'prestige_3.0']
df = df[keepCols]
df.head()

Unnamed: 0,admit,gre,gpa,prestige_1.0,prestige_2.0,prestige_3.0
0,0,380.0,3.61,0.0,0.0,1.0
1,1,660.0,3.67,0.0,0.0,1.0
2,1,800.0,4.0,1.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,0.0
4,0,520.0,2.93,0.0,0.0,0.0


## Logistic Regression with sklearn api

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
lm = LogisticRegression()

In [None]:
# df3.groupby(["prestige","admit"]).size()
# "drop" dummy variable that has the most occurance

In [129]:
X = df[df.columns[1:] ]
y = df.admit

#### LogisticRegressionCV()
`CV` = Cross Validation <br/>
Validating the model by splitting the dataset using K-Folds

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html

In [175]:
# internally preforms k-folds

# cv : integer or cross-validation generator
#     The default cross-validation generator used is Stratified K-Folds.
#     If an integer is provided, then it is the number of folds used.

# cv = 5
modelCVa = LogisticRegressionCV(cv=5)
results5 = modelCVa.fit(X ,y)
results5.coef_

array([[ 0.00222171,  0.70905267,  1.46735347,  0.79753278,  0.15500038]])

In [179]:
CoefCVa = results5.coef_
CoefCV5 = pd.DataFrame(CoefCVa)
CoefCV5

Unnamed: 0,0,1,2,3,4
0,0.002222,0.709053,1.467353,0.797533,0.155


In [191]:
np.exp(CoefCV5)

Unnamed: 0,0,1,2,3,4
0,1.002224,2.032065,4.33774,2.220057,1.167658


In [180]:
# cv = 20
modelCVb = LogisticRegressionCV(cv=20)
results20 = modelCVb.fit(X ,y)
results20.coef_

array([[ 0.00218322,  0.65028041,  1.37995181,  0.72782847,  0.09247397]])

In [182]:
CoefCVb = results20.coef_
CoefCV20 = pd.DataFrame(CoefCVb)
CoefCV20

Unnamed: 0,0,1,2,3,4
0,0.002183,0.65028,1.379952,0.727828,0.092474


In [193]:
np.exp(CoefCV20)

Unnamed: 0,0,1,2,3,4
0,1.002186,1.916078,3.97471,2.070579,1.096885


In [131]:
allPreds = results.predict(X)
print accuracy_score(y,allPreds)

0.707808564232


In [89]:
from sklearn.cross_validation import cross_val_score

In [189]:
scores = cross_val_score(modelCv, X, y, cv=5)

# cv : int, cross-validation generator or an iterable, optional
#     Determines the cross-validation splitting strategy.
#     Possible inputs for cv are:

#     - None, to use the default 3-fold cross-validation, <--- default is 3 - good to know
#     - integer, to specify the number of folds. <----- we are doing 5
#     - An object to be used as a cross-validation generator.
#     - An iterable yielding train/test splits.

print scores
# accuracy score of each k-fold

[ 0.7037037   0.72151899  0.69620253  0.6835443   0.67088608]


#### LogisticRegression() 
Logistic Regression WITHOUT Cross Validation

In [155]:
model = LogisticRegression()
results1 = model.fit(X, y)
results1.coef_


array([[  1.58889206e-03,   1.84630312e-04,   1.16761197e+00,
          5.26947989e-01,  -3.80822680e-02]])

In [186]:
resultsA = results1.coef_
resultsa = pd.DataFrame(resultsA)
resultsa

Unnamed: 0,0,1,2,3,4
0,0.001589,0.000185,1.167612,0.526948,-0.038082


In [165]:
allPreds2 = results1.predict(X)
print accuracy_score(y,allPreds2)

0.700251889169


In [133]:
df["predictionCol"] = allPreds
df.head()

Unnamed: 0,admit,gre,gpa,prestige_1.0,prestige_2.0,prestige_3.0,predictionCol
0,0,380.0,3.61,0.0,0.0,1.0,0
1,1,660.0,3.67,0.0,0.0,1.0,0
2,1,800.0,4.0,1.0,0.0,0.0,1
3,1,640.0,3.19,0.0,0.0,0.0,0
4,0,520.0,2.93,0.0,0.0,0.0,0


In [159]:
lm.fit(X,y)
# Logistic Regression fit on the whole dataset

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [160]:
print lm.coef_
print lm.intercept_
print df.admit.mean()

[[  1.58889206e-03   1.84630312e-04   1.16761197e+00   5.26947989e-01
   -3.80822680e-02]]
[-2.07018745]
0.317380352645


In [145]:
X.head(3)# the predictors

Unnamed: 0,gre,gpa,prestige_1.0,prestige_2.0,prestige_3.0
0,380.0,3.61,0.0,0.0,1.0
1,660.0,3.67,0.0,0.0,1.0
2,800.0,4.0,1.0,0.0,0.0


In [161]:
coef = lm.coef_
Coeff = pd.DataFrame(coef)
Coeff
# output model Coefficients
# predictor coefficients

Unnamed: 0,0,1,2,3,4
0,0.001589,0.000185,1.167612,0.526948,-0.038082


Formula <br/>

In [192]:
exp(Coeff)
# exponent of the Coefficients
# exponent of the coefficients is the odds ratio??
# so column 2 is prestige = 1 there is a 3x increase of odds??
# but for prestige = 4 how do you know the odds increase or decrease since it was dropped

Unnamed: 0,0,1,2,3,4
0,1.00159,1.000185,3.214308,1.693755,0.962634


In [147]:
p = exp(Coeff)/(exp(Coeff) + 1)
p# this is probability from the formula e^y / e^y + 1 where y = B_0 and B_1(prestige1) +B_2(prestige2) etc  ??

Unnamed: 0,0,1,2,3,4
0,0.500397,0.500046,0.762713,0.628771,0.490481


In [149]:
np.log(p)# then this right?

Unnamed: 0,0,1,2,3,4
0,-0.692353,-0.693055,-0.270873,-0.463988,-0.71237


Confusion Matrix <br/>
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py <br/>

In [150]:
# confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)

## coefficients and exp(coef)

predictors | sklearn coef | sklearn exp | sklearnCV=5 coef | sklearnCV=5 exp| sklearnCV=20 coef | sklearnCV=20 exp | Statsmodel coef | statsmodel exp | R tutorial coef | R tutorial exp
--- | --- | --- | --- | --- | --- | --- | --- | ---  |
gre        | 0.001589 | 1.00159  | 0.002222 | 1.002224 | 0.002183 |	1.002186 | 0.0022  |1.002221 | 0.0026  | 1.0023
gpa        | 0.000185 | 1.000185 | 0.709053 | 2.032065 | 0.65028  | 1.916078 | 0.7793  |2.180027 | 0.80404 | 2.2345
prestige=1 | 1.167612 | 3.214308 | 1.467353 | 4.33774  | 1.379952 | 3.97471  | -       |-        | -       | -
prestige=2 | 0.526948 | 1.693755 | 0.797533 | 2.220057 | 0.727828 | 2.070579 | -0.6801 |0.506548 | -0.67544| 0.5089
prestige=3 | -0.038082| 0.962634 | 0.155000 | 1.167658 | 0.092474 | 1.096885 | -1.3387 |0.262192 | -1.34020| 0.2618
prestige=4 | -        | -        |          |          |          |          | -1.5534 |0.211525 | -1.55146| 0.0185
intercept  | -2.070187|          |          |          |          |          | -3.8769 |0.020716 | -3.98998| 0.0185
		

In [195]:
>>>>

SyntaxError: invalid syntax (<ipython-input-195-d8ab50460a4a>, line 1)