In [1]:
import pandas as pd
import numpy as np
import sklearn.datasets as datasets
from sklearn.linear_model import LogisticRegression
import patsy

bc = datasets.load_breast_cancer()

X = pd.DataFrame(bc.data, columns=bc.feature_names)
Y = bc.target

### 1. What are logistic regression coefficients, really?

Logistic regression coefficients correspond to log odds. Not very useful.

log odds: $$P(Y = 1) = b_0 + b_1x_1 + ...b_nx_n$$

Like any regression this is a linear combination of our predictors times their coefficients.

Just to refresh you on ordinary least squares regression:

$$E[y] = b_0 + b_1x_1 + ... + b_nx_n$$

In OLS the coefficients are very interpretable. So, for example, if $b_1 = 3.5$, then every unit increase in $x_1$ corresponds to an expected 3.5 unit increase in the mean of y.

Understanding logreg coefs is more of a process...

### 2. Log Odds $\rightarrow$ odds, odds $\rightarrow$ probability

In [2]:
# np.esp is the equivalent of e^ whatever
# this is the inverse function of natural log
def logodds_to_odds(lo):
    return np.exp(lo)

def logodds_to_prob(lo):
    return np.exp(lo) / (1 + np.exp(lo))

# Odds Note:
# an odds ratio of 1:1 --> P = 0.5 (1/2)
# most common appearance is in horse racing
# odds ratio 4:1 -- > P = 0.8 (4/5)
# odds ratio 1:5 --> P = 0.1666 (1/6)

### 3. Intercept in the logistic regression

In [6]:
Xsub = X[['mean symmetry', 'worst radius']]
Xsub = (Xsub - Xsub.mean()) / Xsub.std()
lr = LogisticRegression()
lr.fit(Xsub, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# get the coefs for the predictors
print lr.coef_
print lr.intercept_

[[-1.08904585 -4.69834081]]
[ 0.45678836]


In [17]:
# predicted probabilities for classes at Y index 100
y100_pp = lr.predict_proba(Xsub.ix[100, :])[0]
print np.round(y100_pp*1000)/1000.

[ 0.363  0.637]




In [14]:
# do this manually: 
X100 = Xsub.iloc[100, :].values
print X100

intercept = lr.intercept_
coefs = lr.coef_[0]

[-0.73909882  0.14913597]


In [15]:
X100_lo = lr.intercept_ + X100[0]*coefs[0] + X100[1]*coefs[1]
print X100_lo

[ 0.56100922]


In [16]:
# this is how to manually calculate for the whole row
print logodds_to_prob(X100_lo[0])

0.636686021862


In [19]:
# actual probability of 1 vs 0, probability of having cancer in our sample:
print np.mean(Y)

0.627416520211


### 4. Interpret individual coefficients - their impact on the probability!

In [18]:
intercept_p = logodds_to_prob(intercept)
print intercept_p

[ 0.61225201]


In [20]:
# what if mean symmetry was 1 standard deviation higher than the mean?
# but worst radius for the person is the mean
print coefs

[-1.08904585 -4.69834081]


In [23]:
prob_1std_ms = logodds_to_prob(intercept + coefs[0]*1 + coefs[1]*0)
print prob_1std_ms

[ 0.34699883]


In [24]:
# logodds of different values turned into probabilities
print 'probability of logodds 1:', logodds_to_prob(1)

# coefficient if 0 indicates equal odds for that predictor - no effect on the probability
# no matter what your predictor value is, it's multiplied by 0, so no effect
print 'probability of logodds 1:', logodds_to_prob(0)

probability of logodds 1: 0.73105857863
probability of logodds 1: 0.5


In [38]:
# ok, what is the effect, the change in odds, of a predictor, based on its coefficients?
# worst radius has a big, negative coefficient
# If I had 1 standard deviation higher worst radius, what is the change in my probability of having cancer?
# Here a 1 is NOT having cancer, a 0 IS having cancer
my_worst_radius_increase = 1
change_prob_having_cancer_1std_worst_radius = (1 - (logodds_to_prob(coefs[1]*my_worst_radius_increase)))
print change_prob_having_cancer_1std_worst_radius

my_mean_symmetry_increase = 1
change_prob_mean_symmetry = (1 - (logodds_to_prob(coefs[0]*my_mean_symmetry_increase)))
print change_prob_mean_symmetry

0.990971869282
0.748202006074


In [35]:
change_prob_having_cancer_1std_worst_radius

-0.49097186928181558