In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score



In [2]:
# load dataset
data = sm.datasets.fair.load_pandas().data
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [3]:
# add "affair" column: 1 represents having affairs, 0 represents not
data['affair'] = (data.affairs > 0).astype(int)

In [4]:
data['affair'].describe()

count    6366.000000
mean        0.322495
std         0.467468
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: affair, dtype: float64

In [5]:
# create dataframes with an intercept column and dummy variables for
# occupation and occupation_husb
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
                  religious + educ + C(occupation) + C(occupation_husb)',
                  data, return_type="dataframe")

In [6]:
# The column names for the dummy variables are ugly, so let's rename those.
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
                        'C(occupation)[T.3.0]':'occ_3',
                        'C(occupation)[T.4.0]':'occ_4',
                        'C(occupation)[T.5.0]':'occ_5',
                        'C(occupation)[T.6.0]':'occ_6',
                        'C(occupation_husb)[T.2.0]':'occ_husb_2',
                        'C(occupation_husb)[T.3.0]':'occ_husb_3',
                        'C(occupation_husb)[T.4.0]':'occ_husb_4',
                        'C(occupation_husb)[T.5.0]':'occ_husb_5',
                        'C(occupation_husb)[T.6.0]':'occ_husb_6'})

In [16]:
# flatten y into a 1-D array
# y = np.ravel(y)

In [7]:
# split the dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
# training the algorithm
model = LogisticRegression()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
model.predict(sample.reshape(1, -1))

  """Entry point for launching an IPython kernel.


array([ 0.])

In [30]:
test_predictions = model.predict(X_test)

In [31]:
metrics.accuracy_score(test_predictions, y_test)

0.72303664921465971

In [32]:
metrics.confusion_matrix(test_predictions, y_test)

array([[1154,  409],
       [ 120,  227]])

In [None]:
# Further reading
# http://nbviewer.jupyter.org/gist/justmarkham/6d5c061ca5aee67c4316471f8c2ae976
# http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/

# http://www.dataschool.io/machine-learning-with-scikit-learn/