In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

try:
    # scikit-learn >= 0.18 exposes the split / cross-validation helpers here;
    # the old ``cross_validation`` module was removed in 0.20.
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Fallback for legacy scikit-learn installs that predate model_selection.
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
# Load the "fair" dataset bundled with statsmodels as a pandas DataFrame.
dta = sm.datasets.fair.load_pandas().data

# Binary target column: 1 when the `affairs` measure is positive, 0 otherwise.
dta['affair'] = dta['affairs'].gt(0).astype(int)

# Model formula: `occupation` and `occupation_husb` are wrapped in C(...) so
# they are dummy-encoded; an intercept column is added by the formula engine.
formula = (
    'affair ~ rate_marriage + age + yrs_married + children + '
    'religious + educ + C(occupation) + C(occupation_husb)'
)
y, X = dmatrices(formula, data=dta, return_type='dataframe')

# Replace the generated dummy-column labels (e.g. 'C(occupation)[T.2.0]')
# with short names (e.g. 'occ_2') for levels 2 through 6 of both factors.
dummy_renames = {
    'C({0})[T.{1}.0]'.format(factor, level): '{0}_{1}'.format(short, level)
    for factor, short in (('occupation', 'occ'), ('occupation_husb', 'occ_husb'))
    for level in range(2, 7)
}
X = X.rename(columns=dummy_renames)

# Collapse the single-column response frame into a flat 1-D array.
y = np.asarray(y).ravel()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a default logistic-regression classifier on the training portion
# (fit() returns the estimator itself, so construction and fitting chain).
regression = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_prd = regression.predict(X_test)

# Accuracy on the held-out rows: fraction of predictions matching y_test.
acc = metrics.accuracy_score(y_test, y_prd)

split_report = "We are able to calculate the affair with {:.5}% accuracy by doing simple split of data set."
print(split_report.format(acc * 100))

# Repeat the evaluation with 15-fold cross validation over the full data set,
# scoring each fold by accuracy with a fresh default logistic regression.
crs_vl = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring='accuracy',
    cv=15,
)

# Report the average of the per-fold accuracies as a percentage.
cv_report = "We are able to calculate the affair with {:.5}% accuracy by doing cross validation with 15 fold of data set."
print(cv_report.format(crs_vl.mean() * 100))



We are able to calculate the affair with 74.254% accuracy by doing simple split of data set.
We are able to calculate the affair with 72.371% accuracy by doing cross validation with 15 fold of data set.
