In [116]:
import pandas as pd
import statsmodels as sm
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from statsmodels.datasets import fair

try:
    # scikit-learn >= 0.18 keeps the splitting / cross-validation helpers in
    # model_selection; the alias keeps the legacy `cross_validation` name usable.
    from sklearn import model_selection as cross_validation
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Older scikit-learn releases only ship the (since removed) cross_validation module.
    from sklearn import cross_validation
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score

In [117]:
# Load the "fair" (extramarital affairs) survey dataset bundled with statsmodels.
# The dataset module is imported explicitly in the imports cell because a bare
# `import statsmodels as sm` is not guaranteed to load the `datasets` subpackage,
# which makes `sm.datasets` fail on a fresh kernel.
affair_data = fair.load_pandas().data
affair_data.head(5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [118]:
# Separate the features from the target.
# The target is binary: 1 when the `affairs` measure is > 0, otherwise 0.
# `axis` is passed by keyword: newer pandas releases reject a positional axis.
X_data = affair_data.drop(['affairs'], axis=1)
labels = (affair_data.affairs > 0).astype(int)
labels.value_counts()

0    4313
1    2053
Name: affairs, dtype: int64

In [119]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    labels,
    test_size=0.3,
    random_state=0,
)
X_train.head(5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb
2411,5.0,27.0,6.0,1.0,2.0,16.0,2.0,6.0
4083,4.0,42.0,23.0,4.0,3.0,14.0,3.0,5.0
3196,5.0,37.0,23.0,4.0,2.0,14.0,3.0,5.0
3035,5.0,22.0,2.5,0.0,2.0,12.0,3.0,4.0
1772,3.0,22.0,2.5,0.0,3.0,14.0,5.0,4.0


In [120]:
def encode_onehot(df, cols):
    """
    One-hot encode the given columns of a pandas DataFrame.

    Each listed column is cast to ``str`` and expanded by a ``DictVectorizer``
    into one indicator column per distinct value, named ``<column>=<value>``.
    The listed columns are dropped from the result and the indicator columns
    are appended. The vectorizer is fitted on ``df`` itself, so the generated
    columns reflect only the values present in ``df``.

    Modified from: https://gist.github.com/kljensen/5452382

    Details:

    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a new DataFrame with ``cols`` replaced by their one-hot columns
    """
    vec = DictVectorizer()

    # Cast to str so every value is handled as a category rather than a number.
    records = df[cols].astype(str).to_dict(orient='records')
    encoded = vec.fit_transform(records).toarray()

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out() (added in 1.0); use whichever is available.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    vec_data = pd.DataFrame(encoded)
    vec_data.columns = feature_names
    # Show the generated indicator column names.
    print(vec_data.columns)
    # Reuse the input's index so the join below aligns on the same labels.
    vec_data.index = df.index

    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

In [124]:
# Columns to one-hot encode (each distinct value becomes its own indicator column).
ohe_columns = ['religious', 'educ', 'occupation', 'occupation_husb']

# Only the training split is encoded in this cell.
X_train_ohe = encode_onehot(X_train, ohe_columns)

X_train_ohe.head(5)

Index(['educ=12.0', 'educ=14.0', 'educ=16.0', 'educ=17.0', 'educ=20.0',
       'educ=9.0', 'occupation=1.0', 'occupation=2.0', 'occupation=3.0',
       'occupation=4.0', 'occupation=5.0', 'occupation=6.0',
       'occupation_husb=1.0', 'occupation_husb=2.0', 'occupation_husb=3.0',
       'occupation_husb=4.0', 'occupation_husb=5.0', 'occupation_husb=6.0',
       'religious=1.0', 'religious=2.0', 'religious=3.0', 'religious=4.0'],
      dtype='object')


Unnamed: 0,rate_marriage,age,yrs_married,children,educ=12.0,educ=14.0,educ=16.0,educ=17.0,educ=20.0,educ=9.0,...,occupation_husb=1.0,occupation_husb=2.0,occupation_husb=3.0,occupation_husb=4.0,occupation_husb=5.0,occupation_husb=6.0,religious=1.0,religious=2.0,religious=3.0,religious=4.0
2411,5.0,27.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4083,4.0,42.0,23.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3196,5.0,37.0,23.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3035,5.0,22.0,2.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1772,3.0,22.0,2.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [123]:
# Score a logistic-regression classifier with 5-fold cross-validation on the
# one-hot encoded training split and report the mean fold score.
clf = LogisticRegression()
# Alternative model to try: clf = SVC(kernel='rbf')
scores = cross_val_score(clf, X_train_ohe, y_train, cv=5)
scores.mean()

0.72082081401961828