In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Base seed for reproducibility: the cross-validation loop passes
# random_state = k * SEED to train_test_split on iteration k.
SEED = 42

In [13]:
def encode_all_data(train, test):
    """One-hot encode `train` and `test` with a single shared encoder.

    The OneHotEncoder is fitted on the row-wise concatenation of both frames
    and then applied to each frame separately, so both outputs share the same
    encoded column layout.

    Parameters
    ----------
    train : pandas.DataFrame
        Training feature columns (labels already removed).
    test : pandas.DataFrame
        Test feature columns. Assumed to have the same columns as `train`
        -- TODO confirm at the call site.

    Returns
    -------
    tuple
        `(train_encoded, test_encoded)` exactly as returned by
        `OneHotEncoder.transform` for each input.
    """
    enc = OneHotEncoder()
    # pd.concat stacks the rows just like the removed DataFrame.append did,
    # and it works on both old and current pandas releases.
    enc.fit(pd.concat([train, test]))

    return enc.transform(train), enc.transform(test)

In [14]:
# Directory that holds the input CSVs and receives the submission files.
DATA_DIR = '/Users/shammakabir/EE379K/Lab8/data'
# Number of repeated random hold-out splits used to estimate accuracy.
N_SPLITS = 10

train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

# Separate the target from the features and drop the two role-rollup columns
# from both frames so they are encoded with an identical set of columns.
train_features = train.drop(['ACTION', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2'], axis=1)
train_labels = train['ACTION']
test_trimmed = test.drop(['id', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2'], axis=1)

train_feats_encoded, test_encoded = encode_all_data(train_features, test_trimmed)

# Repeat a random 80/20 train/hold-out split N_SPLITS times and average the
# hold-out score of each model.
# NOTE: despite the `*_err` names, `score` returns mean accuracy (higher is
# better), not an error rate. The names are kept so later cells still work.
mean_err = 0.
mean_err_xgb = 0.
for k in range(N_SPLITS):
    # A different but reproducible split on every iteration.
    x_train, x_cv, y_train, y_cv = train_test_split(
        train_feats_encoded, train_labels, test_size=0.2, random_state=k * SEED)

    # Create and fit the Logistic Regression model.
    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)

    # Create and fit the XGBoost model.
    xgb_model = xgb.XGBClassifier(max_depth=10, n_estimators=1000, learning_rate=0.15,
                                  objective='binary:logistic').fit(x_train, y_train)

    # Hard class predictions on the hold-out set; not used for the score
    # below, but left available for inspection.
    preds = log_reg.predict(x_cv)
    preds_xgb = xgb_model.predict(x_cv)

    # Hold-out accuracy for this split.
    fold_err = log_reg.score(x_cv, y_cv)
    fold_err_xgb = xgb_model.score(x_cv, y_cv)

    mean_err += fold_err
    mean_err_xgb += fold_err_xgb

mean_err /= N_SPLITS
mean_err_xgb /= N_SPLITS
# Formatted so the output is the same text under Python 2 and Python 3.
print('%s %s' % (mean_err, mean_err_xgb))


# Retrain both models on the whole training set before making final predictions.
log_reg.fit(train_feats_encoded, train_labels)
xgb_model.fit(train_feats_encoded, train_labels)

# Predict the probability of the positive class (column 1) on the test set.
test_preds = log_reg.predict_proba(test_encoded)[:,1]
test_preds_xgb = xgb_model.predict_proba(test_encoded)[:,1]

# Save one submission file per model.
submission = pd.DataFrame({"Id":test.id, "Action":test_preds})
submission.to_csv(DATA_DIR + '/submission.csv', index=False)

submission_xgb = pd.DataFrame({"Id":test.id, "Action":test_preds_xgb})
# Fix: write the XGBoost frame here. The original wrote `submission` (the
# logistic-regression predictions) into submission_xgb.csv.
submission_xgb.to_csv(DATA_DIR + '/submission_xgb.csv', index=False)

0.946963686298 0.949664327128
