In [107]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split

In [108]:
people = pd.read_csv("data/people.csv", parse_dates=['date'])
train = pd.read_csv("data/act_train.csv", parse_dates=['date'])
test = pd.read_csv("data/act_test.csv", parse_dates=['date'])

In [109]:
people['year'] = people['date'].dt.year
people['month'] = people['date'].dt.month
people['day'] = people['date'].dt.day
people.drop('date', axis=1, inplace=True)

train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train.drop('date', axis=1, inplace=True)

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test.drop('date', axis=1, inplace=True)

train = pd.merge(train, people, how='left', on='people_id', left_index=True)
test = pd.merge(test, people, how='left', on='people_id', left_index=True)
del people

In [110]:
target = 'outcome'
features = list(train.columns)
for var in ['activity_id', 'outcome', 'people_id']:
    features.remove(var)

In [125]:
#Leave One Out encoding of categorical variables
def LOOEncode(train, col_name, test=None, training=True):
    avg_target_by_col_group = train[[col_name, target]].groupby(col_name).mean().reset_index()
    avg_target_by_col_group.columns = [col_name, 'avg_target']
    outcomes = train[target].values
    if training:
        x = pd.merge(train[[col_name, target]], avg_target_by_col_group, 
                     how='left', 
                     on=col_name,
                     left_index=True)['avg_target']
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
    else:
        x = pd.merge(pd.DataFrame(test[col_name]), avg_target_by_col_group, 
                     how='left',
                     on=col_name,
                     left_index=True)['avg_target']    
    return x.fillna(x.mean())

In [138]:
train1, valid1 = train_test_split(train, test_size=0.3)

loo_train = pd.DataFrame({'outcome': train1['outcome']})
loo_valid = pd.DataFrame({'outcome': valid1['outcome']})
loo_test = pd.DataFrame()
for col in ['group_1', 'char_2_y', 'char_38']:
    if(col not in ['people_id', 'outcome']):
        loo_train[col] = LOOEncode(train1, col).values
        loo_valid[col] = LOOEncode(train1, col, test=valid1, training=False).values
        loo_test[col] = LOOEncode(train1, col, test=test, training=False).values

In [140]:
lr = LogisticRegression(C=100000.0)
lr.fit(loo_train[['group_1', 'char_2_y', 'char_38']], loo_train[target])

pred_train = lr.predict_proba(loo_train[['group_1', 'char_2_y', 'char_38']])[:,1]
pred_valid = lr.predict_proba(loo_valid[['group_1', 'char_2_y', 'char_38']])[:,1]

auc_train = roc_auc_score(loo_train[target], pred_train)
auc_valid = roc_auc_score(loo_valid[target], pred_valid)

print(auc_train, auc_valid)


0.997260954261 0.996661079501


In [141]:
result = lr.predict_proba(loo_test[['group_1', 'char_2_y', 'char_38']])[:,1]

submission = pd.DataFrame({'activity_id': test['activity_id'].values,
                           'outcome': result})

submission.to_csv('result.csv', index=False)