In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
people = pd.read_csv('./data/people.csv')
train = pd.read_csv('./data/act_train.csv')
test = pd.read_csv('./data/act_test.csv')

In [3]:
# Leave one out technique
def LeaveOneOut(data1, data2, columnName, useLOO=False):
    grpOutcomes = data1.groupby(columnName).mean().reset_index()
    outcomes = data2['outcome'].values
    x = pd.merge(data2[[columnName, 'outcome']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['outcome']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
    return x.fillna(x.mean())

In [4]:
train = train[['people_id', 'outcome']]
test = test[['activity_id', 'people_id']]
people = people[['people_id','group_1','char_2','char_38']]

In [5]:
len(train[train.people_id.isnull()]), len(train[train.outcome.isnull()]), len(train), len(test), len(people)

(0, 0, 2197291, 498687, 189118)

In [6]:
train = pd.merge(train, people, how='left', on='people_id', left_index=True)

In [7]:
train.head()

Unnamed: 0,people_id,outcome,group_1,char_2,char_38
0,ppl_100,0,group 17304,type 2,36
0,ppl_100,0,group 17304,type 2,36
0,ppl_100,0,group 17304,type 2,36
0,ppl_100,0,group 17304,type 2,36
0,ppl_100,0,group 17304,type 2,36


In [19]:
len(train)

2197291

In [20]:
lootrain = pd.DataFrame()
for col in train.columns:
    if(col != 'outcome' and col != 'people_id'):
        print(col)
        lootrain[col] = LeaveOneOut(train, train, col, True).values

group_1
char_2
char_38


In [23]:
lootrain.describe()

Unnamed: 0,group_1,char_2,char_38
count,2197291.0,2197291.0,2197291.0
mean,0.443954,0.443954,0.443954
std,0.475832,0.335642,0.356512
min,0.0,0.0,0.0
25%,0.0,0.0,0.027053
50%,0.000224,0.698855,0.63937
75%,1.0,0.698855,0.772505
max,1.0,0.698856,0.857501


In [24]:
lr = LogisticRegression(C=100000.0)
lr.fit(lootrain[['group_1', 'char_2', 'char_38']], train['outcome'])

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [25]:
preds = lr.predict_proba(lootrain[['group_1', 'char_2', 'char_38']])[:, 1]
print('roc', roc_auc_score(train.outcome, preds))

('roc', 0.99723112931524527)


In [30]:
lr.classes_

array([0, 1])

In [31]:
test = pd.merge(test, people, how='left', on='people_id', left_index=True)

In [32]:
test.head()

Unnamed: 0,activity_id,people_id,group_1,char_2,char_38
3,act1_249281,ppl_100004,group 22593,type 3,76
3,act2_230855,ppl_100004,group 22593,type 3,76
5,act1_240724,ppl_10001,group 25417,type 3,90
5,act1_83552,ppl_10001,group 25417,type 3,90
5,act2_1043301,ppl_10001,group 25417,type 3,90


In [34]:
any(test.group_1.isnull()), any(test.char_2.isnull()), any(test.char_38.isnull()), len(test)

(False, False, False, 498687)

In [35]:
activity_id = test.activity_id.values

In [36]:
test.drop('activity_id', inplace=True, axis=1)
test['outcome'] = 0

In [37]:
test.head()

Unnamed: 0,people_id,group_1,char_2,char_38,outcome
3,ppl_100004,group 22593,type 3,76,0
3,ppl_100004,group 22593,type 3,76,0
5,ppl_10001,group 25417,type 3,90,0
5,ppl_10001,group 25417,type 3,90,0
5,ppl_10001,group 25417,type 3,90,0


In [38]:
lootest = pd.DataFrame()
for col in train.columns:
    if(col != 'outcome' and col != 'people_id'):
        print(col)
        lootest[col] = LeaveOneOut(train, test, col, False).values

group_1
char_2
char_38


In [40]:
lootest.head()

Unnamed: 0,group_1,char_2,char_38
0,0,0.698855,0.727121
1,0,0.698855,0.727121
2,1,0.698855,0.796851
3,1,0.698855,0.796851
4,1,0.698855,0.796851


In [42]:
lootest.describe()

Unnamed: 0,group_1,char_2,char_38
count,498687.0,498687.0,498687.0
mean,0.50705,0.465964,0.483574
std,0.448213,0.328566,0.348502
min,0.0,0.0,0.0
25%,0.0,0.0,0.027053
50%,0.50705,0.698855,0.670208
75%,1.0,0.698855,0.775806
max,1.0,0.698855,0.857501


In [43]:
preds = lr.predict_proba(lootest[['group_1', 'char_2', 'char_38']])[:, 1]
submission = pd.DataFrame()
submission['activity_id'] = activity_id
submission['outcome'] = preds
submission.to_csv('submission-09-04.csv', index=False, float_format='%.3f')