#### Load libraries

In [55]:
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import multiprocessing

# CPU count reported by multiprocessing; used as the n_jobs value further down.
CPUS = multiprocessing.cpu_count()

In [56]:
# Echo the detected CPU count as the cell output.
CPUS

32

#### Load data

In [57]:
# Input locations. DIR is prepended verbatim (plain string concatenation)
# to each of the relative file names below.
DIR = "/"
FTRAIN = "Data/act_train.csv.gz"            # read into train_raw
FTEST = "Data/act_test.csv.gz"              # read into test_raw
FPEOPLE = "Data/people.csv.gz"              # read into people
FSAMPLE = "Data/sample_submission.csv.gz"

In [58]:
# Read the three input tables, in order: train, test, people.
# Each path is DIR concatenated with the corresponding file name.
train_raw, test_raw, people = (
    pd.read_csv(DIR + fname) for fname in (FTRAIN, FTEST, FPEOPLE)
)

In [59]:
# Split the 'outcome' column off as the target Y and drop it from train_raw.
Y = train_raw.pop('outcome')

In [None]:
# Empirical class frequencies of the target, used as a constant baseline.
# value_counts() orders its result by frequency, so sort by label to make
# position k of p correspond to class k (later cells use p positionally).
p = (Y.value_counts() * 1.0 / train_raw.shape[0]).sort_index()

In [None]:
# Baseline "model": every row receives the same two-class probability vector p.
N = train_raw.shape[0]
predictions = np.zeros((N, 2))
# Broadcast p over all rows in one assignment instead of a Python-level loop.
predictions[:] = np.asarray(p)

In [None]:
# Accuracy of the baseline: predicted class = column with the larger probability.
accuracy_score(Y, predictions.argmax(axis=1))

In [None]:
# Log-loss of the constant baseline probabilities against the true labels.
log_loss(Y, predictions)

In [None]:
# AUC of the baseline. The target is expanded to a two-column indicator
# matrix (is-class-0, is-class-1) so it lines up with the two probability columns.
roc_auc_score(np.column_stack((Y == 0, Y == 1)), predictions)

#### Feature updates for sparse matrix

In [None]:
# Replace people['char_38'] by its percentile rank within the column,
# rounded to the nearest 10.
# Import the submodule explicitly: `import scipy as sp` alone does not
# guarantee that `sp.stats` has been loaded.
from scipy import stats

percentile_ref = people['char_38'].values


def compare_percentile(x):
    """Return the 'weak' percentile rank of x in percentile_ref, rounded to the nearest 10."""
    return round(stats.percentileofscore(percentile_ref, x, kind='weak'), -1)


# percentileofscore scans the whole reference array on every call, so score
# each distinct value once and map the result back onto the column.
# NOTE(review): assumes char_38 has no missing values - confirm.
percentile_by_value = {v: compare_percentile(v) for v in people['char_38'].unique()}
people['char_38'] = people['char_38'].map(percentile_by_value)

In [None]:
def format_col(x, p):
    """Return value x tagged with column name p, as the string '<p>_<x>'.

    String values (including str subclasses) have their spaces replaced by
    underscores; any other value is converted with str().
    """
    # isinstance, not type(x) == str, so str subclasses are also normalised.
    text = x.replace(' ', '_') if isinstance(x, str) else str(x)
    return p + '_' + text

def update_cols(df, c=1):
    """Expand the 'date' column and tag values with their column name.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column that pd.to_datetime can parse.
    c : int
        Number of leading columns (after 'date' has been replaced) that are
        left untouched; every column from position c onwards is rewritten.

    Returns
    -------
    pandas.DataFrame
        The same object as df: 'date' is removed, 'day', 'year' and 'month'
        are appended, and each value in the rewritten columns becomes
        format_col(value, column_name).
    """
    # fix date columns
    dates = pd.to_datetime(df['date'])
    df['day'] = dates.dt.day
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    del df['date']

    # include column name with value
    for p in list(df.columns)[c:]:
        # Assign the column as a whole (not through .loc[:, p]) because the
        # values change type to str; an in-place .loc set would try to keep
        # the old dtype.
        df[p] = df[p].apply(format_col, args=(p,))

    return df

In [None]:
# Rewrite both tables: people keeps its first column untouched,
# train_raw keeps its first two.
people = update_cols(people, c=1)
train_raw = update_cols(train_raw, c=2)

In [None]:
# Re-key both frames by 'people_id': the column becomes the index and is
# removed from the regular columns.
train_raw = train_raw.set_index('people_id')
people = people.set_index('people_id')

#### Transform data to sparse OHE matrix

In [None]:
# Attach the people attributes to every activity row (join on the index),
# then key the result by 'activity_id'.
# NOTE(review): if the two frames share column names, merge only suffixes
# the column names; the cell values still carry the unsuffixed column prefix
# and would end up sharing one-hot columns - confirm whether that is intended.
train_full = train_raw.merge(people, left_index=True, right_index=True)
train_full = train_full.set_index('activity_id')

In [None]:
# Memory reported by pandas for the merged frame, converted from bytes to GiB.
train_full.memory_usage().sum() / float(1024 ** 3)

In [None]:
# One-hot vocabulary: every distinct cell value of train_full gets its own
# column index, in the sorted order returned by np.unique.
# .values replaces DataFrame.as_matrix(), which newer pandas no longer has;
# np.unique flattens its input, so no reshape/ravel is needed.
all_values = np.unique(train_full.values)
ohe_dict = {value: idx for idx, value in enumerate(all_values)}

In [None]:
# Build the (row, col, value) triplets of the one-hot matrix: each cell of
# train_full contributes a 1 at (its row number, ohe_dict[cell value]).
# Arrays are filled directly instead of growing Python lists through a
# per-row .iloc loop; the triplet order (row by row, columns left to right)
# is unchanged.
cell_values = train_full.values
n_rows, n_cols = cell_values.shape
cols = np.fromiter((ohe_dict[v] for v in cell_values.ravel()),
                   dtype=np.int64, count=cell_values.size)
rows = np.repeat(np.arange(n_rows, dtype=np.int64), n_cols)
data = np.ones(cell_values.size, dtype=np.int64)
# Drop the temporaries so they do not keep the cell data alive.
del cell_values, n_rows, n_cols

In [None]:
# Assemble the sparse one-hot matrix from the triplets:
# one row per row of train_full, one column per vocabulary entry.
train_full_sp = csr_matrix(
    (np.array(data), (np.array(rows), np.array(cols))),
    shape=(len(train_full), len(ohe_dict))
)

In [None]:
# Release the intermediates; only the sparse matrix and Y are used from here on.
del rows, cols, data, train_full, ohe_dict, all_values, train_raw, people

#### Split train and validation

In [None]:
# Hold out 33% of the rows for validation; the seed is fixed for repeatability.
# NOTE(review): Y was taken from train_raw before the merge with people;
# confirm the merge kept the rows in that same order, otherwise labels and
# feature rows are misaligned.
X_train, X_test, y_train, y_test = train_test_split(
    train_full_sp, Y, test_size=0.33, random_state=2)

#### Test logistic regression models

In [None]:
# Sweep the LogisticRegression C parameter and record hold-out metrics per value.
cvalues = [1.0, 1e2, 1e3, 1e4]
results = []
for c in cvalues:
    lr = LogisticRegression(C=c, max_iter=500, tol=1e-5, n_jobs=CPUS)
    lr.fit(X_train, y_train)
    predictions = lr.predict_proba(X_test)
    ll = log_loss(y_test, predictions)
    a = accuracy_score(y_test, np.argmax(predictions, axis=1))
    # Two-column indicator target so it lines up with the two probability columns.
    auc = roc_auc_score(np.c_[y_test == 0, y_test == 1], predictions)
    r = {'c': c, 'logloss': ll, 'accuracy': a, 'AUC': auc}
    # A single parenthesised argument prints the same whether print is a
    # statement or a function.
    print('C:{c} Log-Loss:{logloss:.7f} Accuracy:{accuracy:.7f} AUC:{AUC:.7f}\n    '.format(**r))
    # Store this run's metrics; append() without an argument raises TypeError.
    results.append(r)

In [None]:
# Show the collected per-C metrics as a table (one row per entry in results).
pd.DataFrame(results)

#### Test SVM

In [None]:
# Fit a support vector classifier with default settings on the training split.
# SVC has no n_jobs parameter; passing one makes the constructor raise TypeError.
svm = SVC(C=1.0)
svm.fit(X_train, y_train)

In [None]:
# Score the SVM on the hold-out split using its hard class predictions.
predictions = svm.predict(X_test)
a = accuracy_score(y_test, predictions)
# NOTE(review): log_loss is fed class labels here, not probabilities -
# confirm this is the intended metric.
ll = log_loss(y_test, predictions)
print('Accuracy: {}%, Log-loss: {}'.format(round(a * 100, 2), round(ll, 4)))