## RedHat Kaggle

https://www.kaggle.com/c/predicting-red-hat-business-value

##### Load libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import multiprocessing
from collections import Counter, defaultdict
import hashlib
import re
from sklearn.learning_curve import validation_curve
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline


CPUS = multiprocessing.cpu_count()

In [2]:
CPUS

8

##### Custom Functions

In [3]:
def format_col(x, p):
    if type(x)==str:
        return p+'_'+x.replace(' ','_')
    else:
        return p+'_'+str(x)

def update_cols(df,c=1):
    # fix date columns
    df['date'] = pd.to_datetime(df['date'])
    
    # split date column into several 
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['qtr'] = df['date'].dt.month // 3
    
    # remove date 
    del df['date']
    
    # get list of columns
    cols = list(df.columns)
    
    # include column name with value
    for p in cols[c:]:
        df.loc[:,p] = df.loc[:,p].apply(lambda x: format_col(x, p) )
    
    return df

In [4]:
Brandon C Shurickdef test_scores(y_test, predictions, pr=True):
    ll = log_loss(y_test, predictions)
    a = accuracy_score(y_test, np.argmax(predictions,axis=1))
    auc = roc_auc_score(np.c_[y_test==0, y_test==1], predictions)
    r = {'c':c,'logloss':ll,'accuracy':a,'AUC':auc}
    pstr = '''C:{c} Log-Loss:{logloss:.7f} Accuracy:{accuracy:.7f} AUC:{AUC:.7f}'''
    if pr: print pstr.format(**r)
    return r

In [5]:
def create_sparse_ohe_data(df, ohe_dict):
    rows = []
    cols = []
    data = []
    for i in xrange(df.shape[0]):
        s = map(lambda x: ohe_dict.get(x,-1), df.iloc[i])
        while True:
            if -1 in s:
                indval = s.index(-1)
                del s[indval]
            else:
                break
        rows += [i]*len(s)
        cols += s
        data += [1]*len(s)
    return csr_matrix((np.array(data),
                (np.array(rows),
                 np.array(cols))),
               shape=(df.shape[0],len(ohe_dict)))

In [6]:
def create_sparse_hash_data(df, num_buckets):
    rows = []
    cols = []
    data = []
    for i in xrange(df.shape[0]):
        mapping = {}
        for f in df.iloc[i]:
            mapping[f] = int(int(hashlib.md5(f).hexdigest(), 16) % num_buckets)
        s = defaultdict(float)
        for bucket in mapping.values():
            s[bucket] += 1.0
        rows += [i]*len(s)
        cols += s.keys()
        data += s.values()
    return csr_matrix((np.array(data),
                (np.array(rows),
                 np.array(cols))),
               shape=(df.shape[0],num_buckets))

In [7]:
class ClusterLogisticRegression(LogisticRegression):
    def __init__(self, n_clusters=10, n_jobs=1, C=1.0, models=[], **params):
        self.models = [ LogisticRegression(n_jobs=n_jobs, C=C, **params) 
                           for i in range(n_clusters) ] if not models else models
        self.cluster_model = KMeans(n_clusters=n_clusters, n_jobs=n_jobs)
        self.classes = 2
        self.C = C
        self.n_clusters = n_clusters
        self.n_jobs = n_jobs
    
    def set_params(self, **params):
        for i in range(self.n_clusters):
            self.models[i].set_params(**params)
    
    def fit(self, X, y):
        self.cluster_model.fit(X)
        clusters = self.cluster_model.labels_
        for i in xrange(self.n_clusters):
            self.models[i] = self.models[i].fit(X[clusters==i], 
                                                y[clusters==i])
        return self
    
    def predict_proba(self, X):
        clusters = self.cluster_model.predict(X)
        predictions = np.zeros((X.shape[0], self.classes))
        for i in xrange(self.n_clusters):
            if np.sum(clusters==i)>0:
                predictions[clusters==i,:] = self.models[i].predict_proba(X[clusters==i])[:,:]
        return predictions
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

In [8]:
def lr_validation_curve(X, y, 
                        param_range= np.logspace(-3, 3, 3), 
                        max_iter=100, 
                        tol=1e-4, 
                        cv=5, 
                        n_clusters=4,
                        n_jobs=CPUS):
    lr_model = ClusterLogisticRegression(max_iter=max_iter, tol=tol, n_clusters=n_clusters)
    train_scores, test_scores = validation_curve(lr_model, X, y, 
                                param_name="C", param_range=param_range,
                                cv=cv, scoring="roc_auc", n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with Logistic Regression")
    plt.xlabel("C")
    plt.ylabel("AUC")
    plt.ylim(0.5, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()

##### Load data

In [11]:
DIR = './'
FTRAIN = 'Data/act_train.csv.gz'
FTEST = 'Data/act_test.csv.gz'
FPEOPLE = 'Data/people.csv.gz'
FSAMPLE = 'Data/sample_submission.csv.gz'

In [12]:
train_raw = pd.read_csv(DIR+FTRAIN)
test_raw = pd.read_csv(DIR+FTEST)
people = pd.read_csv(DIR+FPEOPLE)

In [13]:
train_raw.shape

(2197291, 15)

In [14]:
test_raw.shape

(498687, 14)

In [15]:
people.shape

(189118, 41)

#### Set date types

In [16]:
train_raw['date'] = pd.to_datetime(train_raw['date'])
test_raw['date'] = pd.to_datetime(test_raw['date'])

##### Activity history reference file

In [17]:
train_raw['activity_category'].value_counts()

type 2    904683
type 5    490710
type 3    429408
type 4    207465
type 1    157615
type 6      4253
type 7      3157
Name: activity_category, dtype: int64

def gather_activity_history(values):
    for v in values:
        yield train_raw.groupby(level=0)['activity_category'].apply(lambda x: np.max(x==v))

values = set( train_raw['activity_category'].values )
histories = gather_activity_history( values )
activity_history = pd.concat( histories, axis=1 )
activity_history.columns = values

In [18]:
# gather minimum dates for each person-category
r = train_raw[['people_id',
               'date',
               'activity_category']].groupby(['people_id',
                                              'activity_category']
                                            ).apply(lambda x: np.min(x.date))
r = r.reset_index()
r.columns = ['people_id','activity_category','min_date']

# merge with df data
t = train_raw.loc[:,['people_id','activity_category','date']]
tm = pd.merge(t, r, on=['people_id','activity_category'], how='outer')

# create column for each category type 
def gather_history_cols(values):
    for v in sorted(values):
        check_func = lambda x: np.max((x.activity_category==v)&(x.date>=x.min_date))
        yield tm.groupby(['people_id','date']).apply(check_func)
values = set( train_raw.activity_category.values )
activity_history = pd.concat(gather_history_cols(values), axis=1)*1
activity_history.columns = map(lambda x: 'h'+str(x),range(len(list(values))))
activity_history = activity_history.reset_index()

##### Date probability reference file

In [19]:
date_probs = train_raw.groupby('date')['outcome'].apply(np.mean)
date_probs.columns = ['date_prob']

##### Outcome variable

In [20]:
Y = train_raw['outcome']
del train_raw['outcome']

##### Baseline model

In [21]:
p = Y.value_counts()*1.0 / train_raw.shape[0]

In [22]:
N = train_raw.shape[0]
predictions = np.zeros((N,2))
for i in xrange(N):
    predictions[i,:] = p

In [23]:
accuracy_score(Y, np.argmax(predictions,axis=1))

0.55604560342712916

In [24]:
log_loss(Y, predictions)

0.68685173924161147

In [25]:
roc_auc_score(np.c_[Y==0,Y==1], predictions)

0.5

##### Make numeric features categorical

In [26]:
percentile_ref = people['char_38'].ravel()
compare_percentile = lambda x: round(sp.stats.percentileofscore(percentile_ref, x, kind='weak'),-1)
people['char_38'] = people['char_38'].apply(compare_percentile)

##### Add date probability columns

In [27]:
percentile_ref = date_probs.ravel()
compare_percentile = lambda x: round(sp.stats.percentileofscore(percentile_ref, x, kind='weak'),-1)
date_probs_df = pd.DataFrame()
date_probs_df['date_prob_pctl'] = date_probs.apply(compare_percentile)
date_probs_df = date_probs_df.reset_index()
date_probs_df['date'] = pd.to_datetime(date_probs_df['date'])

In [28]:
def add_date_probs(df, date_probs):
    df = pd.merge(df, date_probs, 
                  how='left', on='date')
    return df

train_raw = add_date_probs(train_raw, date_probs_df)
test_raw = add_date_probs(test_raw, date_probs_df)

##### Add activity history columns

In [29]:
def add_history_cols(df, activity_history):
    df = pd.merge(df, activity_history,
                 how='left', on=['people_id','date'])
    return df

train_raw = add_history_cols(train_raw, activity_history)
test_raw = add_history_cols(test_raw, activity_history)

#### Reindex and clear memory

In [30]:
train_raw.index = train_raw['people_id']
test_raw.index = test_raw['people_id']
people.index = people['people_id']
del train_raw['people_id']
del test_raw['people_id']
del people['people_id']

##### Feature engineering

In [31]:
# updated people cols
people = update_cols(people, 0)

# train features
train_raw = update_cols(train_raw, 1)

# test features
test_raw = update_cols(test_raw, 1)

##### Transform data to sparse OHE matrix

In [32]:
train_full = pd.merge(train_raw, people, left_index=True, right_index=True)
test_full = pd.merge(test_raw, people, left_index=True, right_index=True)
train_full.index = train_full['activity_id']
test_full.index = test_full['activity_id']
del train_full['activity_id']
del test_full['activity_id']

In [33]:
sum(train_full.memory_usage())*1.0/1024**3

1.1296054646372795

In [34]:
sum(test_full.memory_usage())*1.0/1024**3

0.25637003034353256

##### Create OHE dictionary 

In [35]:
all_values = np.unique(train_full.as_matrix().reshape(1,-1).ravel())
ohe_dict= { k: v for k,v in zip(all_values, xrange(all_values.shape[0])) }

In [36]:
len(ohe_dict)

36809

##### Create sparse matrices for test and training datasets

In [37]:
X = create_sparse_ohe_data(train_full, ohe_dict)

In [38]:
X_test = create_sparse_ohe_data(test_full, ohe_dict)

In [39]:
IDs = np.array(test_full.index)

In [40]:
X.shape

(2197291, 36809)

##### Clear unused memory

In [41]:
del train_full 
del train_raw
del test_full
del test_raw
del people 
del ohe_dict
del all_values

##### Split train and validation

X_train, X_val, y_train, y_val = train_test_split(X, 
                                                  Y, 
                                                  test_size=0.20, 
                                                  random_state=2)

### Logistic Regression

#### Check model parameter tuning
Using a sample of data to limit runtime.  
Assumes that sampled validation curve will match full dataset.

sample = np.random.choice(xrange(X.shape[0]),size=100000)
lr_validation_curve(X[sample,:], 
                    Y.iloc[sample], 
                    param_range= np.logspace(-3, 5, 6), 
                    n_clusters=6,
                    n_jobs=6)

#### Run full model

In [42]:
N = X.shape[0]
e = N/10*8
# X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2)
X_train, X_val, y_train, y_val = X[:e,:], X[e:,:], Y[:e], Y[e:]

In [43]:
c = 1e2
lr = LogisticRegression(C=c, max_iter=200, tol=1e-4, n_jobs=CPUS)
lr.fit(X_train, y_train)
predictions = lr.predict_proba(X_val)

In [44]:
_ = test_scores(y_val, predictions)

C:100.0 Log-Loss:0.2023555 Accuracy:0.9089653 AUC:0.9770715


<table align="left">
<tr>
    <th>Iter</th>
    <th>C</th>
    <th>Clusters</th>
    <th>Tol</th>
    <th>+Features</th>
    <th>AUC</th>
    <th>Log Loss</th>
    <th>Accuracy</th>
</tr>
<tr>
    <td>200</td>
    <td>100</td>
    <td>1</td>
    <td>1e-5</td>
    <td></td>
    <td>0.9767696</td>
    <td>0.2064201</td>
    <td>0.9075454</td>
</tr>
<tr>
    <td>200</td>
    <td>100</td>
    <td>1</td>
    <td>1e-5</td>
    <td>+Activity history</td>
    <td>0.9767806</td>
    <td>0.2046148</td>
    <td>0.9075249</td>
</tr>
<tr>
    <td>200</td>
    <td>100</td>
    <td>1</td>
    <td>1e-5</td>
    <td>+Date Prob</td>
    <td>0.9768128</td>
    <td>0.2063661</td>
    <td>0.9080028</td>
</tr>
<tr>
    <td>300</td>
    <td>100</td>
    <td>1</td>
    <td>1e-5</td>
    <td>+More accurate history</td>
    <td>0.9768163</td>
    <td>0.2062309</td>
    <td>0.9082030</td>
</tr>
<tr>
    <td>200</td>
    <td>100</td>
    <td>1</td>
    <td>1e-4</td>
    <td>-Iterations and tol</td>
    <td>0.9770576</td>
    <td>0.2029024</td>
    <td>0.9088720</td>
</tr>
<tr>
    <td>200</td>
    <td>100</td>
    <td>1</td>
    <td>1e-3</td>
    <td>-more tol</td>
    <td>0.9766761</td>
    <td>0.1920801</td>
    <td>0.9106515</td>
</tr>

</table>

In [167]:
F = (0.15*lr.coef_.shape[1])
A = np.argsort(np.abs(lr.coef_))[:,::-1].ravel()[:-F]

In [176]:
N = X.shape[0]
e = N/10*8
X_train, X_val, y_train, y_val = X[:e,A], X[e:,A], Y[:e], Y[e:]

In [None]:
c = 1e2
lr = LogisticRegression(C=c, max_iter=200, tol=1e-4, n_jobs=CPUS)
lr.fit(X_train, y_train)
predictions = lr.predict_proba(X_val)

In [None]:
_ = test_scores(y_val, predictions)

#### Submission

In [None]:
c     = 1e2
tol   = 1e-4
iters = 200
lr = LogisticRegression(C=c, max_iter=iters, tol=tol, n_jobs=CPUS)
lr.fit(X, Y)
predictions = lr.predict_proba(X_test)

In [None]:
submission = pd.DataFrame(np.vstack((IDs, 
                                     predictions[:,1])).T,
                          columns=['activity_id','outcome'])
submission.to_csv('submission.csv', index=False)

##### Use already-known predictions via forums

In [91]:
def add_leak_data(submission, loc='./Data/leak.csv.gz'):
    leak = pd.read_csv(loc)
    leak.columns = ['activity_id','leak_outcome']
    leak = leak.loc[(leak['leak_outcome']==1.0) | (leak['leak_outcome']==0.0),:]
    leak_submission = pd.merge(submission, leak, how='left', on='activity_id')
    nonleak = np.isnan(leak_submission['leak_outcome'])
    leak_submission.loc[nonleak,'leak_outcome'] = leak_submission.loc[nonleak, 'outcome']
    del leak_submission['outcome']
    leak_submission.columns = ['activity_id','outcome']
    return leak_submission

# leak_submission = add_leak_data(submission)
# leak_submission.to_csv('leak_submission.csv', index=False)

!gzip submission.csv
!s3put -bbrandonshurick -p/home/ubuntu/ -gpublic-read submission.csv.gz 

### XGBoost

In [45]:
import xgboost as xgb

In [46]:
def test_scores_xgb(y_val, predictions):
    pfull = np.c_[1-predictions,predictions]
    ll = log_loss(y_val, pfull)
    a = accuracy_score(y_val, np.argmax(pfull ,axis=1))
    auc = roc_auc_score(np.c_[y_val==0, y_val==1], pfull)
    r = {'logloss':ll,'accuracy':a,'AUC':auc}
    return r

In [47]:
dtrain = xgb.DMatrix( X_train, label=y_train.as_matrix() )
dval = xgb.DMatrix( X_val, label=y_val.as_matrix() )
dtest = xgb.DMatrix( X_test )

#### Train final model

In [121]:
d = 5
e = 0.01
param = {'bst:max_depth':d, 
         'bst:eta':e, 
         'subsample':0.5, 
         'colsample_bytree':0.7,
         'silent':1, 
         'lambda':1.0,
         'objective':'binary:logistic',
         'min_child_weight':0,
         'booster':'gblinear'}
param['eval_metric'] = 'logloss'
param['nthread'] = CPUS
evallist  = [(dval,'eval'), (dtrain,'train')]
xgb_model = xgb.train(param.items(), dtrain, 101, evallist, verbose_eval=50)

[0]	eval-logloss:0.283932	train-logloss:0.18327
[50]	eval-logloss:0.195622	train-logloss:0.080249
[100]	eval-logloss:0.195373	train-logloss:0.080235


In [122]:
predictions = xgb_model.predict( dval )
test_scores_xgb(y_val, predictions)

{'AUC': 0.97440198519505272,
 'accuracy': 0.90791404886462679,
 'logloss': 0.19537269320285294}

#### Submission

In [123]:
predictions = xgb_model.predict( dtest )

In [124]:
submission = pd.DataFrame(np.vstack((IDs, 
                                     predictions)).T,
                          columns=['activity_id','outcome'])
submission.to_csv('submission.csv', index=False)
leak_submission = add_leak_data(submission)
leak_submission.to_csv('leak_submission.csv', index=False)

In [118]:
!gzip leak_submission.csv 

leak_submission.csv.gz already exists -- do you wish to overwrite (y or n)? ^C
