## Red Hat Kaggle

https://www.kaggle.com/c/predicting-red-hat-business-value

##### Load libraries

In [51]:
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import multiprocessing
from collections import Counter
import re

# CPU count reported by the OS; reused below as n_jobs / nthread for the models.
CPUS = multiprocessing.cpu_count()

In [29]:
# Display the detected CPU count.
CPUS

16

##### Custom Functions

In [30]:
def format_col(x, p):
    """Prefix a cell value with its column name to form a categorical token.

    Args:
        x: Cell value. Strings have their spaces replaced by underscores;
            any other value is converted with ``str``.
        p: Column name used as the prefix.

    Returns:
        The string ``"<p>_<value>"``.
    """
    # isinstance (rather than an exact type comparison) also covers str
    # subclasses, so their spaces are normalised as well.
    if isinstance(x, str):
        return p + '_' + x.replace(' ', '_')
    return p + '_' + str(x)

def update_cols(df, c=1):
    """Split ``date`` into day/year/month and tag values with their column name.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
        c: Positional index of the first column to tag; columns before it
            are left untouched.

    Returns:
        The same DataFrame, with ``date`` replaced by ``day``, ``year`` and
        ``month`` (appended at the end) and every column from position ``c``
        onwards rewritten through ``format_col``.
    """
    # fix date columns
    dates = pd.to_datetime(df['date'])
    df['day'] = dates.dt.day
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    del df['date']

    # include column name with value
    for p in list(df.columns)[c:]:
        # Replace the whole column instead of writing through .loc: the values
        # change dtype (e.g. int -> str) and a .loc[:, p] assignment tries to
        # set them into the existing column's dtype.
        df[p] = df[p].apply(format_col, args=(p,))

    return df

In [31]:
def test_scores(y_test, predictions, pr=True, c=None):
    """Compute log-loss, accuracy and AUC for two-class probability predictions.

    Args:
        y_test: True labels, compared against 0 and 1.
        predictions: Array with one probability column per class; the
            predicted label is the arg-max along axis 1.
        pr: When true, print a one-line summary of the scores.
        c: Regularisation value to record with the scores. When omitted,
            the module-level name ``c`` is used if it exists.

    Returns:
        Dict with keys ``'c'``, ``'logloss'``, ``'accuracy'`` and ``'AUC'``.
    """
    if c is None:
        # Backward compatibility: existing callers rely on the loop variable
        # ``c`` at module level being picked up implicitly.
        c = globals().get('c')
    ll = log_loss(y_test, predictions)
    a = accuracy_score(y_test, np.argmax(predictions, axis=1))
    # One-hot encode the labels so both probability columns are scored.
    auc = roc_auc_score(np.c_[y_test == 0, y_test == 1], predictions)
    r = {'c': c, 'logloss': ll, 'accuracy': a, 'AUC': auc}
    pstr = '''C:{c} Log-Loss:{logloss:.7f} Accuracy:{accuracy:.7f} AUC:{AUC:.7f}'''
    if pr:
        # Single-argument call form prints identically on Python 2 and 3.
        print(pstr.format(**r))
    return r

In [211]:
def create_sparse_data(df, ohe_dict):
    """Encode a DataFrame of categorical tokens as a sparse indicator matrix.

    Args:
        df: DataFrame whose cell values are looked up in ``ohe_dict``.
        ohe_dict: Mapping from cell value to output column index.

    Returns:
        A ``csr_matrix`` of shape ``(len(df), len(ohe_dict))`` holding a 1 at
        ``(i, ohe_dict[v])`` for each value ``v`` of row ``i`` that has an
        entry in ``ohe_dict``. Values without an entry are ignored.
    """
    rows = []
    cols = []
    # Walk the underlying array once rather than building a Series per row.
    for i, record in enumerate(df.values):
        # Keep only values known to the dictionary (single pass, no
        # repeated list scans to strip a sentinel).
        hits = [ohe_dict[v] for v in record if v in ohe_dict]
        rows.extend([i] * len(hits))
        cols.extend(hits)
    data = np.ones(len(cols), dtype=int)
    # Explicit integer dtypes keep the index arrays valid even when empty.
    return csr_matrix((data,
                       (np.array(rows, dtype=int),
                        np.array(cols, dtype=int))),
                      shape=(df.shape[0], len(ohe_dict)))

In [193]:
def create_missing_val_lookup(df):
    """Build lookup tables used to fill missing ``char_*`` values.

    Args:
        df: DataFrame with columns whose names contain ``char_``. Rows are
            grouped by their index value (``groupby(level=0)``).

    Returns:
        A tuple ``(missing_val_lookup, freq_lookup_dict)``:

        * ``missing_val_lookup`` -- DataFrame with one column per ``char_*``
          column, holding the most common non-missing value per index
          value; remaining gaps are filled from ``freq_lookup_dict``.
        * ``freq_lookup_dict`` -- dict mapping each ``char_*`` column name to
          its most common non-missing value over the whole frame.
    """
    # most common element of an iterable
    mode_str = lambda x: Counter(list(x)).most_common(1)[0][0]
    # columns to process: every column whose name contains 'char_'
    char_cols = [ c for c in list(df.columns) if re.search('char_',c)]

    def find_char_modes(df, char_cols):
        # Yields, per column, a Series of the most common value per index value.
        for char_col in char_cols:
            # a value counts as missing when its string form is 'nan' (any case)
            nonnulls = df[char_col].apply(lambda x: str(x).lower()!='nan')
            char_r = df.loc[nonnulls, char_col].groupby(level=0).apply(mode_str)
            yield char_r

    def find_mode_overall(df, char_cols):
        # Yields, per column, the single most common non-missing value.
        for char_col in char_cols:
            nonnulls = df[char_col].apply(lambda x: str(x).lower()!='nan')
            # reset_index() turns the Series into a frame so apply() runs per
            # column; only the char column's result is kept
            freq_val = df.loc[nonnulls, char_col].reset_index().apply(mode_str)[char_col]
            yield freq_val

    # find most common result for each column-person
    # (generator: evaluated when consumed by pd.concat below)
    char_col_results = find_char_modes(df, char_cols)

    # find most common result overall for each column
    # (generator: evaluated when consumed by pd.DataFrame below)
    freq_results = find_mode_overall(df, char_cols)

    # create lookup table for missing values
    missing_val_lookup = pd.DataFrame(pd.concat(char_col_results, axis=1), columns=char_cols)
    freq_lookup = pd.DataFrame(freq_results, index=char_cols, columns=['most_freq'])
    freq_lookup_dict = freq_lookup.to_dict()['most_freq']
    # index values with no non-missing entry for a column fall back to that
    # column's overall mode
    missing_val_lookup = missing_val_lookup.fillna(freq_lookup_dict, axis=0)

    return missing_val_lookup, freq_lookup_dict

##### Load data

In [32]:
# Data locations; DIR is prepended to a file name when the CSVs are read.
DIR = '/'
FTRAIN = 'Data/act_train.csv.gz'
FTEST = 'Data/act_test.csv.gz'
FPEOPLE = 'Data/people.csv.gz'
# sample submission file (not loaded in the cells shown in this notebook)
FSAMPLE = 'Data/sample_submission.csv.gz'

In [33]:
# Load the train/test activity tables and the people table from gzipped CSVs.
train_raw, test_raw, people = (
    pd.read_csv(DIR + fname) for fname in (FTRAIN, FTEST, FPEOPLE)
)

In [34]:
# Detach the target column from the training features in one step.
Y = train_raw.pop('outcome')

##### Baseline model

In [35]:
# Class priors for the constant baseline. value_counts() orders by frequency,
# so reindex to [0, 1] to guarantee that position k holds P(outcome == k)
# regardless of which class is more common.
p = Y.value_counts().reindex([0, 1], fill_value=0) * 1.0 / train_raw.shape[0]

In [36]:
# Constant baseline: every row is predicted with the class priors in ``p``.
N = train_raw.shape[0]
predictions = np.zeros((N, 2))
# Broadcast assignment replaces the per-row Python loop.
predictions[:] = np.asarray(p)

In [37]:
# Baseline accuracy: argmax picks the same class for every row.
accuracy_score(Y, np.argmax(predictions,axis=1))

0.55604560342712916

In [38]:
# Baseline log-loss when every row is predicted with the class priors.
log_loss(Y, predictions)

0.68685173924161147

In [39]:
# Baseline AUC against one-hot labels; every row has the same scores.
roc_auc_score(np.c_[Y==0,Y==1], predictions)

0.5

##### Handle null values

In [40]:
# Per-column summary; a count below the row total indicates missing values.
train_raw.describe()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
count,2197291,2197291,2197291,2197291,157615,157615,157615,157615,157615,157615,157615,157615,157615,2039676
unique,151295,2197291,411,7,51,32,11,7,7,5,8,18,19,6515
top,ppl_294918,act2_979099,2022-09-30,type 2,type 2,type 2,type 1,type 3,type 6,type 2,type 1,type 4,type 8,type 1
freq,55103,1,48174,904683,38030,50524,38224,98131,67989,61026,52548,77460,31794,904683


In [41]:
# Same per-column summary for the test activities.
test_raw.describe()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
count,498687,498687,498687,498687,40092,40092,40092,40092,40092,40092,40092,40092,40092,458595
unique,37823,498687,411,7,48,31,11,7,6,5,8,18,19,3961
top,ppl_112017,act2_2659718,2022-09-16,type 2,type 2,type 2,type 1,type 3,type 6,type 2,type 1,type 4,type 8,type 1
freq,1650,1,10302,223164,9862,12757,9927,25046,17131,15940,13290,19803,7888,223164


In [42]:
# Index all three tables by people_id; the column itself is consumed by the
# index and no longer appears among the features.
train_raw.set_index('people_id', inplace=True)
test_raw.set_index('people_id', inplace=True)
people.set_index('people_id', inplace=True)

In [194]:
# Build the fill tables from the training activities only, then patch the
# missing values in train_raw from the per-index lookup table.
people_lookup, freq_lookup = create_missing_val_lookup(train_raw)
train_raw = train_raw.fillna( people_lookup, axis=0)

In [196]:
# Apply the training-derived lookups to the test set: first the per-index
# table, then the overall per-column modes for anything still missing.
test_raw = test_raw.fillna( people_lookup, axis=0 )
test_raw = test_raw.fillna( freq_lookup, axis=0 )

##### Feature updates for sparse matrix

In [200]:
# Replace the numeric char_38 by its percentile rank within the people table,
# rounded to the nearest 10, so it can be one-hot encoded like the other
# columns.
import scipy.stats  # `import scipy as sp` alone does not guarantee sp.stats is loaded

# Plain ndarray of the reference distribution (Series.ravel() is deprecated).
percentile_ref = people['char_38'].values


def compare_percentile(x):
    """Return the 'weak' percentile of x in percentile_ref, rounded to tens."""
    return round(sp.stats.percentileofscore(percentile_ref, x, kind='weak'), -1)


people['char_38'] = people['char_38'].apply(compare_percentile)

In [201]:
# people: tag every column (c=0). Activity tables: leave column 0 untouched
# and tag the rest (c=1).
people = update_cols(people, 0)
train_raw = update_cols(train_raw, 1)
test_raw = update_cols(test_raw, 1)

##### Transform data to sparse OHE matrix

In [204]:
# Attach the people attributes to each activity (both frames are indexed by
# people_id), then re-key the result by activity_id.
# NOTE(review): X is later built from train_full while Y keeps train_raw's
# original row order -- confirm the merge preserves that order.
train_full = (train_raw
              .merge(people, left_index=True, right_index=True)
              .set_index('activity_id'))
test_full = (test_raw
             .merge(people, left_index=True, right_index=True)
             .set_index('activity_id'))

In [205]:
# Size of train_full in GiB as reported by memory_usage() (default, non-deep).
sum(train_full.memory_usage())*1.0/1024**3

0.93315234035253525

In [206]:
# Size of test_full in GiB as reported by memory_usage() (default, non-deep).
sum(test_full.memory_usage())*1.0/1024**3

0.21178393810987473

##### Create OHE dictionary 

In [207]:
# Map every distinct token in the training frame to a column index.
# DataFrame.as_matrix() has been removed from pandas; .values is equivalent.
all_values = np.unique(train_full.values.ravel())
ohe_dict = {value: idx for idx, value in enumerate(all_values)}

##### Create sparse matrices for test and training datasets

In [212]:
# Sparse training matrix: one column per token in ohe_dict.
X = create_sparse_data(train_full, ohe_dict)

In [213]:
# Test matrix uses the training dictionary, so its columns line up with X.
X_test = create_sparse_data(test_full, ohe_dict)

In [214]:
# Keep the test activity ids (the frame's index) for the submission file.
IDs = np.array(test_full.index)

##### Clear unused memory

In [215]:
# Release the large intermediates now that X, X_test and IDs have been built.
del train_full, train_raw, test_full, test_raw, ohe_dict, all_values, people

##### Split train and validation

In [216]:
# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.20, random_state=2)

### Logistic Regression

In [217]:
# Sweep the regularisation parameter C and score each fit on the validation
# split.
cvalues = [ 1e1, 1e2, 1e3 ]
results = []
# NOTE: the loop variable must stay named ``c`` -- test_scores() reads the
# module-level ``c`` to label its output.
for c in cvalues:
    lr = LogisticRegression(C=c, max_iter=500, tol=1e-5, n_jobs=CPUS)
    lr.fit(X_train, y_train)
    predictions = lr.predict_proba(X_val)
    r = test_scores(y_val, predictions)
    results.append(r)

C:10.0 Log-Loss:0.0677318 Accuracy:0.9720019 AUC:0.9971554
C:100.0 Log-Loss:0.0648277 Accuracy:0.9718040 AUC:0.9972451
C:1000.0 Log-Loss:0.0656197 Accuracy:0.9716629 AUC:0.9972100


#### Run full model

In [None]:
# Refit on all training rows with C=1e2 (lowest validation log-loss in the
# sweep output) and predict class probabilities for the test set.
c = 1e2
lr = LogisticRegression(C=c, max_iter=500, tol=1e-5, n_jobs=CPUS)
lr.fit(X, Y)
predictions = lr.predict_proba(X_test)

#### Submission

In [None]:
# Build the submission from separate columns so ids stay strings and the
# probabilities stay floats; np.vstack would coerce both into one dtype.
# Column 1 of predict_proba is the probability of outcome == 1.
submission = pd.DataFrame({'activity_id': IDs,
                           'outcome': predictions[:, 1]},
                          columns=['activity_id', 'outcome'])
submission.to_csv('submission.csv', index=False)

# Compress the submission, then upload the archive with s3put.
# NOTE(review): -gpublic-read presumably makes the uploaded object publicly
# readable -- confirm that is intended.
!gzip submission.csv
!s3put -bbrandonshurick -p/home/ubuntu/ -gpublic-read submission.csv.gz 

### XGBoost

In [27]:
import xgboost as xgb

In [28]:
def test_scores_xgb(y_val, predictions):
    """Score positive-class probabilities with log-loss, accuracy and AUC.

    Args:
        y_val: True labels, compared against 0 and 1.
        predictions: 1-D array of probabilities for class 1.

    Returns:
        Dict with keys ``'logloss'``, ``'accuracy'`` and ``'AUC'``.
    """
    # Expand to two columns: [P(class 0), P(class 1)].
    two_col = np.column_stack((1 - predictions, predictions))
    one_hot = np.column_stack((y_val == 0, y_val == 1))
    return {
        'logloss': log_loss(y_val, two_col),
        'accuracy': accuracy_score(y_val, np.argmax(two_col, axis=1)),
        'AUC': roc_auc_score(one_hot, two_col),
    }

In [29]:
# Wrap the sparse matrices for xgboost. Series.as_matrix() has been removed
# from pandas; .values yields the same label array.
dtrain = xgb.DMatrix(X_train, label=y_train.values)
dval = xgb.DMatrix(X_val, label=y_val.values)
dtest = xgb.DMatrix(X_test)

#### Train final model

In [None]:
# Booster hyper-parameters: tree depth and learning rate are kept in their
# own names so they are easy to tweak between runs.
d = 500
e = 0.01
param = {'max_depth': d,
         'eta': e,
         'subsample': 0.5,
         'colsample_bytree': 0.75,
         'colsample_bylevel': 0.75,
         'silent': 1,
         'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
param['nthread'] = CPUS
# AUC is reported for both sets every 50 rounds.
evallist = [(dval, 'eval'), (dtrain, 'train')]
# Pass the params dict itself (the documented form) rather than
# param.items(), and name the round count / eval list explicitly so the call
# does not depend on positional-argument order.
xgb_model = xgb.train(param, dtrain, num_boost_round=501, evals=evallist,
                      verbose_eval=50)

[0]	eval-auc:0.986746	train-auc:0.989671
[50]	eval-auc:0.997443	train-auc:0.998835


In [None]:
# Score the trained booster on the validation split.
predictions = xgb_model.predict( dval )
test_scores_xgb(y_val, predictions)

#### Submission

In [None]:
# Predict on the test DMatrix; these values become the ``outcome`` column.
predictions = xgb_model.predict( dtest )

In [None]:
# Build the submission from separate columns so ids stay strings and the
# predictions stay floats; np.vstack would coerce both into one dtype.
submission = pd.DataFrame({'activity_id': IDs,
                           'outcome': predictions},
                          columns=['activity_id', 'outcome'])
submission.to_csv('submission.csv', index=False)

In [None]:
# Compress the submission, then upload the archive. Each shell command needs
# its own line; fused on one line, the second command is passed to gzip as
# arguments.
!gzip submission.csv
!s3put -bbrandonshurick -p/home/ubuntu/ -gpublic-read submission.csv.gz