## Red Hat Kaggle: Predicting Red Hat Business Value

https://www.kaggle.com/c/predicting-red-hat-business-value

### Load libraries

In [56]:
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import multiprocessing

# Number of CPUs reported by the system; passed as n_jobs to the models below.
CPUS = multiprocessing.cpu_count()

In [57]:
CPUS

16

### Load data

In [58]:
# Input locations. Full paths are built by plain string concatenation
# (DIR + F...), so with DIR = '/' the files are read from '/Data/...'.
DIR = '/'
FTRAIN = 'Data/act_train.csv.gz'
FTEST = 'Data/act_test.csv.gz'
FPEOPLE = 'Data/people.csv.gz'
# Not referenced by any of the cells shown in this notebook.
FSAMPLE = 'Data/sample_submission.csv.gz'

In [59]:
# Read the two activity tables and the people table, in that order.
train_raw, test_raw, people = (
    pd.read_csv(DIR + fname) for fname in (FTRAIN, FTEST, FPEOPLE)
)

In [60]:
# Split the target off the training frame: pop() returns the 'outcome'
# column and removes it from train_raw in the same step.
Y = train_raw.pop('outcome')

In [61]:
# Relative frequency of each outcome value, indexed by the outcome label.
p = Y.value_counts() / float(train_raw.shape[0])

In [62]:
# Baseline model: every row is assigned the empirical class priors.
N = train_raw.shape[0]
# value_counts() orders its result by frequency, not by label, so the priors
# are selected by label explicitly: column k must hold P(outcome == k) for
# the argmax / log-loss / AUC cells that consume this array.
predictions = np.tile(p.reindex([0, 1]).fillna(0.0).values, (N, 1))

In [63]:
# Accuracy of the constant-prior baseline (argmax picks the same column for every row).
accuracy_score(Y, np.argmax(predictions,axis=1))

0.55604560342712916

In [64]:
# Log-loss of the constant-prior baseline.
log_loss(Y, predictions)

0.68685173924161147

In [65]:
# AUC of the baseline; np.c_ builds a two-column indicator matrix
# (Y == 0, Y == 1) to line up with the two probability columns.
roc_auc_score(np.c_[Y==0,Y==1], predictions)

0.5

### Prepare features for the sparse matrix

In [66]:
# Replace char_38 with its percentile rank within the people table,
# rounded to the nearest 10.
import scipy.stats  # "import scipy as sp" alone does not guarantee sp.stats is loaded

percentile_ref = people['char_38'].values


def compare_percentile(x):
    """Return the 'weak' percentile of x within percentile_ref, rounded to tens."""
    return round(sp.stats.percentileofscore(percentile_ref, x, kind='weak'), -1)


# percentileofscore scans the whole reference array on every call, so score
# each distinct value once and map the results back instead of rescoring
# every row.
# NOTE(review): assumes char_38 has no missing values -- confirm.
percentile_by_value = {v: compare_percentile(v) for v in np.unique(percentile_ref)}
people['char_38'] = people['char_38'].map(percentile_by_value)

In [67]:
def format_col(x, p):
    """Prefix a cell value with its column name to form a categorical token.

    Parameters
    ----------
    x : object
        Cell value. Strings have their spaces replaced by underscores; any
        other value is converted with ``str`` as-is.
    p : str
        Prefix to prepend (the column name).

    Returns
    -------
    str
        ``p + '_' + <value text>``.
    """
    # isinstance (rather than an exact type comparison) also catches str
    # subclasses such as numpy.str_, which would otherwise keep their spaces.
    if isinstance(x, str):
        return p + '_' + x.replace(' ', '_')
    return p + '_' + str(x)

def update_cols(df, c=1):
    """Expand the date column and prefix every value with its column name.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.
    c : int
        Number of leading columns (counted after ``date`` has been replaced
        by ``day``/``year``/``month``) whose values are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Break the date into calendar parts; the new columns are appended in
    # day, year, month order and the original column is dropped.
    stamps = pd.to_datetime(df['date'])
    df['date'] = stamps
    df['day'] = stamps.dt.day
    df['year'] = stamps.dt.year
    df['month'] = stamps.dt.month
    del df['date']

    # Every column from position c onwards becomes "<column>_<value>" text.
    for name in list(df.columns)[c:]:
        df.loc[:, name] = df.loc[:, name].apply(
            lambda value, prefix=name: format_col(value, prefix))

    return df

In [68]:
# The second argument is the number of leading columns left unprefixed:
# one for people and two for the activity tables (presumably the id
# columns -- verify against the CSV column order).
people = update_cols(people, 1)
train_raw = update_cols(train_raw, 2)
test_raw = update_cols(test_raw, 2)

In [69]:
# Key every frame by people_id (moving it out of the columns and into the
# index) so the tables can be joined on their indexes.
train_raw.set_index('people_id', inplace=True)
test_raw.set_index('people_id', inplace=True)
people.set_index('people_id', inplace=True)

### Transform data into a sparse one-hot-encoded (OHE) matrix

In [70]:
# Inner-join each activity table with the people table on the shared
# people_id index, then re-key the result by activity_id (which leaves the
# columns and becomes the index).
# NOTE(review): update_cols adds day/year/month to both tables, so the merge
# renames those columns with _x/_y suffixes -- but the cell values were
# prefixed before the merge, so e.g. an activity 'day_5' and a person 'day_5'
# are the same token downstream. Confirm this sharing is intended.
train_full = train_raw.merge(
    people, left_index=True, right_index=True).set_index('activity_id')
test_full = test_raw.merge(
    people, left_index=True, right_index=True).set_index('activity_id')

In [71]:
# Size of the merged training frame as reported by memory_usage(), converted from bytes to GiB.
sum(train_full.memory_usage())*1.0/1024**3

0.93315234035253525

In [72]:
# Size of the merged test frame as reported by memory_usage(), converted from bytes to GiB.
sum(test_full.memory_usage())*1.0/1024**3

0.21178393810987473

##### Create OHE dictionary 

In [73]:
# Assign a column index to every distinct token found in the training frame.
# Tokens that occur only in the test frame get no entry.
# .values replaces DataFrame.as_matrix(), which newer pandas releases removed;
# np.unique flattens its input, so no reshape/ravel is needed.
all_values = np.unique(train_full.values)
ohe_dict = {token: index for index, token in enumerate(all_values)}

##### Create sparse matrices for test and training datasets

In [74]:
def create_sparse_data(df, ohe_dict):
    """Build a sparse one-hot matrix from a frame of categorical tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; every cell value is looked up in ``ohe_dict``.
    ohe_dict : dict
        Maps a cell value to its column index. Cells whose value is not a
        key are skipped.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(df.shape[0], len(ohe_dict))``. Entry ``[i, j]`` counts the
        cells of row ``i`` that map to column ``j`` (repeated coordinates
        are summed by the sparse constructor).
    """
    rows = []
    cols = []
    # Walk the underlying array once rather than calling df.iloc[i] per row,
    # and drop unknown values in a single pass instead of repeatedly
    # searching for and deleting a -1 marker.
    for i, record in enumerate(df.values):
        known = [ohe_dict[value] for value in record if value in ohe_dict]
        rows.extend([i] * len(known))
        cols.extend(known)
    # Explicit integer dtypes keep the index arrays valid even when no cell
    # matched (an empty list would otherwise become a float array).
    return csr_matrix((np.ones(len(rows), dtype=int),
                       (np.array(rows, dtype=int),
                        np.array(cols, dtype=int))),
                      shape=(df.shape[0], len(ohe_dict)))

In [75]:
# One-hot encode the training frame.
X = create_sparse_data(train_full, ohe_dict)

In [76]:
# Encode the test frame with the training vocabulary; values missing from ohe_dict are skipped.
X_test = create_sparse_data(test_full, ohe_dict)

In [77]:
# Keep the test frame's index (activity_id) for the submission file.
IDs = np.array(test_full.index)

##### Clear unused memory

In [78]:
# Drop every intermediate that is no longer needed; only the sparse
# matrices, the target and the test ids are used from here on.
del train_full, train_raw, test_full, test_raw, ohe_dict, all_values, people

#### Split train and validation

In [36]:
# Hold out 33% of the rows for validation; random_state is fixed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, 
                                                  Y, 
                                                  test_size=0.33, 
                                                  random_state=2)

#### Test logistic regression models

In [37]:
def test_scores(y_test, predictions, pr=True, c=None):
    """Score predicted class probabilities against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels; must support element-wise ``== 0`` / ``== 1``.
    predictions : array-like
        Two-column array of predicted probabilities (column k for class k).
    pr : bool
        When true, print a one-line summary of the scores.
    c : float, optional
        Regularisation value to record alongside the scores. When omitted,
        the module-level ``c`` is used (``None`` if there is none), which is
        what earlier versions of this function read implicitly.

    Returns
    -------
    dict
        Keys ``'c'``, ``'logloss'``, ``'accuracy'`` and ``'AUC'``.
    """
    if c is None:
        # Backward-compatible fallback for callers that rely on a global c.
        c = globals().get('c')
    ll = log_loss(y_test, predictions)
    a = accuracy_score(y_test, np.argmax(predictions, axis=1))
    # Two-column indicator matrix to line up with the probability columns.
    auc = roc_auc_score(np.c_[y_test == 0, y_test == 1], predictions)
    r = {'c': c, 'logloss': ll, 'accuracy': a, 'AUC': auc}
    pstr = '''C:{c} Log-Loss:{logloss:.7f} Accuracy:{accuracy:.7f} AUC:{AUC:.7f}'''
    if pr:
        # Parenthesised form is valid as both a Python 2 print statement and
        # a Python 3 print call.
        print(pstr.format(**r))
    return r

In [None]:
# Sweep the regularisation parameter C and score each fit on the
# validation split.
cvalues = [ 1e1, 1e2, 1e3 ]
results = []
for c in cvalues:
    lr = LogisticRegression(C=c, max_iter=500, tol=1e-5, n_jobs=CPUS)
    lr.fit(X_train, y_train)
    predictions = lr.predict_proba(X_val)
    # test_scores is not passed c; it picks up this loop variable as a global.
    r = test_scores(y_val, predictions)
    results.append(r)

C:10.0 Log-Loss:0.0701360 Accuracy:0.9709064 AUC:0.9969400
C:100.0 Log-Loss:0.0672695 Accuracy:0.9709146 AUC:0.9970331
C:1000.0 Log-Loss:0.0684651 Accuracy:0.9708457 AUC:0.9969865


#### Run full model

In [38]:
# Refit on all training rows with the C that gave the lowest validation
# log-loss in the sweep, then predict class probabilities for the test set.
c = 1e2
lr = LogisticRegression(C=c, max_iter=500, tol=1e-5, n_jobs=CPUS)
lr.fit(X, Y)
predictions = lr.predict_proba(X_test)

#### Save to CSV

In [90]:
# Two-column submission: activity id and the second probability column.
# NOTE(review): vstack places the ids and the float probabilities in one
# array, so both columns share a single dtype -- confirm the precision
# written to the CSV is acceptable.
submission = pd.DataFrame(np.vstack((IDs, 
                                     predictions[:,1])).T,
                          columns=['activity_id','outcome'])
submission.to_csv('submission.csv', index=False)

# IPython shell escapes: compress the file, then upload it with s3put.
!gzip submission.csv
!s3put -bbrandonshurick -p/home/ubuntu/ -gpublic-read submission.csv.gz 