#### Load libraries

In [1]:
import os

import numpy as np
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
from scipy.stats import percentileofscore
# NOTE: sklearn.cross_validation only exists in old scikit-learn releases;
# newer releases expose train_test_split from sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.svm import SVC

#### Load data

In [2]:
DIR = '/Users/brandonshurick/School/Extras/Kaggle/RedHat/'
FTRAIN = 'Data/act_train.csv.gz'
FTEST = 'Data/act_test.csv.gz'
FPEOPLE = 'Data/people.csv.gz'
FSAMPLE = 'Data/sample_submission.csv.gz'

In [3]:
train_raw = pd.read_csv(DIR+FTRAIN)
test_raw = pd.read_csv(DIR+FTEST)
people = pd.read_csv(DIR+FPEOPLE)

In [4]:
Y = train_raw['outcome']
del train_raw['outcome']
p = Y.value_counts() / train_raw.shape[0]

In [5]:
# Show the share of training rows carrying each `outcome` label.
p

0    0.556046
1    0.443954
Name: outcome, dtype: float64

In [6]:
# Baseline: predict the empirical class prior for every training row.
N = train_raw.shape[0]
# value_counts() orders by frequency, not by label, so sort by label first:
# column j of `predictions` must hold the probability of class j.
# np.tile repeats that one row N times, giving an (N, n_classes) float array.
predictions = np.tile(p.sort_index().values, (N, 1))

In [7]:
# Log-loss of the constant class-prior baseline on the training labels.
log_loss(y_true=Y, y_pred=predictions)

0.68685173924161147

#### Feature updates for sparse matrix

In [None]:
# Replace char_38 by its percentile rank within the people table, rounded to
# the nearest multiple of ten (round(..., -1)).
percentile_ref = people['char_38'].values
compare_percentile = lambda x: round(percentileofscore(percentile_ref, x, kind='weak'), -1)

# percentileofscore scans the whole reference array on every call, so calling
# it once per row is quadratic. The result depends only on the value, so
# compute it once per distinct value and map the column through the lookup.
decile_by_value = dict(
    (value, compare_percentile(value))
    for value in people['char_38'].unique()
)
people['char_38'] = people['char_38'].map(decile_by_value)

In [8]:
def format_col(x, p):
    """Build the token '<p>_<x>' used as a one-hot feature name.

    Parameters
    ----------
    x : object
        Cell value. Strings (including ``str`` subclasses) have their spaces
        replaced by underscores; any other value is converted with ``str``.
    p : str
        Prefix, normally the name of the column the value came from.

    Returns
    -------
    str
        ``p + '_' + <normalised value>``.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # also get their spaces replaced.
    if isinstance(x, str):
        return p + '_' + x.replace(' ', '_')
    return p + '_' + str(x)

def update_cols(df):
    """Turn every column except the first into '<column>_<value>' string tokens.

    The frame is modified in place and also returned.

    Steps:
      1. Parse the ``date`` column and replace it with three integer columns
         ``day``, ``year`` and ``month`` (appended in that order).
      2. Rewrite each value of every column but the first through
         ``format_col``, so values carry their column name as a prefix.
      3. Print the frame's memory usage before and after.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column parseable by ``pd.to_datetime``.
        The first column is left untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_size = np.sum(df.memory_usage())

    # Split the date into its components, then drop the original column.
    parsed_dates = pd.to_datetime(df['date'])
    df['day'] = parsed_dates.dt.day
    df['year'] = parsed_dates.dt.year
    df['month'] = parsed_dates.dt.month
    del df['date']

    # Prefix every value with its column name (the first column is skipped).
    # Assign with df[col] = ... so the column is replaced outright; writing
    # strings through df.loc[:, col] into an int/bool column is an in-place
    # set that newer pandas versions warn about or reject.
    for col in list(df.columns)[1:]:
        df[col] = df[col].apply(lambda x, col=col: format_col(x, col))

    end_size = np.sum(df.memory_usage())
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('start size: {}, end size: {}'.format(start_size, end_size))
    return df

In [9]:
# Tokenise both tables (people first, then the training activities).
# update_cols mutates its argument and returns it.
people, train_raw = update_cols(people), update_cols(train_raw)

start size: 24963648, end size: 65056664
start size: 246096664, end size: 281253320


In [10]:
# Move people_id out of the columns and into the index of both tables so they
# can be joined index-to-index.
train_raw = train_raw.set_index('people_id')
people = people.set_index('people_id')

#### Transform data to sparse OHE matrix

In [11]:
# Attach each person's attributes to their activities by joining the two
# frames on their indexes (default inner join).
# NOTE(review): Y was split off before this merge and is not carried through
# it — confirm the merged row order still matches Y.
train_full = train_raw.merge(people, left_index=True, right_index=True)

In [12]:
# Size of the merged frame in GiB.
train_full.memory_usage().sum() * 1.0 / 1024 ** 3

0.9495234340429306

In [13]:
# Sorted vocabulary of every distinct token found anywhere in the merged
# frame. np.unique flattens its input; .values replaces the removed
# DataFrame.as_matrix().
# NOTE(review): both source tables get day/year/month columns whose values
# were prefixed before the merge, so e.g. 'day_5' from either table maps to
# the same vocabulary entry — confirm this sharing is intended.
all_values = np.unique(train_full.values)

# Token -> column index of the one-hot matrix.
ohe_dict = {token: idx for idx, token in enumerate(all_values)}

In [15]:
# Build the (row, col, value) triplets of the one-hot matrix: one entry per
# cell of train_full, placed in the column assigned to that cell's token.
#
# The cells are walked in row-major order in a single pass instead of calling
# train_full.iloc[i] for every row, which is very slow on a large frame (and
# len(map(...)) does not work on Python 3).

# Column index of every cell, row by row.
cols = [ohe_dict[token] for token in train_full.values.ravel()]
# Row index of every cell: each row number repeated once per column.
rows = np.repeat(np.arange(train_full.shape[0]), train_full.shape[1])
# Every entry is a 1. rows and data are ndarrays here instead of lists.
data = np.ones(len(cols), dtype=int)

In [16]:
# Assemble the sparse one-hot matrix from the (data, (rows, cols)) triplets:
# one row per merged training row, one column per vocabulary token.
train_full_sp = csr_matrix(
    (np.asarray(data), (np.asarray(rows), np.asarray(cols))),
    shape=(train_full.shape[0], len(ohe_dict)),
)

In [17]:
# Drop the large intermediates; only the sparse matrix and Y are used below.
del rows, cols, data
del train_full, ohe_dict, all_values
del train_raw, people

#### Split train and validation

In [None]:
# Hold out 33% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_full_sp, Y, test_size=0.33, random_state=2)

#### Test logistic regression models

In [None]:
# Sweep the inverse regularisation strength C of a logistic regression and
# record validation log-loss and accuracy for each setting.
cvalues = [1e-6, 1e-4, 1e-2, 1.0, 1e2, 1e4, 1e6]
results = []
for c in cvalues:
    lr = LogisticRegression(C=c, max_iter=1000, tol=1e-9)
    lr.fit(X_train, y_train)

    # Accuracy is scored on hard class labels ...
    predictions = lr.predict(X_test)
    a = accuracy_score(y_test, predictions)

    # ... but log-loss is a probabilistic metric: it must be given predicted
    # class probabilities, not the 0/1 labels returned by predict().
    probabilities = lr.predict_proba(X_test)
    ll = log_loss(y_test, probabilities)

    # Keep C with its scores so rows of the results table are identifiable.
    results.append({'C': c, 'logloss': ll, 'accuracy': a})

In [None]:
# One row per C value tried in the sweep.
pd.DataFrame(data=results)

#### Test SVM

In [None]:
# Fit a support vector classifier with C=1.0 on the training split.
svm = SVC(C=1.0)
svm.fit(X=X_train, y=y_train)

In [None]:
# Score the SVM on the validation split.
predictions = svm.predict(X_test)
a = accuracy_score(y_test, predictions)

# Log-loss is only meaningful on class probabilities. Use them when the
# classifier exposes predict_proba; otherwise fall back to the hard labels.
# NOTE(review): with SVC(C=1.0) as fitted in this notebook the fallback branch
# is presumably the one taken, and a log-loss computed from 0/1 labels is not
# comparable to a probabilistic log-loss — confirm, and enable probability
# estimates on the SVC if a real log-loss is wanted.
if hasattr(svm, 'predict_proba'):
    ll = log_loss(y_test, svm.predict_proba(X_test))
else:
    ll = log_loss(y_test, predictions)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Accuracy: {}%, Log-loss: {}'.format(round(a*100,2),round(ll,4)))