In [114]:
from scipy import sparse
def OneHotEncoder1(data, keymap=None):
    """
    One-hot encode every column of a categorical data matrix.

    Parameters
    ----------
    data : 2-D array whose columns are categorical features.
    keymap : optional list with one dict per column mapping a category
        value to its column offset inside that feature's block. When it is
        omitted a keymap is built from the unique values of each column.

    Returns
    -------
    (encoded, keymap) : the encoded data as a sparse CSR matrix with one
        block of columns per input feature, and the keymap that was used.
        Values that are missing from the keymap are ignored, i.e. they
        leave the corresponding block of the row empty.
    """
    if keymap is None:
        keymap = [
            dict((category, index)
                 for index, category in enumerate(set(list(column))))
            for column in data.T
        ]

    n_rows = data.shape[0]
    encoded_blocks = []
    for position, column in enumerate(data.T):
        lookup = keymap[position]
        block = sparse.lil_matrix((n_rows, len(lookup)))
        for row, value in enumerate(column):
            if value not in lookup:
                # Category unknown to the keymap: leave the row empty.
                continue
            block[row, lookup[value]] = 1
        encoded_blocks.append(block)

    return sparse.hstack(encoded_blocks).tocsr(), keymap


In [1]:
import pandas as pd
# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('train.csv')

In [2]:
# Preview the first rows to check the column layout.
train.head()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [3]:
from itertools import chain
def generate_combs(s, k):
    """
    Lazily yield every k-element combination of the items of ``s``.

    Parameters
    ----------
    s : iterable of items to choose from (a list or ``range`` in this notebook).
    k : int, number of items in each combination.

    Yields
    ------
    list
        One combination at a time, as a list, in lexicographic order of the
        item positions in ``s``. For ``k == 0`` a single empty list is
        yielded. For ``k`` larger than the number of items nothing is
        yielded (the previous hand-rolled recursion yielded a spurious
        empty combination in that case).

    Raises
    ------
    ValueError
        If ``k`` is negative (raised by ``itertools.combinations``).
    """
    # Function-scope import so the definition does not depend on another cell.
    from itertools import combinations

    for comb in combinations(s, k):
        yield list(comb)
                    
                    
    

In [4]:
# Sanity check: list the 3-element combinations of 0..4.
list(generate_combs(range(5), 3))

[[0, 1, 2],
 [0, 1, 3],
 [0, 1, 4],
 [0, 2, 3],
 [0, 2, 4],
 [0, 3, 4],
 [1, 2, 3],
 [1, 2, 4],
 [1, 3, 4],
 [2, 3, 4]]

In [5]:
import numpy as np
def concatination_features(data, degree=2):
    """
    Create new features by concatenating existing features from data.
    This is similar to polynomial feature generation but can be used
    with categorical data as well.

    Parameters:
    -----------
    data: numpy array of shape (n_samples, n_features), contains data with
    features to be concatenated. It must be a dense array: column slices
    are converted with ``.tolist()``.

    degree: int, number of features to concatenate to generate each new
    feature. For example, if there are three features 'a', 'b', and 'c'
    and degree=2, generated features are 'ab', 'ac', 'bc'.

    Returns:
    --------
    numpy array of shape (n_samples, n_combinations). Each column holds,
    for one combination of ``degree`` input columns, the built-in ``hash``
    of the tuple of values found in those columns. An array with zero
    columns is returned when there is no combination to build.

    Note: because values are hashes, two different value tuples can in
    principle collide. For string values Python 3 salts ``hash`` per
    process unless PYTHONHASHSEED is fixed, so such features are only
    comparable within a single run.
    """
    n_samples, n_features = data.shape

    # Materialise a real list: NumPy's stacking helpers no longer accept
    # generators (deprecated since NumPy 1.16).
    hashed_columns = [
        [hash(tuple(row)) for row in data[:, comb].tolist()]
        for comb in generate_combs(range(n_features), degree)
    ]
    if not hashed_columns:
        return np.empty((n_samples, 0), dtype=np.int64)

    # One row per combination so far; transpose to one row per sample.
    return np.array(hashed_columns).T

In [6]:
# Scratch cell: grab the raw value matrix and build the generator of
# single-column combinations.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
data = train.values
m, n = data.shape

feature_combs = generate_combs(range(n), k=1)


In [11]:
# Degree-2 concatenation features for every pair of columns of `train`.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
concatination_features(train.values, degree=2)

array([[3713081593804632581, 3713081688086064931, 3713081652919157781,
        ..., 3802992883545139856, 3802993053234181181,
        3415903829100288256],
       [3713081613378849631, 3713081632495158606, 3713081652919157781,
        ..., 3802583751228827281, 3802584108805237706,
        3468288415453038256],
       [3713081599014825406, 3713081638629827781, 3713081652639866331,
        ..., 3423292086759634856, 3423292121901643931,
        3696722415271732506],
       ...,
       [3713081598711718406, 3713081618499192881, 3713081652919157781,
        ..., 3833117828409294481, 3833117821922804681,
        3802510451410774931],
       [3713081708815336156, 3713081576308863531, 3713081652634453706,
        ..., 3413839132170712606, 3413839515002431281,
        3414695830855374706],
       [3713081638602764656, 3713081589658561831, 3713081652872609206,
        ..., 3798693361501076906, 3798693254492398131,
        3696722416048985456]])

In [12]:
# Reload the raw data and split it into features and target by column position.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Features are every column after the first; the target is the first column
# (ACTION in the preview of train.csv).
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
# presumably the first column of test.csv is a row id rather than a feature -- TODO confirm
X_test = test.iloc[:, 1:]


In [13]:
def make_poli_features(data, max_degree):
    """
    Append concatenation features of every degree from 2 to ``max_degree``
    to the original columns of ``data``.

    Parameters
    ----------
    data : pandas DataFrame (or any 2-D array-like) of features.
    max_degree : int, highest concatenation degree to generate. Values
        below 2 add nothing.

    Returns
    -------
    numpy array whose first columns are the original data, followed by the
    output of ``concatination_features`` for degree 2, 3, ..., max_degree.
    """
    # np.asarray replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0, and also lets plain arrays through unchanged.
    data_matrix = np.asarray(data)
    data_poli_feature = [data_matrix]
    for d in range(2, max_degree + 1):
        data_poli_feature.append(concatination_features(data_matrix, degree=d))

    return np.hstack(data_poli_feature)

In [135]:
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class HighOrderFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer that appends concatenation features of degree 2..``degree``
    (see ``concatination_features``) to the input columns.

    Parameters
    ----------
    degree : int, highest concatenation degree to generate (default 2).
    """

    def __init__(self, degree=2):
        self.degree = degree

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Return ``X`` with the concatenation features stacked to its right.

        ``X`` may be a pandas DataFrame, a list of rows or a dense numpy
        array. Other inputs are passed on unchanged, and
        ``concatination_features`` calls ``.tolist()`` on column slices, so
        a scipy sparse matrix is not supported.
        """
        if isinstance(X, pd.DataFrame):
            # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)

        X_poli_feature = [X]
        for d in range(2, self.degree + 1):
            X_poli_feature.append(concatination_features(X, degree=d))

        return np.hstack(X_poli_feature)

In [134]:
# Smoke test: expand X_train with the default degree-2 concatenation features.
hof = HighOrderFeatures()
hof.fit_transform(X_train)

<class 'pandas.core.frame.DataFrame'>


array([[              39353,               85475,              117961,
        ..., 3802992883545139856, 3802993053234181181,
        3415903829100288256],
       [              17183,                1540,              117961,
        ..., 3802583751228827281, 3802584108805237706,
        3468288415453038256],
       [              36724,               14457,              118219,
        ..., 3423292086759634856, 3423292121901643931,
        3696722415271732506],
       ...,
       [              34924,               28805,              117961,
        ..., 3833117828409294481, 3833117821922804681,
        3802510451410774931],
       [              80574,               55643,              118256,
        ..., 3413839132170712606, 3413839515002431281,
        3414695830855374706],
       [              14354,               59575,              117916,
        ..., 3798693361501076906, 3798693254492398131,
        3696722416048985456]])

In [105]:
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder for a dense 2-D array of categorical columns.

    ``fit`` records the sorted unique values of every column, and
    ``transform`` emits one block of indicator columns per input column as
    a sparse CSR matrix.

    NOTE(review): another cell of this notebook imports
    ``sklearn.preprocessing.OneHotEncoder`` under the same name; whichever
    was executed last is the one later cells pick up.
    """

    def __init__(self):
        self.fitted = False

    def fit(self, X, y=None):
        """Learn, for every column of ``X``, a mapping category -> offset."""
        col_uniques = [np.unique(c) for c in X.T]
        self.map = [dict(zip(c, range(len(c)))) for c in col_uniques]
        self.fitted = True
        return self

    def transform(self, X):
        """
        Encode ``X`` with the categories learned by ``fit``.

        The output always has one column per fitted category, whatever
        subset of categories is present in ``X``. Values that were not seen
        during ``fit`` are ignored: the row gets no indicator for that
        feature.

        Raises
        ------
        Exception
            If the transformer has not been fitted or ``X`` has a different
            number of columns than the matrix used to fit it.
        """
        if not self.fitted:
            raise Exception("Transformer not fitted.")

        n_samples, n_features = X.shape
        if len(self.map) != n_features:
            raise Exception("X must have the same number of columns \
                                as the matrix used to fit the transformer.")

        res = []
        for i, col in enumerate(X.T):
            mapping = self.map[i]
            row_indices = []
            col_indices = []
            for row, value in enumerate(col):
                index = mapping.get(value)
                if index is not None:
                    row_indices.append(row)
                    col_indices.append(index)
            ones = np.ones(len(row_indices))
            # Pass the shape explicitly: without it the width would be
            # inferred from the largest index present in this particular X.
            col_encoded = sparse.coo_matrix(
                (ones, (np.array(row_indices, dtype=np.intp),
                        np.array(col_indices, dtype=np.intp))),
                shape=(n_samples, len(mapping)))
            res.append(col_encoded)

        return sparse.hstack(res).tocsr()
            

In [103]:
# NOTE(review): this import reuses the name `OneHotEncoder`, which this notebook
# also defines as a custom class; whichever cell ran last decides which one
# later cells get.
from sklearn.preprocessing import OneHotEncoder
# Expand train and test with concatenation features up to degree 2.
max_degree = 2
X_train_poli = make_poli_features(X_train, max_degree)
X_test_poli = make_poli_features(X_test, max_degree)



In [19]:
# Toy 4x3 categorical matrix: the columns have 2, 3 and 4 distinct values,
# so the encoded matrix is expected to have 9 columns.
d = np.array([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

ohe = OneHotEncoder() 
ohe.fit_transform(d)


<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [20]:
# Fit only; the repr printed for this run appears to be scikit-learn's
# OneHotEncoder rather than the class defined in this notebook.
ohe.fit(d)

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [106]:
# NOTE(review): csr_matrix and bsr_matrix are not used by any cell shown in this notebook.
from scipy.sparse import csr_matrix, bsr_matrix
# One-hot encode the expanded test matrix on its own to inspect the resulting size.
ohe = OneHotEncoder() 
ohe.fit_transform(X_test_poli)


<58921x286821 sparse matrix of type '<type 'numpy.float64'>'
	with 2651445 stored elements in Compressed Sparse Row format>

In [136]:

def one_hot_endcode(X_train, X_test):
    """
    One-hot encode train and test data with a single shared encoder.

    Both matrices are stacked vertically and encoded in one
    ``fit_transform`` call, so they end up with the same column layout.
    The encoded rows are then split back at the original train/test
    boundary.

    Returns
    -------
    (X_train_ohe, X_test_ohe) : encoded train rows and encoded test rows.
    """
    encoder = OneHotEncoder()
    stacked = np.vstack([X_train, X_test])
    encoded = encoder.fit_transform(stacked)

    # Rows [0, boundary) came from X_train, the rest from X_test.
    boundary = X_train.shape[0]
    return encoded[:boundary,], encoded[boundary:,]

# Encode the expanded train and test matrices together so both share one column layout.
X_train_ohe, X_test_ohe = one_hot_endcode(X_train_poli, X_test_poli)


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

def scoring(estimator, X, y):
    """
    Scorer callable for model selection: ROC AUC of ``estimator`` on (X, y).

    The score is computed from column 1 of ``estimator.predict_proba(X)``.
    """
    positive_proba = estimator.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

# Grid search over 10 log-spaced values of C between 0.1 and 10, ranked by `scoring`.
clf = LogisticRegression()
gs = GridSearchCV(clf, param_grid={'C': np.logspace(-1, 1, 10)}, scoring=scoring)

In [39]:
# Run the grid search on the one-hot encoded training matrix.
gs.fit(X_train_ohe, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.1    ,  0.16681,  0.27826,  0.46416,  0.77426,  1.29155,
        2.15443,  3.59381,  5.99484, 10.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=<function scoring at 0x7f63eabcbc80>, verbose=0)

In [40]:
# Value of C selected by the grid search.
gs.best_params_

{'C': 0.2782559402207124}

In [42]:

#clf = LogisticRegression(C=3)
# Re-evaluate the best estimator found by the grid search with cross-validation.
# NOTE(review): no `scoring` argument is passed here, so these numbers are
# presumably the estimator's default score rather than the ROC AUC used by
# the grid search -- confirm.
clf = gs.best_estimator_
cross_val_score(clf, X_train_ohe, y_train, n_jobs=4)

array([0.95294764, 0.94918978, 0.94964292])

In [167]:
def make_submission_file(y_test_pred, filename=None):
    """
    Write predictions to a CSV file with the columns ``ID`` and ``ACTION``.

    Parameters
    ----------
    y_test_pred : 1-D sequence of predictions, one per test row, in test
        order.
    filename : path of the CSV file to write; 'submit.csv' is used when it
        is omitted or empty.

    The ``ID`` column numbers the rows 1..len(y_test_pred). The DataFrame
    index is not written.
    """
    if not filename:
        filename = 'submit.csv'

    # `range` instead of `xrange`, which does not exist on Python 3;
    # list() keeps the value a plain list on both major versions.
    row_ids = list(range(1, len(y_test_pred) + 1))
    submit_data = pd.DataFrame(
        {'ID': row_ids, 'ACTION': y_test_pred},
        columns=['ID', 'ACTION'],
    )
    submit_data.to_csv(filename, index=False)

In [192]:
from scipy.sparse import vstack

# Repeat the training set n_stack times, then shuffle the rows.
# NOTE: this cell overwrites X_train_ohe and y_train, so running it again
# stacks the already stacked data once more.
n_stack = 2
X_train_ohe_double = vstack([X_train_ohe for _ in range(n_stack)])
# Convert to a numpy array first: y_train is a pandas Series at this point and
# current pandas versions no longer provide Series.reshape.
y_train_values = np.asarray(y_train)
y_train_double = np.vstack([y_train_values.reshape(-1, 1) for _ in range(n_stack)]).ravel()

# Shuffle train data

n_train = X_train_ohe_double.shape[0]
shuffle_index = np.random.permutation(n_train)
X_train_ohe = X_train_ohe_double[shuffle_index]
y_train = y_train_double[shuffle_index]


In [223]:
clf.fit(X_train_ohe, y_train)
# predict_proba returns one column per class; keep only column 1, the same
# column `scoring` evaluates, so the submission gets a single value per row.
y_test_pred = clf.predict_proba(X_test_ohe)[:, 1]

In [224]:
# Write the test predictions to a CSV submission file.
# NOTE(review): assumes the submit/ directory already exists -- confirm.
make_submission_file(y_test_pred, 'submit/submit_poli2_gs.csv')

In [226]:
##TODO
## Try RF and XGB with GridSearch 

In [140]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

def scoring(estimator, X, y):
    """
    Scorer callable for model selection: ROC AUC of ``estimator`` on (X, y),
    computed from column 1 of ``estimator.predict_proba(X)``.

    NOTE(review): a function with the same name and body is defined in an
    earlier cell of this notebook.
    """
    class_probabilities = estimator.predict_proba(X)
    positive_proba = class_probabilities[:, 1]
    return roc_auc_score(y, positive_proba)

# Pipeline that expands the features with HighOrderFeatures and feeds the
# result straight to the logistic regression.
# NOTE(review): there is no one-hot encoding step between `hof` and `clf`,
# unlike the manual flow earlier in the notebook -- confirm this is intended.
hof = HighOrderFeatures()
clf = LogisticRegression()

estimators = [('hof', hof), ('clf', clf)]
pl = Pipeline(estimators)

In [138]:
# Grid search over C of the pipeline's 'clf' step, using 10 log-spaced values between 0.1 and 10.
gs = GridSearchCV(pl, param_grid={'clf__C': np.logspace(-1, 1, 10)}, scoring=scoring)


In [141]:
# NOTE(review): X_train_ohe is the already one-hot encoded sparse matrix, but the
# pipeline starts with HighOrderFeatures, whose concatination_features helper
# calls .tolist() on column slices -- this looks like the source of the
# AttributeError shown below. The pipeline probably needs the raw categorical
# X_train instead; confirm.
gs.fit(X_train_ohe, y_train)

AttributeError: tolist not found