In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy
from sklearn.metrics import roc_auc_score

In [3]:
from load_problems import load_problem_flight

In [4]:
# trainX, testX, trainY, testY = load_problem_flight(large=True, convert_to_ints=False)

In [5]:
# Load the large flight problem with convert_to_ints=True and unpack the
# train/test features and targets returned by the loader.
trainX, testX, trainY, testY = load_problem_flight(
    large=True,
    convert_to_ints=True,
)

In [6]:
# Add a constant all-zero integer column to both feature frames.
# `.ix` is deprecated (and removed in later pandas releases); `.loc` performs
# the same label-based column assignment.
trainX.loc[:, 'zeros'] = numpy.zeros(len(trainX), dtype=int)
testX.loc[:, 'zeros'] = numpy.zeros(len(testX), dtype=int)

In [7]:
# Preview the first rows of the training features.
trainX.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,zeros
0,8,3,4,0,15,37,29,1,0
1,6,16,3,8,1,149,157,9,0
2,3,13,7,0,13,29,37,1,0
3,11,27,6,4,14,80,198,7,0
4,7,19,2,1,12,216,155,9,0


In [8]:
# trainX['answer'] = trainY

# grouping = trainX[:100000].groupby(['Origin', 'Dest', 'UniqueCarrier'])

In [9]:
from hep_ml.losses import LogLossFunction
from scipy.special import expit

class CategoricalNN:
    """Single-hidden-layer model over integer-coded categorical features.

    The decision function computed by ``compute_all`` is::

        z[i, u] = sum over columns c of cat_weights[c][X[i, c], u]
        f[i]    = sum over units u of unit_weights[u] * activation(z[i, u])
                  + sum over columns c of cat_biases[c][X[i, c]]

    Parameters are trained by coordinate-wise, regularised Newton-like steps
    on ``LogLossFunction`` (see ``make_step``).

    Parameters
    ----------
    n_units : number of hidden units.
    regularization : added to every step denominator and used to shrink the
        per-category biases and weights towards zero in the step numerators.
    batch_size : maximum number of rows used per training iteration.
    n_iterations : number of training iterations (one batch each).
    lrate : multiplier applied to the per-category weight steps.
    """

    def __init__(self, n_units=10, regularization=100., batch_size=50000, n_iterations=10, lrate=1.0):
        self.n_units = n_units
        self.n_iterations = n_iterations
        self.regularization = regularization
        self.batch_size = batch_size
        self.lrate = lrate

    def get_batch_slice(self, X_cat, batch):
        """Return a slice selecting ``batch`` contiguous rows at a random offset.

        The start is drawn uniformly from ``0 .. len(X_cat) - batch`` inclusive
        (``randint``'s upper bound is exclusive, hence the ``+ 1``).
        """
        start = numpy.random.randint(0, len(X_cat) - batch + 1)
        return slice(start, start + batch)

    def fit(self, X_cat, y):
        """Randomly initialise all parameters and run ``n_iterations`` steps.

        X_cat : 2-D array-like of non-negative integer category codes (they are
            used both as array indices and as ``numpy.bincount`` input).
        y : targets handed to ``LogLossFunction.fit``; must support slicing
            with a ``slice`` and the arithmetic ``y * 0 + 1.``.

        Lookup tables are sized from the largest code seen in each column here,
        so larger codes passed later to ``compute_all`` raise ``IndexError``.
        Returns ``self``.
        """
        X_cat = numpy.array(X_cat)
        self.unit_weights = numpy.random.normal(size=self.n_units)
        self.cat_biases = []
        self.cat_weights = []
        for column in X_cat.T:
            n_categories = numpy.max(column) + 1
            # Small random initial values (standard normal scaled by 0.1).
            self.cat_biases.append(numpy.random.normal(size=[n_categories]) * 0.1)
            self.cat_weights.append(numpy.random.normal(size=[n_categories, self.n_units]) * 0.1)

        batch = min(self.batch_size, len(X_cat))

        # Training process: every iteration works on one random contiguous batch.
        for iteration in range(self.n_iterations):
            batch_slice = self.get_batch_slice(X_cat, batch)
            # Single-argument print(...) behaves identically on Python 2 and 3.
            print(batch_slice)
            self.make_step(X_cat[batch_slice], y[batch_slice], iteration)

        return self

    def make_step(self, X_cat, y, iteration):
        """Run one pass of coordinate-wise updates on the given batch.

        Order of updates: every column's biases, then for each hidden unit the
        per-category weights of every column followed by that unit's output
        weight.  ``predictions`` and ``unit_predictions`` are updated
        incrementally and verified against a fresh ``compute_all`` at the end.
        ``iteration`` is only used in the progress output.
        """
        loss = LogLossFunction()
        # Unit sample weights built from y itself (y * 0 + 1.).
        loss.fit(X_cat, y, y * 0 + 1.)
        unit_predictions, predictions = self.compute_all(X_cat)

        # --- per-category biases -------------------------------------------
        for column in range(X_cat.shape[1]):
            inds = X_cat[:, column]
            max_cats = self.cat_biases[column].shape[0]

            grads = loss.negative_gradient(predictions)
            hesss = loss.hessian(predictions)
            # Sum gradient / hessian per category; minlength keeps categories
            # absent from this batch aligned with the parameter table.
            nominator = numpy.bincount(inds, weights=grads, minlength=max_cats)
            nominator -= self.regularization * self.cat_biases[column]
            denominator = numpy.bincount(inds, weights=hesss, minlength=max_cats) + self.regularization
            bias_steps = nominator / denominator
            predictions += bias_steps[inds]
            self.cat_biases[column] += bias_steps

        # --- hidden units ---------------------------------------------------
        for unit in range(self.n_units):
            for column in range(X_cat.shape[1]):
                inds = X_cat[:, column]
                max_cats = self.cat_weights[column].shape[0]
                unit_outputs, unit_derivs, unit_hesss = self.act_grad_hess(unit_predictions[:, unit])

                unit_weight = self.unit_weights[unit]
                grads = loss.negative_gradient(predictions) * unit_weight
                hesss = loss.hessian(predictions) * unit_weight ** 2

                cat_grads = grads * unit_derivs
                # NOTE(review): if negative_gradient returns -dL/dpred, the exact
                # second derivative would subtract grads * unit_hesss here rather
                # than add it -- confirm against hep_ml before changing.
                cat_hesss = hesss * (unit_derivs ** 2) + grads * unit_hesss

                nominator = numpy.bincount(inds, weights=cat_grads, minlength=max_cats)
                nominator -= self.regularization * self.cat_weights[column][:, unit]
                # Negative curvature is clipped to zero so the denominator
                # never drops below the regularization value.
                denominator = numpy.bincount(inds, weights=cat_hesss.clip(0), minlength=max_cats) \
                    + self.regularization
                cat_steps = nominator / denominator
                cat_steps *= self.lrate

                # Swap this unit's old contribution to the predictions for the
                # new one; unit_outputs still holds the pre-update activation.
                predictions -= unit_weight * unit_outputs
                self.cat_weights[column][:, unit] += cat_steps
                unit_predictions[:, unit] += cat_steps[inds]
                unit_outputs = self.activation(unit_predictions[:, unit])
                predictions += unit_weight * unit_outputs

            # updating coefficient for unit (damped by the factor 0.5)
            grads = loss.negative_gradient(predictions)
            hesss = loss.hessian(predictions)
            unit_outputs = self.activation(unit_predictions[:, unit])
            nom = numpy.dot(grads, unit_outputs)
            denom = numpy.dot(hesss, unit_outputs ** 2) + self.regularization
            step = 0.5 * nom / denom
            self.unit_weights[unit] += step
            predictions += step * unit_outputs

            # %s formatting keeps the Python 2 `print a, b, c` output format.
            print('%s %s %s' % (iteration, unit, loss(predictions)))

        # Sanity check, done once per step: the incrementally maintained
        # predictions must match a forward pass with the updated parameters.
        new_unit_predictions, new_predictions = self.compute_all(X_cat)
        assert numpy.allclose(predictions, new_predictions)
        assert numpy.allclose(unit_predictions, new_unit_predictions)

    # Alternative activation (tanh), kept for experimentation:
#     def activation(self, unit_input):
#         return numpy.tanh(unit_input)

#     def act_grad_hess(self, unit_input):
#         unit_outputs = numpy.tanh(unit_input)
#         unit_derivs =  (1 - unit_outputs ** 2)
#         unit_hesss =  - 2 * unit_outputs * unit_derivs
#         return unit_outputs, unit_derivs, unit_hesss

    def activation(self, unit_input):
        """Hidden-unit activation: element-wise square."""
        return unit_input ** 2

    def act_grad_hess(self, unit_input):
        """Return activation, first and second derivative of the square activation."""
        unit_outputs = unit_input ** 2
        unit_derivs = 2 * unit_input
        # Constant 2, broadcast to the shape of the input.
        unit_hesss = 2. + 0 * unit_input
        return unit_outputs, unit_derivs, unit_hesss

    # Alternative activation (softplus), kept for experimentation:
#     def activation(self, unit_input):
#         return numpy.logaddexp(0, unit_input)

#     def act_grad_hess(self, unit_input):
#         unit_outputs = numpy.logaddexp(0, unit_input)
#         unit_derivs =  expit(unit_input)
#         unit_hesss =   unit_derivs * (1. - unit_derivs)
#         return unit_outputs, unit_derivs, unit_hesss

    def compute_all(self, X_cat):
        """Forward pass.

        Returns ``(unit_predictions, predictions)``: the pre-activation inputs
        of the hidden units, shape ``(len(X_cat), n_units)``, and the final
        decision values, one per row.
        """
        X_cat = numpy.array(X_cat)
        unit_predictions = numpy.zeros([len(X_cat), self.n_units])
        for column, column_weights in enumerate(self.cat_weights):
            for unit in range(self.n_units):
                unit_predictions[:, unit] += column_weights[X_cat[:, column], unit]
        predictions = self.activation(unit_predictions).dot(self.unit_weights)
        for column, column_weights in enumerate(self.cat_biases):
            predictions += column_weights[X_cat[:, column]]
        return unit_predictions, predictions

    def decision_function(self, X_cat):
        """Return the decision values of ``compute_all`` for the rows of ``X_cat``."""
        unit_predictions, predictions = self.compute_all(X_cat)
        return predictions

In [10]:
# x = numpy.array([1, 2, 3])
# print (expit(x + 1e-4) - expit(x)) * 1e4
# print expit(x) * (1 - expit(x))


In [11]:
# unit_predictions, predictions = clf.compute_all(trainX)

In [12]:
# for column in clf.activation(unit_predictions).T:
# for column in unit_predictions.T:
#     hist(column)
#     show()

In [13]:
# trainX.dtypes

In [17]:
# Model configuration for the training run in the next cell.
clf = CategoricalNN(
    n_units=30,
    regularization=100.,
    batch_size=1000000,
    n_iterations=100,
    lrate=1.1,
)

In [18]:
# fit() draws its initial weights and batch offsets from the global numpy RNG;
# seed it so the run is repeatable after a kernel restart.
numpy.random.seed(42)
clf.fit(trainX, trainY)

slice(4610610, 5610610, None)
0 0 663763.997497
0 1 663668.533605
0 2 655614.681279
0 3 654599.881558
0 4 652587.274424
0 5 651789.766696
0 6 648670.789061
0 7 648188.019143
0 8 647476.55002
0 9 647272.286132
0 10 646871.338525
0 11 645700.303163
0 12 644712.868242
0 13 644619.43853
0 14 643222.226464
0 15 638811.743402
0 16 638683.168135
0 17 637585.313417
0 18 635699.76317
0 19 635548.87921
0 20 634763.191386
0 21 633755.868729
0 22 631267.384467
0 23 631042.508151
0 24 630655.390601
0 25 627222.574157
0 26 626809.018041
0 27 626012.034776
0 28 625531.268311
0 29 625201.815954
slice(4856050, 5856050, None)
1 0 624859.591586
1 1 624792.389356
1 2 623348.203868
1 3 622919.431582
1 4 622326.359071
1 5 621980.524472
1 6 621103.218445
1 7 620817.273764
1 8 620572.705972
1 9 620398.122626
1 10 620173.407685
1 11 619742.278765
1 12 619460.758659
1 13 619359.055688
1 14 618818.234892
1 15 617930.477337
1 16 617818.509527
1 17 617433.029541
1 18 616846.187435
1 19 616767.175935
1 20 616481.95

KeyboardInterrupt: 

In [19]:
# Report the fitted model and its ROC AUC on the training and test sets.
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f23e57fd7e8>
0.751087133865
0.720943905257


In [48]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f568f2afe60>
0.735934133778
0.721468011263


In [44]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f568f2c95f0>
0.738086474224
0.721193077929


In [41]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f568f2c9170>
0.731771269637
0.72065038133


In [25]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f56aecf4248>
0.740230209945
0.720767266292


In [46]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f208f4667a0>
0.735496912711
0.718110408177


In [79]:
# 30 units, 15 iters

# Train / test ROC AUC for this configuration (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f625e4b0cf8>
0.754232554815
0.723084829554


In [76]:
# 30 units, 10 iters
# Train / test ROC AUC for this configuration (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f625e865638>
0.742562118649
0.724197014453


In [64]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f625e82c710>
0.726788577363
0.719745336362


In [64]:
# Train / test ROC AUC for the current model (Python 2 and 3 compatible prints).
print(clf)
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

<__main__.CategoricalNN instance at 0x7f625e82c710>
0.726788577363
0.719745336362


In [54]:
# Train ROC AUC, then test ROC AUC (Python 2 and 3 compatible prints).
print(roc_auc_score(trainY, clf.decision_function(trainX)))

print(roc_auc_score(testY, clf.decision_function(testX)))

0.73466349043
0.71752045488


In [41]:
# Train ROC AUC, then test ROC AUC (Python 2 and 3 compatible prints).
print(roc_auc_score(trainY, clf.decision_function(trainX)))

print(roc_auc_score(testY, clf.decision_function(testX)))

0.724158762186
0.719145020432


In [21]:
# Train ROC AUC, then test ROC AUC (Python 2 and 3 compatible prints).
print(roc_auc_score(trainY, clf.decision_function(trainX)))
print(roc_auc_score(testY, clf.decision_function(testX)))

0.73534970466121707

In [12]:
# %%time
# lengths = []
# counter = 0
# for a, b in grouping:
#     if len(b) > 2:
#         print a
#         print b.sort('DepTime')
#         counter += 1
#         if counter > 40:
#             break

In [None]:
# NOTE(review): in the visible notebook `lengths` is only assigned inside the
# commented-out cell above, so this raises NameError on a fresh kernel unless
# that cell is restored and run first.
lengths[:100]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Preview the training features that are about to be one-hot encoded.
trainX.head()

In [None]:
# Fit the one-hot encoder on the training features only (handle_unknown='ignore'
# lets transform accept categories not seen during fit), then encode both
# splits with that same fitted encoder.
encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX2, testX2 = (encoder.transform(features) for features in (trainX, testX))

In [None]:
# KNeighborsClassifier is not imported anywhere in the visible notebook, so
# import it here to keep this cell runnable on a fresh kernel.
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with default settings. Note that this rebinds
# `clf`, which previously referred to the CategoricalNN model.
clf = KNeighborsClassifier()

In [None]:
# Fit the classifier on the one-hot encoded training features.
clf.fit(trainX2, trainY)

In [None]:
# Class-probability estimates for the encoded test set.
# NOTE(review): for sklearn classifiers predict_proba normally returns one
# column per class, i.e. a 2-D array -- confirm before using it as a score.
preds = clf.predict_proba(testX2)

In [None]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; for a binary target
# roc_auc_score expects a 1-D score, so pass the second column only
# (presumably the positive class, per the order of clf.classes_ -- verify).
roc_auc_score(testY, preds[:, 1])

In [None]:
# One combined hash per training row:
#   hash(DayOfWeek) XOR (hash(UniqueCarrier) + hash(Distance))
hashes = numpy.bitwise_xor(
    trainX['DayOfWeek'].map(hash),
    trainX['UniqueCarrier'].map(hash) + trainX['Distance'].map(hash),
)

In [None]:
# Bucket the hashes modulo 100000 and accumulate, per bucket, the sum of the
# targets (positives) and the sum of their complements (negatives).
positives, negatives = (
    numpy.bincount(hashes % 100000, weights=bucket_weights)
    for bucket_weights in (trainY, 1 - trainY)
)

In [None]:
# Display the repr of the one-hot encoded training matrix.
trainX2

In [None]:
# DataFrame.sort() was deprecated and later removed from pandas; called with
# no arguments it ordered rows by index label, which sort_index() does directly.
# The sorted copy is only displayed; trainX itself is not modified.
trainX.sort_index()
# hashes