In [1]:
from collections import Counter
from functools import partial

import pandas as pd
import numpy as np
import scipy

from statsmodels.formula.api import ols

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output

%matplotlib inline

In [2]:
from sklearn.cross_validation import StratifiedShuffleSplit, ShuffleSplit
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import TruncatedSVD

In [3]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.regularizers import WeightRegularizer, l1, l2
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU, ELU, LeakyReLU
from keras.callbacks import EarlyStopping, Callback

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [4]:
from libscores import pac_metric
import pickle

In [5]:
class WatchlistCallback(Callback):
    def __init__(self, watchlist, eval_metric, epoch_no=10):
        super(Callback, self).__init__()

        self.epoch_no = epoch_no
        self.X, self.y = watchlist
        self.eval_metric = eval_metric
        self.scores = []
        self.epochs = []

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.epoch_no == 0:
            y_pred = self.model.predict(self.X, verbose=0)
            score = self.eval_metric(self.y, y_pred)
            self.scores.append(score)
            self.epochs.append(epoch)

            print "score: %0.6f (epoch no %d)" % (score, epoch)  

class StatusCallback(Callback):    
    def __init__(self, epoch_total):
        super(Callback, self).__init__()
        self.progress = tqdm(total=epoch_total)
        
    def on_epoch_end(self, epoch, logs={}):
        self.progress.update(1)

    def __enter__(self): 
        return self

    def __exit__(self, type, value, traceback):
        self.progress.close()

In [6]:
def read_info(file_name):
    result = []
    
    for line in file(file_name):
        key, value = line.strip().split('=')
        key = key.strip()
        value = value.strip().strip("'")
        if value.isdigit():
            value = int(value)
        result.append((key, value))
    
    return dict(result)

In [7]:
public = read_info('data/set4_tania/tania_public.info')
public

{'feat_num': 47236,
 'feat_type': 'Numerical',
 'has_categorical': 0,
 'has_missing': 0,
 'is_sparse': 1,
 'label_num': 95,
 'metric': 'pac_metric',
 'name': 'tania',
 'target_num': 95,
 'target_type': 'Binary',
 'task': 'multilabel.classification',
 'test_num': 44635,
 'time_budget': 1200,
 'train_num': 157599,
 'usage': 'AutoML challenge 2014',
 'valid_num': 22514}

In [8]:
dim = public['feat_num']

def read_sparse_features(fine_name):
    result = list()
    for line in file(fine_name):
        row = []
        for el in line.strip().split(' '): 
            pos, value = el.split(':')
            pos = int(pos) - 1
            row.append((pos, float(value)))
        result.append(row)

    rnum = len(result)
    X = scipy.sparse.dok_matrix((rnum, dim), dtype=np.float)
    for idx, row in enumerate(result):
        for pos, val in row:
            X[idx, pos] = val

    return scipy.sparse.csr_matrix(X)

In [9]:
def read_multilabels(file_name):
    result = list()
    for line in file(file_name):
        row = [int(f) for f in line.strip().split(' ')]
        result.append(row)
    return scipy.sparse.csr_matrix(result)

In [10]:
X, y, X_valid, X_test = pickle.load(open('data/set4_tania/data.pickle', 'rb'))

X = read_sparse_features('data/set4_tania/tania_train.data')
y = read_multilabels('data/set4_tania/tania_train.solution')

X_valid = read_sparse_features('data/set4_tania/tania_valid.data')
X_test  = read_sparse_features('data/set4_tania/tania_test.data')

print X.shape, y.shape, np.unique(y).shape
print X_valid.shape, X_test.shape

pickle.dump([X, y, X_valid, X_test], open('data/set4_tania/data.pickle', 'wb'))

In [11]:
rows, cols = X.nonzero()
per_col_count = np.bincount(cols)

X = X[:, per_col_count > 5]
X_valid = X_valid[:, per_col_count > 5]
X_test = X_test[:, per_col_count > 5]

In [12]:
svd = TruncatedSVD(n_components=500, random_state=123)
X = svd.fit_transform(X)
X_valid = svd.transform(X_valid)
X_test = svd.transform(X_test)

In [13]:
scaler = StandardScaler(with_mean=True, copy=False)
X = scaler.fit_transform(X)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [14]:
train_size = 0.8
EPOCHS = 1500
n, y_dim = y.shape

cv = ShuffleSplit(n, n_iter=1, train_size=train_size, random_state=1)
train, test = next(cv.__iter__())

In [15]:
X_train_svd = X[train]
X_test_svd = X[test]

In [16]:
model = Sequential()

model.add(Dense(input_dim=X_train_svd.shape[1], output_dim=300, init='uniform')) 
model.add(Activation('tanh')) 
model.add(Dropout(0.2)) 
model.add(Dense(input_dim=300, output_dim=150, init='uniform')) 
model.add(Activation('sigmoid')) 
model.add(Dropout(0.1)) 
model.add(Dense(output_dim=y.shape[1], init='uniform')) 
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='adagrad')

eval_metric = partial(pac_metric, task='multilabel.classification')
# score = pac_metric(y[test].toarray(), y_pred, task='multilabel.classification')

watchlist = WatchlistCallback(watchlist=(X_test_svd, y[test].toarray()), 
                              eval_metric=eval_metric, epoch_no=1)

In [17]:
model.fit(X_train_svd, y[train].toarray(), nb_epoch=EPOCHS, batch_size=10000, callbacks=[watchlist],
          verbose=0)

score: -0.458257 (epoch no 0)
score: -0.045074 (epoch no 1)
score: 0.218930 (epoch no 2)
score: 0.371278 (epoch no 3)
score: 0.454003 (epoch no 4)
score: 0.457272 (epoch no 5)
score: 0.449189 (epoch no 6)
score: 0.448582 (epoch no 7)
score: 0.451857 (epoch no 8)
score: 0.457130 (epoch no 9)
score: 0.462126 (epoch no 10)
score: 0.468638 (epoch no 11)
score: 0.476160 (epoch no 12)
score: 0.482081 (epoch no 13)
score: 0.487642 (epoch no 14)
score: 0.494570 (epoch no 15)
score: 0.500458 (epoch no 16)
score: 0.506436 (epoch no 17)
score: 0.511700 (epoch no 18)
score: 0.516824 (epoch no 19)
score: 0.522586 (epoch no 20)
score: 0.527302 (epoch no 21)
score: 0.532023 (epoch no 22)
score: 0.536366 (epoch no 23)
score: 0.540775 (epoch no 24)
score: 0.544793 (epoch no 25)
score: 0.547970 (epoch no 26)
score: 0.552781 (epoch no 27)
score: 0.555928 (epoch no 28)
score: 0.559756 (epoch no 29)
score: 0.563463 (epoch no 30)
score: 0.566471 (epoch no 31)
score: 0.569765 (epoch no 32)
score: 0.572619 (e

KeyboardInterrupt: 

In [18]:
EPOCHS = 600

model = Sequential()

model.add(Dense(input_dim=X_train_svd.shape[1], output_dim=300, init='uniform')) 
model.add(Activation('tanh')) 
model.add(Dropout(0.2)) 
model.add(Dense(input_dim=300, output_dim=150, init='uniform')) 
model.add(Activation('sigmoid')) 
model.add(Dropout(0.1)) 
model.add(Dense(output_dim=y.shape[1], init='uniform')) 
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='adagrad')

with StatusCallback(epoch_total=EPOCHS) as status:
    model.fit(X, y.toarray(), nb_epoch=EPOCHS, batch_size=10000, callbacks=[status], verbose=0)



In [19]:
y_valid_pred = model.predict(X_valid)
y_test_pred = model.predict(X_test)

In [20]:
np.savetxt('submission/tania_valid.predict', y_valid_pred, fmt='%0.10f')
np.savetxt('submission/tania_test.predict', y_test_pred, fmt='%0.10f')