In [1]:
from collections import Counter

import pandas as pd
import numpy as np
import scipy

from statsmodels.formula.api import ols

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output

%matplotlib inline

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, label_binarize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD

from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

In [3]:
from libscores import pac_metric
import pickle

In [4]:
def read_info(file_name):
    result = []
    
    for line in file(file_name):
        key, value = line.strip().split('=')
        key = key.strip()
        value = value.strip().strip("'")
        if value.isdigit():
            value = int(value)
        result.append((key, value))
    
    return dict(result)

In [5]:
public = read_info('data/set4_tania/tania_public.info')
public

{'feat_num': 47236,
 'feat_type': 'Numerical',
 'has_categorical': 0,
 'has_missing': 0,
 'is_sparse': 1,
 'label_num': 95,
 'metric': 'pac_metric',
 'name': 'tania',
 'target_num': 95,
 'target_type': 'Binary',
 'task': 'multilabel.classification',
 'test_num': 44635,
 'time_budget': 1200,
 'train_num': 157599,
 'usage': 'AutoML challenge 2014',
 'valid_num': 22514}

In [6]:
dim = public['feat_num']

def read_sparse_features(fine_name):
    result = list()
    for line in file(fine_name):
        row = []
        for el in line.strip().split(' '): 
            pos, value = el.split(':')
            pos = int(pos) - 1
            row.append((pos, float(value)))
        result.append(row)

    rnum = len(result)
    X = scipy.sparse.dok_matrix((rnum, dim), dtype=np.float)
    for idx, row in enumerate(result):
        for pos, val in row:
            X[idx, pos] = val

    return scipy.sparse.csr_matrix(X)

In [7]:
def read_multilabels(file_name):
    result = list()
    for line in file(file_name):
        row = [f for f in line.strip().split(' ')]
        result.append(row.index('1'))
    return np.array(result)

In [8]:
X, y, X_valid, X_test = pickle.load(open('data/set4_tania/data.pickle', 'rb'))

X = read_sparse_features('data/set4_tania/tania_train.data')
y = read_multilabels('data/set4_tania/tania_train.solution')

X_valid = read_sparse_features('data/set4_tania/tania_valid.data')
X_test  = read_sparse_features('data/set4_tania/tania_test.data')

print X.shape, y.shape, np.unique(y).shape
print X_valid.shape, X_test.shape

pickle.dump([X, y, X_valid, X_test], open('data/set4_tania/data.pickle', 'wb'))

In [9]:
classes = np.unique(y)

In [10]:
rows, cols = X.nonzero()
per_col_count = np.bincount(cols)

X = X[:, per_col_count > 5]
X_valid = X_valid[:, per_col_count > 5]
X_test = X_test[:, per_col_count > 5]

In [11]:
scaler = StandardScaler(with_mean=False, copy=False)
X = scaler.fit_transform(X)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

Learning curves

In [None]:
n_iter = 5
cv = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=0.25, random_state=1)

scores = []
it = 1
for train, test in cv:
    print 'iteration %d' % it

    model = LogisticRegression(n_jobs=-1, penalty='l1', C=0.01)
    
    X_train = X[train]
    y_train = y[train]
    N, d = X_train.shape

    sample_size = list(range(10000, N, 10000))
    for j in tqdm(sample_size):
        X_sample, _, y_sample, _ = \
            train_test_split(X_train, y_train, train_size=j, random_state=1, stratify=y_train)
        model.fit(X_sample, y_sample)
        y_pred = model.predict_proba(X[test])

        score = pac_metric(label_binarize(y[test], classes=classes), y_pred)
        scores.append((it, j, score))

    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X[test])
    score = pac_metric(label_binarize(y[test], classes=classes), y_pred)
    scores.append((it, N, score))

    it = it + 1
clear_output()

iteration 1


 36%|███▋      | 4/11 [01:03<01:49, 15.71s/it]

In [22]:
learning

NameError: name 'learning' is not defined

In [24]:
learning = pd.DataFrame(scores, columns=['it', 'sample_size', 'score'])

for it, grp in learning.groupby('it'):
    plt.plot(grp.sample_size, grp.score)

mean_score = learning.groupby('sample_size').score.mean()
plt.plot(mean_score.index, mean_score.values, color='black', linewidth=2)

plt.title('Learning curves')
plt.xlabel('Sample size')
plt.ylabel('Score')

plt.show()

ValueError: Shape of passed values is (1, 5), indices imply (3, 5)

In [12]:
X_input = X

In [13]:
results = []

In [25]:
models = {
    'logreg': LogisticRegression(),
}

params = {
    'cv_train_size': 8000,
    'model': 'logreg'
}

pipeline = Pipeline([
    ('model', models[params['model']])
])

pipe_params = {
    'model__penalty': 'l1',
    'model__C': 5,
}
pipeline.set_params(**pipe_params)

n_iter = 5
cv = StratifiedShuffleSplit(y, n_iter=n_iter, train_size=params['cv_train_size'], random_state=1)

scores = []
for train, test in tqdm(cv):
    pipeline.fit(X_input[train], y[train])
    y_pred = pipeline.predict_proba(X_input[test])
    score = pac_metric(label_binarize(y[test], classes=classes), y_pred)
    scores.append(score)

params.update(pipe_params)
results.append((params, np.mean(score), np.std(score)))
print np.mean(score), np.std(score)



0.327397312688 0.0


In [26]:
for p, s, std in reversed(results[-5:]):
    print u'score: %0.5f ± %0.5f, params: %s' % (s, std, p)

score: 0.32740 ± 0.00000, params: {'model__C': 5, 'model': 'logreg', 'model__penalty': 'l1', 'cv_train_size': 8000}
score: 0.28432 ± 0.00000, params: {'model__C': 10, 'model': 'logreg', 'model__penalty': 'l1', 'cv_train_size': 8000}
score: 0.43695 ± 0.00000, params: {'model__C': 1, 'model': 'logreg', 'model__penalty': 'l1', 'cv_train_size': 8000}
score: 0.32191 ± 0.00000, params: {'model__C': 0.01, 'model': 'logreg', 'model__penalty': 'l1', 'cv_train_size': 8000}


In [None]:
pipeline.fit(X, y)

In [46]:
y_valid_score = pipeline.predict_proba(X_valid)[:, 1]
y_test_score = pipeline.predict_proba(X_test)[:, 1]


In [48]:
np.savetxt('submission/helena_valid.predict', y_valid_score, fmt='%0.18f')
np.savetxt('submission/helena_test.predict', y_test_score, fmt='%0.18f')