In [1]:
from collections import Counter

import pandas as pd
import numpy as np
import scipy

from statsmodels.formula.api import ols

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output

%matplotlib inline

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, label_binarize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD

from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit, ShuffleSplit

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb

In [3]:
from libscores import pac_metric
import pickle

In [4]:
def read_info(file_name):
    """Parse a ``key = value`` metadata file into a dict.

    Each non-blank line is split on its first ``=``.  Keys and values are
    stripped of surrounding whitespace, values are additionally stripped of
    surrounding single quotes, and values made only of digits are converted
    to ``int``.  If a key occurs more than once, the last occurrence wins.

    Parameters
    ----------
    file_name : str
        Path of the metadata file.

    Returns
    -------
    dict
        Mapping of key to ``int`` or ``str`` value.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``=``.
    """
    result = {}

    # `with open(...)` closes the handle deterministically and, unlike the
    # Python-2-only `file()` builtin, also works on Python 3.
    with open(file_name) as source:
        for line_no, line in enumerate(source, 1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            if '=' not in line:
                raise ValueError("%s:%d: expected 'key = value', got %r"
                                 % (file_name, line_no, line))

            # Split on the first '=' only so values may contain '=' themselves.
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip().strip("'")
            if value.isdigit():
                value = int(value)
            result[key] = value

    return result

In [5]:
# Parse the dataset's public metadata file into a dict via read_info.
public = read_info('data/set4_tania/tania_public.info')
# Bare expression: the notebook renders the dict as this cell's output.
public

{'feat_num': 47236,
 'feat_type': 'Numerical',
 'has_categorical': 0,
 'has_missing': 0,
 'is_sparse': 1,
 'label_num': 95,
 'metric': 'pac_metric',
 'name': 'tania',
 'target_num': 95,
 'target_type': 'Binary',
 'task': 'multilabel.classification',
 'test_num': 44635,
 'time_budget': 1200,
 'train_num': 157599,
 'usage': 'AutoML challenge 2014',
 'valid_num': 22514}

In [6]:
# Number of feature columns as declared in the metadata; read_sparse_features
# reads this module-level value to size the matrices it builds.
dim = public['feat_num']

def read_sparse_features(fine_name, n_features=None):
    """Load a sparse feature file into a CSR matrix.

    Every line of the file is one row.  A row is a whitespace-separated list
    of ``pos:value`` tokens where ``pos`` is a 1-based column number (it is
    shifted to 0-based here).  A blank line is a row with no stored entries.
    If a position is repeated within a row, the last value wins; zero values
    are not stored.

    Parameters
    ----------
    fine_name : str
        Path of the feature file.
    n_features : int, optional
        Number of columns of the result.  Defaults to the module-level
        ``dim`` when omitted, which is what the original implementation
        always used.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float64 matrix of shape ``(number_of_lines, n_features)``.

    Raises
    ------
    IndexError
        If a token's column number falls outside ``1..n_features``.
    ValueError
        If a token is not of the form ``pos:value``.
    """
    # `import scipy` alone does not guarantee the `sparse` submodule is loaded.
    import scipy.sparse

    if n_features is None:
        n_features = dim

    # Build the CSR arrays directly instead of filling a DOK matrix one cell
    # at a time, which is far slower on large files.
    data, indices, indptr = [], [], [0]

    with open(fine_name) as source:
        for line_no, line in enumerate(source, 1):
            row = {}
            # split() with no argument tolerates repeated whitespace and
            # yields no tokens for a blank line (an all-zero row).
            for el in line.split():
                pos, value = el.split(':')
                pos = int(pos) - 1
                if not 0 <= pos < n_features:
                    raise IndexError(
                        '%s:%d: column %d is outside 1..%d'
                        % (fine_name, line_no, pos + 1, n_features))
                row[pos] = float(value)

            for pos in sorted(row):
                if row[pos] != 0.0:
                    indices.append(pos)
                    data.append(row[pos])
            indptr.append(len(indices))

    return scipy.sparse.csr_matrix(
        (np.asarray(data, dtype=np.float64),
         np.asarray(indices, dtype=np.intp),
         np.asarray(indptr, dtype=np.intp)),
        shape=(len(indptr) - 1, n_features))

In [7]:
def read_multilabels(file_name):
    """Load a multilabel target file into a CSR matrix.

    Every line of the file is one row of whitespace-separated integers.

    Parameters
    ----------
    file_name : str
        Path of the label file.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with one row per line of the file.

    Raises
    ------
    ValueError
        If a line is blank or contains a token that is not an integer.
    """
    # `import scipy` alone does not guarantee the `sparse` submodule is loaded.
    import scipy.sparse

    rows = []
    # `with open(...)` closes the handle and works on Python 2 and 3, unlike
    # the Python-2-only `file()` builtin.
    with open(file_name) as source:
        for line_no, line in enumerate(source, 1):
            # split() with no argument tolerates repeated or trailing spaces.
            tokens = line.split()
            if not tokens:
                # Skipping the line would silently misalign labels and samples.
                raise ValueError('%s:%d: blank line in label file'
                                 % (file_name, line_no))
            rows.append([int(tok) for tok in tokens])

    return scipy.sparse.csr_matrix(rows)

In [8]:
# Load the parsed matrices from the pickle cache when it exists; otherwise
# parse the raw text files once and write the cache for later runs.
# Previously the cache was loaded and then immediately overwritten by a full
# re-parse, so it saved no time and the cell failed when no cache existed yet.
# Delete data/set4_tania/data.pickle to force a re-parse after the raw files change.
try:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open('data/set4_tania/data.pickle', 'rb') as cache:
        X, y, X_valid, X_test = pickle.load(cache)
except IOError:
    X = read_sparse_features('data/set4_tania/tania_train.data')
    y = read_multilabels('data/set4_tania/tania_train.solution')

    X_valid = read_sparse_features('data/set4_tania/tania_valid.data')
    X_test  = read_sparse_features('data/set4_tania/tania_test.data')

    with open('data/set4_tania/data.pickle', 'wb') as cache:
        pickle.dump([X, y, X_valid, X_test], cache)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
# NOTE(review): y is a sparse matrix here; confirm np.unique(y) reports what
# was intended.
print('%s %s %s' % (X.shape, y.shape, np.unique(y).shape))
print('%s %s' % (X_valid.shape, X_test.shape))

In [9]:
# Count, for every feature column, how many training rows have a non-zero
# entry in it.
rows, cols = X.nonzero()
# minlength keeps the count vector exactly as long as X has columns;
# without it np.bincount stops at the highest column index that occurs, so
# the mask below would be too short if the last columns of X are empty.
per_col_count = np.bincount(cols, minlength=X.shape[1])

# Keep only columns that are non-zero in more than 5 training rows.  The
# mask is derived from the training matrix alone and applied unchanged to
# the validation and test matrices so all three keep the same columns.
keep_cols = per_col_count > 5

X = X[:, keep_cols]
X_valid = X_valid[:, keep_cols]
X_test = X_test[:, keep_cols]

In [10]:
# Fit the scaler on the training matrix only, then apply the same scaling to
# all three matrices.  with_mean=False disables centering and copy=False asks
# the scaler to work in place where it can.
scaler = StandardScaler(copy=False, with_mean=False)
scaler.fit(X)

X = scaler.transform(X)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [11]:
# Alias (not a copy) of the training matrix; the model-fitting cells below
# read their features through this name.
X_input = X

In [12]:
def column(y, n):
    """Return column ``n`` of the sparse matrix ``y`` as a flat dense array."""
    single_col = y[:, n]
    dense = single_col.toarray()
    return dense.ravel()

In [19]:
# One L1-penalised logistic regression is fitted per label column and scored
# over several random train/hold-out splits.
base_model = LogisticRegression(penalty='l1', C=0.03)

n_iter = 5
n, y_dim = y.shape
# Each split trains on 15000 rows; random_state pins the splits.
cv = ShuffleSplit(n, n_iter=n_iter, train_size=15000, random_state=1)

scores = []
for iter_no, (train, test) in enumerate(cv):
    print('iteration %d' % iter_no)

    # Slice the sparse matrices once per split.  Doing this inside the
    # per-label loop repeated the same row selection for every label.
    X_train, X_holdout = X_input[train], X_input[test]
    y_train, y_holdout = y[train], y[test]

    y_pred = np.zeros(y_holdout.shape)
    for i in tqdm(range(y_dim)):
        y_i = column(y_train, i)
        base_model.fit(X_train, y_i)
        # Column 1 of predict_proba is the probability of the second class
        # (presumably label == 1 for 0/1 targets - confirm).
        y_pred[:, i] = base_model.predict_proba(X_holdout)[:, 1]

    score = pac_metric(y_holdout.toarray(), y_pred, task='multilabel.classification')
    print('score %0.5f' % score)
    scores.append(score)

# Mean and standard deviation of the per-split scores.  Single-argument
# print(...) produces the same text on Python 2 and Python 3.
print('%s %s' % (np.mean(scores), np.std(scores)))

iteration 0




score 0.59379
iteration 1




score 0.59561
iteration 2




score 0.59554
iteration 3




score 0.60041
iteration 4




score 0.59904
0.596877948831 0.0024538743718


In [20]:
# Refit one model per label on the whole training matrix and collect the
# predicted probabilities for the validation and test matrices.
y_valid_pred = np.zeros((X_valid.shape[0], y_dim))
y_test_pred = np.zeros((X_test.shape[0], y_dim))

for i in tqdm(range(y_dim)):
    y_i = column(y, i)
    base_model.fit(X_input, y_i)

    # Column 1 of predict_proba is the probability of the second class.
    valid_proba = base_model.predict_proba(X_valid)
    test_proba = base_model.predict_proba(X_test)
    y_valid_pred[:, i] = valid_proba[:, 1]
    y_test_pred[:, i] = test_proba[:, 1]



In [21]:
# Write the prediction matrices as plain text, one row per sample, with ten
# decimal places per value.
np.savetxt(fname='submission/tania_valid.predict', X=y_valid_pred, fmt='%0.10f')
np.savetxt(fname='submission/tania_test.predict', X=y_test_pred, fmt='%0.10f')