In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the train and test tables from gzip-compressed CSV files
# (pandas infers the compression from the .gz extension).
train = pd.read_csv('../input/train.csv.gz')
test = pd.read_csv('../input/test.csv.gz')

In [None]:
# Load the companion "checks" tables; they are joined to train/test on `check_id` below.
train_checks = pd.read_csv('../input/train_checks.csv.gz')
test_checks = pd.read_csv('../input/test_checks.csv.gz')

In [None]:
# Left-join the check tables onto train/test by `check_id`: every row of
# train/test is kept even when no matching check row exists.
train = train.merge(train_checks, on = 'check_id', how = 'left')
test = test.merge(test_checks, on = 'check_id', how = 'left')

In [None]:
# Sanity check after the merge: shapes and column names of the merged train
# frame and of the check table it was joined with.
print(train.shape, train_checks.shape)
print(train.columns.values)
print(train_checks.columns.values)

In [None]:
# Same sanity check for the merged test frame.
print(test.shape, test_checks.shape)
print(test.columns.values)

In [None]:
# Replace every missing value (in all columns) with an empty string, so the
# text vectorizer below never receives NaN. Rebinding the names keeps the
# cell free of in-place mutation.
train = train.fillna('')
test = test.fillna('')

In [None]:
# Load the external catalog table; its `description` and `category` columns
# are used further down to train a second text model.
catalog = pd.read_csv('../input/catalog2.csv.gz')
catalog.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words token counts with default CountVectorizer settings.
vectorizer = CountVectorizer()

# The vocabulary is learned from the train names only; test names are encoded
# with that same vocabulary.
X = vectorizer.fit_transform(train['name'])
X_test = vectorizer.transform(test['name'])

In [None]:
# Encode catalog descriptions with the vocabulary fitted on train names;
# missing descriptions are treated as empty strings.
X_catalog = vectorizer.transform(catalog.description.fillna(""))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Separate encoder for the catalog's own category set (integer codes
# 0..n_catalog_classes-1); it is independent of the train label encoder.
catalog_labeler = LabelEncoder()
y_catalog = catalog_labeler.fit_transform(catalog.category)

In [None]:
# Display (n_train_rows, vocabulary_size) of the train feature matrix.
X.shape

In [None]:
# NOTE(review): LabelEncoder is already imported in an earlier cell; this
# repeated import is harmless but redundant.
from sklearn.preprocessing import LabelEncoder

# Encoder for the train target categories; fitted in the next cell.
labeler = LabelEncoder()

In [None]:
# Integer-encode the train categories; `labeler.classes_` keeps the mapping
# back to the original category names (used when writing the submission).
y = labeler.fit_transform(train.category)

# Часто бывает полезно загрубить предсказания, чтобы не получать большие штрафы в случае ошибок; такое загрубление называется clipping

In [None]:
from sklearn.metrics import log_loss, make_scorer

# Predicted probabilities are clamped to [clipping, 1 - clipping] so that a
# confident wrong prediction cannot produce an unbounded log-loss penalty.
clipping = 0.001


def clipped_log_loss(estimator, X, y):
    """Scorer returning the negated log loss of clipped class probabilities.

    Written as a plain ``scorer(estimator, X, y)`` callable instead of
    ``make_scorer(log_loss, eps=..., needs_proba=True)``: the ``eps`` keyword
    of ``log_loss`` and the ``needs_proba`` keyword of ``make_scorer`` are
    deprecated and removed in newer scikit-learn releases, whereas scorer
    callables are accepted by ``cross_val_score`` in every release.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba`` and ``classes_``.
    X : feature matrix of the validation rows.
    y : true (encoded) labels of the validation rows.

    Returns
    -------
    float
        ``-log_loss``; the sign is flipped because scikit-learn treats larger
        scores as better. Callers negate the mean again to report the loss.
    """
    # Clip, then renormalise each row to sum to one (the same sequence the
    # former ``log_loss(..., eps=clipping)`` call performed internally).
    proba = np.clip(estimator.predict_proba(X), clipping, 1.0 - clipping)
    proba = proba / proba.sum(axis=1, keepdims=True)
    # Passing the estimator's class list keeps the column/label alignment
    # valid even when a validation fold does not contain every class.
    return -log_loss(y, proba, labels=estimator.classes_)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score


In [None]:
# Candidate regularisation strengths: 1, 10, 100, 1000.
# NOTE(review): `parameters` is not referenced by any later visible cell -
# presumably left over from a grid search; confirm before removing.
parameters = {
    'C' : np.logspace(0, 3, 4),
}

# Materialise the 4 group-wise folds once so that every later cross-validation
# call reuses exactly the same splits. Grouping by check_id keeps all rows of
# one check on the same side of each split.
gkf = list(GroupKFold(n_splits=4).split(X, y, train.check_id.values))
score = cross_val_score(LogisticRegression(C = 100), X, y, cv = gkf, scoring=clipped_log_loss)
# The scorer reports the loss with its sign flipped, so negate the mean to
# display the actual log loss.
-np.mean(score)

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Out-of-fold class probabilities of the logistic regression: each train row
# is predicted by the model that did not see it. These serve as meta-features.
X_meta = cross_val_predict(LogisticRegression(C = 100), X, y, cv=gkf, n_jobs = -1, method = 'predict_proba')

In [None]:
# Display (n_train_rows, n_probability_columns) of the meta-features.
X_meta.shape

# Мета-модель, уточняющая предсказания логистической регрессии

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted meta-model with 40 trees; all other settings are defaults.
xgb = XGBClassifier(n_estimators = 40)

# Cross-validate the meta-model on the out-of-fold probabilities, reusing the
# same group folds as the base model.
score = cross_val_score(xgb, X_meta, y, cv = gkf, scoring=clipped_log_loss)

In [None]:
# Mean log loss of the meta-model (negated because the scorer flips the sign).
-np.mean(score)

# Смесь предсказаний двух моделей с _разными_ наборами категорий через мета-модель

In [None]:
# Second base model: logistic regression trained on catalog descriptions to
# predict the catalog's own categories.
model_catalog = LogisticRegression(C = 100)
model_catalog.fit(X_catalog, y_catalog)

# Catalog-category probabilities for every train row. The catalog model never
# saw train rows, so no out-of-fold scheme is needed here.
X_meta_catalog = model_catalog.predict_proba(X)

In [None]:
# Display (n_train_rows, n_catalog_classes).
X_meta_catalog.shape

In [None]:
# Cross-validate the meta-model on both probability blocks placed side by side
# (train-category probabilities followed by catalog-category probabilities).
score = cross_val_score(xgb, np.hstack([X_meta, X_meta_catalog]), y, cv = gkf, scoring=clipped_log_loss)

# Mean log loss (negated because the scorer flips the sign).
-np.mean(score)

# Пример сабмита
## На train предсказываем на манер cross_val_predict, на test предсказываем моделями каждого из фолдов и усредняем

In [None]:
# One probability column per category known to `labeler`, rather than a
# hard-coded class count.
n_classes = len(labeler.classes_)
# Convert to CSR once; the original converted twice inside every fold.
X_csr = X.tocsr()


def _full_width_proba(fitted, features):
    """Return ``fitted.predict_proba(features)`` spread over ``n_classes`` columns.

    ``predict_proba`` only returns columns for the classes present in the
    fold the model was trained on (``fitted.classes_``). Because ``y`` holds
    LabelEncoder codes, those class values are also the column indices, so
    each returned column is written to its own position and any class missing
    from the training fold keeps probability 0.
    """
    proba = np.zeros((features.shape[0], n_classes))
    proba[:, fitted.classes_] = fitted.predict_proba(features)
    return proba


# Out-of-fold predictions for train; one prediction array per fold for test.
X_meta = np.zeros((X.shape[0], n_classes))
X_test_meta = []

for fold_i, (train_i, test_i) in enumerate(gkf):
    print(fold_i)
    model = LogisticRegression(C = 100)
    model.fit(X_csr[train_i], y[train_i])
    # Rows held out in this fold are predicted by a model that never saw them.
    X_meta[test_i, :] = _full_width_proba(model, X_csr[test_i])
    # Every fold model also predicts the full test set; averaged afterwards.
    X_test_meta.append(_full_width_proba(model, X_test))

In [None]:
# Stack the list of per-fold test predictions into a single array with a new
# leading fold axis, then display its shape.
X_test_meta = np.stack(X_test_meta)
X_test_meta.shape

In [None]:
# Average the test predictions over the fold axis (axis 0).
X_test_meta_mean = np.mean(X_test_meta, axis = 0)

In [None]:
# Display the shape of the fold-averaged test predictions.
X_test_meta_mean.shape

# Собираем предсказания разных моделей в мета-признаки на train и test

In [None]:
# Append the catalog-model probabilities to the out-of-fold probabilities.
# NOTE: this rebinds X_meta from itself, so re-running the cell without
# re-running the fold loop appends the catalog columns a second time.
X_meta = np.hstack([X_meta, X_meta_catalog])

In [None]:
# Catalog-category probabilities for the test rows. `X_test` already holds
# `vectorizer.transform(test['name'])`, so it is reused instead of vectorizing
# the test names a second time (mirrors how `X` is reused for the train rows).
X_test_meta_catalog = model_catalog.predict_proba(X_test)

In [None]:
# Test meta-features in the same column order as the train meta-features:
# fold-averaged base-model probabilities, then catalog-model probabilities.
# This rebinds X_test_meta from the per-fold stack to the final 2-D matrix.
X_test_meta = np.hstack([X_test_meta_mean, X_test_meta_catalog])

In [None]:
# Fit the meta-model on all train meta-features.
xgb.fit(X_meta, y)

In [None]:
# Class probabilities for the test rows from the fitted meta-model.
p_test = xgb.predict_proba(X_test_meta)

In [None]:
def form_predictions(p):
    """Format each value of the iterable `p` as a string with six decimals.

    Returns a list of strings in the same order as the input values.
    """
    formatted = []
    for value in p:
        formatted.append('%.6f' % value)
    return formatted

In [None]:
# Take an explicit copy of the id column: the probability columns are then
# added to an independent frame rather than to a slice of `test`, which
# avoids pandas' SettingWithCopyWarning.
test_submission = test[['id']].copy()

# One output column per original category name, in the encoder's class order
# (which is also the column order of `p_test`).
for i, c in enumerate(labeler.classes_):
    # Clamp to [clipping, 1 - clipping] to bound the log-loss penalty.
    # np.clip returns a new array, so `p_test` itself is left unmodified
    # (assigning into the `p_test[:, i]` view would write through to it).
    p = np.clip(p_test[:, i], clipping, 1.0 - clipping)
    test_submission[c] = form_predictions(p)

In [None]:
# Write the submission as a gzip-compressed CSV without the index column.
test_submission.to_csv('meta_model_extended.csv.gz', compression='gzip', index = False)