In [2]:
from multiprocessing.dummy import Pool, Queue
import lxml
import tldextract
from tqdm import tqdm
from time import sleep
from bs4 import BeautifulSoup
from bs4.element import Comment
from contextlib import ExitStack
from typing import Generator, Dict, Any
import gzip
import pandas as pd
import codecs
import sys
import os
import json
import re
import numpy as np
from os import listdir
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer, EnglishStemmer, RussianStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
regex_symbols = re.compile(r'[^a-zA-Z0-9а-яА-Я\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df]')
stopwords_new = ['http', 'url', 'img','html', 'https', 'org', 'www', 'jpg', 'png', 'net','com','php', 'uid','src', 'ahttp', 'index', 'htm']
pattern = re.compile(r'\b(' + r'|'.join(stopwords_new) + r')\b\s*')
shortword = re.compile(r'\W*\b\w{1,2}\b')
musor = re.compile(r'\d{1}(\w{2})\d{2,4}')

In [3]:
stemmer_ru = RussianStemmer()
stemmer_eng = EnglishStemmer()
stemmer_ger = GermanStemmer()

In [4]:
stop_words = set(stopwords.words(['english', 'russian', 'german']))

In [7]:
my_position_start = 1
my_position_end = 28027
queue = Queue() # очередь ссылок на книги
for i in [426, 1393, 3345, 3800, 6085, 24245]:
    queue.put(i)

In [8]:
#split title to words
def split_title(title):
    words = nltk.word_tokenize(title)
    without_extra_words = [stemmer_ru.stem(stemmer_eng.stem(stemmer_ger.stem(word))) for word in words if word not in stop_words]
    return without_extra_words

In [9]:
#split text to words
def split_text(text):
    words = nltk.word_tokenize(text)
    without_extra_words = [stemmer_ru.stem(stemmer_eng.stem(stemmer_ger.stem(word))) for word in words if word not in stop_words]
    return without_extra_words

In [10]:
#returns words which title contains and text contains
def get_content(title, text, n=None):
    title_words = split_title(title)
    text_words = split_text(text)
    if n is not None:
        vectorizer = CountVectorizer().fit(text_words)
        tmp_words_matrix = vectorizer.transform(text_words)
        tmp_words_count = np.sum(tmp_words_matrix, axis=0)
        tmp_words = [(word, tmp_words_count[0, ind]) for word, ind in vectorizer.vocabulary_.items() if len(word) > 2]
        tmp_words = sorted(tmp_words, reverse=True, key=lambda x: x[1])
        tmp_words = [word[0] for word in tmp_words[:n]]
        doc_words = title_words + tmp_words
    else:
        doc_words = title_words + text_words
    return doc_words

In [11]:
def process_doc(number_of_doc):
    with open('./content/{:d}.dat'.format(number_of_doc), encoding='utf-8') as file:
        text_checker = {}
        html_test = file.read()
        soup = BeautifulSoup(html_test, 'html.parser')
        if soup.title:
            title_name = soup.title.text
            title_name = regex_symbols.sub(" ", title_name)
            title_name = re.sub("\s\s+" , " ", title_name)
        else:
            title_name = ' '
        url = soup.text[:soup.text.index('\n')]
        url = tldextract.extract(url)
        url = url.domain + '.' + url.suffix
        text = soup.text[soup.text.index('\n'):].lower()
        text = regex_symbols.sub(" ", text)
        text = shortword.sub(" ", text)
        text = pattern.sub(" ", text)
        text = musor.sub(' ', text)
        text = re.sub("\s\s+" , " ", text)
        text = re.sub('\xa0|\xad', ' ', text)
        content = get_content(title_name, text, n=25)
        text_checker[number_of_doc] = [url] + content
        return text_checker

In [12]:
def process_all_docs(i):
    with gzip.open('data_bad/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)

        while not queue.empty():
            try:
                id_new = queue.get()
                record = process_doc(id_new)
            except Exception as e:
                print(id_new, file=sys.stderr)
                print(e, file=sys.stderr)
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            # счетчик должен атомарно обновиться
            with lock:
                pbar.update(1)


with Pool(processes=8) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_all_docs, range(pool._processes))

100%|██████████| 6/6 [00:02<00:00,  2.83it/s]


In [111]:
def records_reader(dirname: str) -> Generator[Dict[str, Any], None, None]:
    with ExitStack() as stack:
        files = [stack.enter_context(gzip.open(dirname + '/' + i, mode='rb')) for i in tqdm(listdir(dirname))]
        for j in files:
            d = codecs.getreader('utf8')(j)
            for k in d:
                yield json.loads(k)

In [256]:
df = pd.DataFrame(records_reader('data_bad'))
#df.to_csv('prom_res.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 160.42it/s]


In [163]:
set([i for i in range(1, 28027)]) - set(np.unique(df.columns.values.astype('int')))

{2073, 5120, 10114, 14030, 19562, 24245}

In [242]:
df1 = pd.DataFrame(columns=['id', 'words'])

In [243]:
df1

Unnamed: 0,id,words


In [257]:
dicter = {}

In [258]:
for i in range(df.shape[0]):
    for j, k in df.loc[i][~df.loc[i].isna()].items():
        dicter[str(j)] = ' '.join(k)

In [259]:
dicter

{'5120': 'nukucihumun.tk сумм подотчетн лиц организац счет работник отчет расход расчет средств денежн наличн выда учет выдач командировк котор документ авансов сотрудник',
 '10114': 'onliner.by memb профил пользовател лет ден офлайн сайт техник рожден город отправ личн сообщен август пленк senior окн средств neman очистк',
 '19562': 'mysonce.ru скача gta samp e торрент gta samp скача игр san andrea торрент multiplay сервер верс как папк файл sa da кача гта grand theft auto',
 '14030': 'onliner.by объявлен memb год пользовател autoc senior масл техник офлайн сайт ден рожден город профил отправ личн сообщен лет цен merc',
 '24245': 'sci-article.ru анализ нарушен иммунологическ реактивн у дет с заболеван почек дет cd лимфоцит кров хроническ групп иммунологическ показател гломерулонефрит воспалительн процесс стат активац ig анализ помощ иммун корреляцион числ сывороточн'}

In [276]:
df1

Unnamed: 0,id,words
0,1,zrenielib.ru м б аншин центр репродукц генетик...
1,17,nashizubki.ru современ стоматолог кто так стом...
2,18,yaplakal.com есл счетчик яплакал счетчик плат ...
3,23,det-sad45.ru прошивк dexp инструкц прошивк dex...
4,25,tks.ru медицинск издел формулировк нов ру арх ...
...,...,...
28020,5120,nukucihumun.tk сумм подотчетн лиц организац сч...
28021,10114,onliner.by memb профил пользовател лет ден офл...
28022,19562,mysonce.ru скача gta samp e торрент gta samp с...
28023,14030,onliner.by объявлен memb год пользовател autoc...


In [266]:
list(dicter.values())[0]

'nukucihumun.tk сумм подотчетн лиц организац счет работник отчет расход расчет средств денежн наличн выда учет выдач командировк котор документ авансов сотрудник'

In [275]:
df1.loc[28024] = [int(list(dicter.keys())[4])] + [list(dicter.values())[4]]

In [244]:
for j, k in enumerate(dicter.items()):
    df1.loc[j] = [int(k[0])] + [k[1]]

In [296]:
df2 = pd.read_csv('train_groups.csv')

In [283]:
df1 = df1.rename(columns={'id': 'doc_id'})

In [291]:
df3 = pd.merge(df2, df1, how='left', on='doc_id')

In [293]:
df3.to_csv('test_groups.csv')

In [321]:
df2.columns

Index(['Unnamed: 0', 'pair_id', 'group_id', 'doc_id', 'target', 'words'], dtype='object')

In [344]:
df2 = df2.drop('Unnamed: 0', axis=1)

In [346]:
df2.to_csv('test_groups.csv', index=False)

In [347]:
df2 = pd.read_csv('test_groups.csv')

In [21]:
df_train = pd.read_csv('./best_data/train_groups.csv')

In [22]:
df_test = pd.read_csv('./best_data/test_groups.csv')

In [23]:
featss = [i for i in df_train.columns.values][5:] #in case features are in test/train

In [25]:
df_train.drop(featss,axis=1, inplace=True)

In [26]:
df_test.drop(featss,axis=1, inplace=True)

In [27]:
#df_merged = pd.concat([df_train, df_test])

In [30]:
tf_vect = TfidfVectorizer()

In [31]:
train_mas = tf_vect.fit_transform(df_train.words.values)

In [32]:
#merged_mas = tf_vect.fit_transform(df_merged.words.values)

In [33]:
help_mas = np.empty((df_train.shape[0], 25*5))

In [34]:
#help_mas_merged = np.empty((df_merged.shape[0], 25*5)) # for merged

In [35]:
for i in df_train.groupby('group_id'):
    ind = i[1].index.values
    values = train_mas[ind, :]
    distances = np.sort(pairwise_distances(values, metric='cosine'), axis=1)[:,1:26]
    #distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)

In [282]:
#for merged
'''
for i in df_merged.groupby('group_id'):
    ind = i[1].index.values
    values = merged_mas[ind, :]
    distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas_merged[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)
'''

In [283]:
#df_help_merged = pd.DataFrame(help_mas_merged)

In [285]:
#df_help_merged.columns = ['fit' + str(i) for i in df_help_merged.columns.values]

In [36]:
df_help = pd.DataFrame(help_mas)

In [295]:
#df_train = pd.merge(df_train, df_help_merged.loc[df_train.index.values] ,how='left', left_index=True, right_index=True)

In [297]:
#df_test = pd.merge(df_test, df_help_merged.loc[df_test.index.values] ,how='left', left_index=True, right_index=True)

In [299]:
#df_test.to_csv('test_groups1.csv', index=False)
#df_train.to_csv('train_groups1.csv', index=False)

In [37]:
df_help.columns = ['fit' + str(i) for i in df_help.columns.values]

In [38]:
df_train = pd.merge(df_train, df_help ,how='left', left_index=True, right_index=True)

In [39]:
df_train.to_csv('train_groups_.csv', index=False)

In [40]:
test_mas = tf_vect.transform(df_test.words.values)
help_mas1 = np.empty((df_test.shape[0], 25*5))
for i in df_test.groupby('group_id'):
    ind = i[1].index.values
    values = test_mas[ind, :]
    distances = np.sort(pairwise_distances(values, metric='cosine'), axis=1)[:,1:26]
    #distances = np.partition(pairwise_distances(values, metric='cosine'),25, axis=1)[:,:25]
    meds = np.tile(np.median(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    maxs = np.tile(np.max(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    disps = np.tile(np.std(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    means = np.tile(np.mean(distances, axis=0), distances.shape[0]).reshape(distances.shape[0], -1)
    help_mas1[ind, :] = np.concatenate((distances, meds, maxs,disps, means), axis=1)
df_help = pd.DataFrame(help_mas1)
df_help.columns = ['fit' + str(i) for i in df_help.columns.values]
df_test = pd.merge(df_test, df_help ,how='left', left_index=True, right_index=True)

In [41]:
df_test.to_csv('test_groups_.csv', index=False)

In [61]:
df_train = pd.read_csv('train_groups_.csv')
traingroups_titledata = {}
my_feat = [i for i in df_train.columns.values][5:]
for i in range(len(df_train)):
    new_doc = df_train.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    features = new_doc[my_feat]
    title = new_doc['words']
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, features, target))

In [62]:
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, features, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, feat_j, target_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_train.append(sorted(all_dist, reverse=True)[0:25] + list(features)    )
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(11689, 150) (11689,) (11689,)


In [63]:
df_test = pd.read_csv('test_groups_.csv')
testgroups_titledata = {}
for i in range(len(df_test)):
    new_doc = df_test.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    features = new_doc[my_feat]
    title = new_doc['words']
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title, features))

In [64]:
X_test = []
groups_test = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    for k, (doc_id, title, features) in enumerate(docs):
        groups_test.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, feat_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_test.append(sorted(all_dist, reverse=True)[0:25] + list(features)   )
X_test = np.array(X_test)
groups_test = np.array(groups_test)
print (X_test.shape, groups_test.shape)

(16627, 150) (16627,)


In [46]:
warnings.filterwarnings("ignore")

In [47]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
kf = StratifiedKFold(n_splits=5, shuffle=True)

In [48]:
#X_train = np.concatenate((X_train, groups_train.reshape((-1,1))), axis=1)

In [49]:
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.1)

In [50]:
kf = StratifiedKFold(n_splits=5, shuffle=True)

def validate_model(estimator, params, validate_param, vals):
    score = 0
    for t in tqdm(vals):
        params[validate_param] = t
        tempScore = 0
        for train_split, test_split in kf.split(X_train, y_train):
            X_tr = X_train[train_split]
            y_tr = y_train[train_split]
            X_tst = X_train[test_split]
            y_tst = y_train[test_split]
            clf = estimator(**params)
            res = np.zeros((X_tst.shape[0],))
            #for i in np.unique(X_tr[:, 150]):
            #    ind = np.where(X_tr[:, 150] == i)
            #    clf.fit(X_tr[ind, :150][0], y_tr[ind])
            #    ind_test = np.where(X_tst[:, 150] == i)
            #    res[ind_test] = clf.predict(X_tst[ind_test, :150][0])[:]
            clf.fit(X_tr, y_tr)
            #res = res.reshape((-1,1))
            res = clf.predict(X_tst)
            tempScore += f1_score(y_tst, res)
        if (tempScore > score):
            score = tempScore
            result = t
    print(validate_param, ' = ', result, ' score= ', score / 5)
    return result

In [53]:
params = {'objective': 'binary:logistic'}
params['base_score'] = validate_model(xgb.XGBClassifier, params, 'base_score', np.linspace(0.0001,0.99999,10))
params['n_estimators'] = validate_model(xgb.XGBClassifier, params, 'n_estimators', 
                                            range(20,100,5))
params['reg_lambda'] = validate_model(xgb.XGBClassifier, params, 'reg_lambda', 
                                            np.linspace(0.0001,0.999,10))
params['scale_pos_weight'] = validate_model(xgb.XGBClassifier, params, 'scale_pos_weight', 
                                            np.linspace(2.0,3.0,10))
params['alpha'] = validate_model(xgb.XGBClassifier, params, 'alpha', 
                                            np.linspace(0.0001,1.0,10))
params['min_child_weight'] = validate_model(xgb.XGBClassifier, params, 'min_child_weight', 
                                            np.linspace(0.0001,1.0,10))
paramsLogistic = params

100%|██████████| 10/10 [01:39<00:00,  9.93s/it]
  0%|          | 0/16 [00:00<?, ?it/s]

base_score  =  0.5555944444444445  score=  0.7945532543681821


100%|██████████| 16/16 [01:51<00:00,  6.94s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

n_estimators  =  30  score=  0.798112826423851


100%|██████████| 10/10 [00:37<00:00,  3.78s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

reg_lambda  =  0.11108888888888889  score=  0.796012397610137


100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

scale_pos_weight  =  2.4444444444444446  score=  0.8049516691405166


100%|██████████| 10/10 [00:37<00:00,  3.78s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

alpha  =  0.4445  score=  0.8038768782035449


100%|██████████| 10/10 [00:39<00:00,  3.95s/it]

min_child_weight  =  0.0001  score=  0.8022773895448502





In [52]:
params = {'booster': 'gbtree'}
params['n_estimators'] = validate_model(xgb.XGBClassifier, params, 'n_estimators', 
                                            range(20,100,5))
params['colsample_bytree'] = validate_model(xgb.XGBClassifier, params, 'colsample_bytree', 
                                            np.linspace(0.0001,0.999,10))
params['scale_pos_weight'] = validate_model(xgb.XGBClassifier, params, 'scale_pos_weight', 
                                            np.linspace(2.0,3.0,10))
params['alpha'] = validate_model(xgb.XGBClassifier, params, 'alpha', 
                                            np.linspace(0.0001,1.0,10))
params['min_child_weight'] = validate_model(xgb.XGBClassifier, params, 'min_child_weight', 
                                            np.linspace(0.0001,1.0,10))
paramsGbTree = params

100%|██████████| 16/16 [01:52<00:00,  7.05s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

n_estimators  =  45  score=  0.8017700095163883


100%|██████████| 10/10 [00:34<00:00,  3.42s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

colsample_bytree  =  0.7770222222222222  score=  0.7979186909992315


100%|██████████| 10/10 [00:44<00:00,  4.46s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

scale_pos_weight  =  2.111111111111111  score=  0.806739480005928


100%|██████████| 10/10 [00:44<00:00,  4.40s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

alpha  =  0.33340000000000003  score=  0.8072554936948716


100%|██████████| 10/10 [00:44<00:00,  4.50s/it]

min_child_weight  =  0.7778  score=  0.8095652673115138





In [54]:
from sklearn.ensemble import BaggingClassifier

In [56]:
for t in [15]:
    tempScore = 0
    for train_split, test_split in kf.split(X_train, y_train):
        X_tr = X_train[train_split]
        y_tr = y_train[train_split]
        X_tst = X_train[test_split]
        y_tst = y_train[test_split]
        clf = xgb.XGBClassifier(**paramsGbTree).fit(X_tr, y_tr)
        clf_bag = BaggingClassifier(base_estimator=clf,
                        n_estimators=t, random_state=0).fit(X_tr, y_tr)
        res = clf.predict(X_tst)
        tempScore += f1_score(y_tst, res)
    print(tempScore / 5)

0.7997837788428273


In [58]:
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test))

0.8167832167832167

In [60]:
clf1 = xgb.XGBClassifier(**paramsGbTree)
clf = BaggingClassifier(base_estimator=clf1,
                        n_estimators=15, random_state=0).fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test))

0.8148148148148149

In [11]:
clf.fit(X_train, y_train)

XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.22207777777777776, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=0.8889, missing=nan, monotone_constraints='()',
              n_estimators=35, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=2.0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [22]:
paramsGbTree

{'booster': 'gbtree',
 'n_estimators': 35,
 'colsample_bytree': 0.22207777777777776,
 'scale_pos_weight': 2.0,
 'alpha': 0.0001,
 'min_child_weight': 0.8889}

In [65]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [28]:
clf = xgb.XGBClassifier(**paramsGbTree)
clf.fit(X_train, y_train)

XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.22207777777777776, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=0.8889, missing=nan, monotone_constraints='()',
              n_estimators=35, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=2.0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [66]:
clf1 = xgb.XGBClassifier(**paramsGbTree)
clf = BaggingClassifier(base_estimator=clf1,
                        n_estimators=15, random_state=0).fit(X_train, y_train)

In [35]:
X_test.shape

(16627, 150)

In [67]:
predict = clf.predict(X_test)

In [68]:
predict.shape

(16627,)

In [69]:
df_test['target'] = predict

In [70]:
df_train

Unnamed: 0,pair_id,group_id,doc_id,target,words,fit0,fit1,fit2,fit3,fit4,...,fit115,fit116,fit117,fit118,fit119,fit120,fit121,fit122,fit123,fit124
0,1,1,15731,0,automn.ru ваз зам подшипник ступиц нив подшипн...,0.112565,0.128736,0.303646,0.381200,0.515635,...,0.848174,0.852419,0.855621,0.859238,0.862404,0.865251,0.867542,0.870943,0.874038,0.876732
1,2,1,14829,0,tiu.ru ваз опт соч сравн цен куп потребительск...,0.361563,0.367733,0.445630,0.476053,0.587270,...,0.848174,0.852419,0.855621,0.859238,0.862404,0.865251,0.867542,0.870943,0.874038,0.876732
2,3,1,15764,0,drom.ru куп ступиц лад калин трансмисс переход...,0.675107,0.679593,0.680178,0.689588,0.691806,...,0.848174,0.852419,0.855621,0.859238,0.862404,0.865251,0.867542,0.870943,0.874038,0.876732
3,4,1,17669,0,carobka.su классик ваз тольятт дааз вис задн с...,0.723980,0.739090,0.792261,0.815722,0.819343,...,0.848174,0.852419,0.855621,0.859238,0.862404,0.865251,0.867542,0.870943,0.874038,0.876732
4,5,1,14852,0,cartore.ru ступиц нив — зам подшипник сво рук ...,0.459298,0.489671,0.522485,0.530556,0.544479,...,0.848174,0.852419,0.855621,0.859238,0.862404,0.865251,0.867542,0.870943,0.874038,0.876732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11684,11686,129,26672,0,eva.ru ❤★✿★апрелят ❤★✿ врем дат сообщен ответ ...,0.874844,0.903136,0.912564,0.949960,0.958090,...,0.938553,0.942205,0.944848,0.947486,0.949626,0.951807,0.953855,0.955754,0.957495,0.959603
11685,11687,129,25838,0,psychotherapie-ros.at gastebuch http cu url bi...,0.872837,0.889652,0.919874,0.933882,0.937706,...,0.938553,0.942205,0.944848,0.947486,0.949626,0.951807,0.953855,0.955754,0.957495,0.959603
11686,11688,129,25703,0,tumblr.com jizolofej archiv эт котор класс сам...,0.761570,0.800528,0.822553,0.823066,0.834615,...,0.938553,0.942205,0.944848,0.947486,0.949626,0.951807,0.953855,0.955754,0.957495,0.959603
11687,11689,129,27885,0,prodeundi.ru как зовут парн диа шурыгин пуст г...,0.239602,0.908111,0.932815,0.933190,0.938295,...,0.938553,0.942205,0.944848,0.947486,0.949626,0.951807,0.953855,0.955754,0.957495,0.959603


In [71]:
df_test

Unnamed: 0,pair_id,group_id,doc_id,words,fit0,fit1,fit2,fit3,fit4,fit5,...,fit116,fit117,fit118,fit119,fit120,fit121,fit122,fit123,fit124,target
0,11691,130,6710,youtube.com как прописа админк в кс себ ил дру...,0.243496,0.395833,0.426603,0.429909,0.439519,0.472278,...,0.838425,0.842890,0.846824,0.850915,0.854294,0.858193,0.862033,0.865071,0.869393,1
1,11692,130,4030,v-sampe.ru скача sgl rp доработк слив мод mysq...,0.398399,0.536240,0.615161,0.651675,0.722984,0.812070,...,0.838425,0.842890,0.846824,0.850915,0.854294,0.858193,0.862033,0.865071,0.869393,0
2,11693,130,5561,dream-x.ru как прописа админк кс count strik к...,0.474139,0.519859,0.522003,0.540727,0.561512,0.575746,...,0.838425,0.842890,0.846824,0.850915,0.854294,0.858193,0.862033,0.865071,0.869393,1
3,11694,130,4055,net.ru как прописа прост админк кс админк кс п...,0.397117,0.429909,0.475111,0.487070,0.509731,0.533393,...,0.838425,0.842890,0.846824,0.850915,0.854294,0.858193,0.862033,0.865071,0.869393,1
4,11695,130,4247,o3one.ru подбор админ сервер код арх форум ozo...,0.650088,0.691921,0.720278,0.733805,0.762663,0.766120,...,0.838425,0.842890,0.846824,0.850915,0.854294,0.858193,0.862033,0.865071,0.869393,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16622,28313,309,16637,mail.ru ответ mail ru полезн куша творог утр х...,0.567072,0.595642,0.633593,0.639789,0.656616,0.678725,...,0.591068,0.596888,0.602179,0.608353,0.614358,0.620099,0.624341,0.629072,0.635117,1
16623,28314,309,16759,inmoment.ru творог полезн свойств лечен творог...,0.471898,0.538731,0.547366,0.557075,0.562548,0.563435,...,0.591068,0.596888,0.602179,0.608353,0.614358,0.620099,0.624341,0.629072,0.635117,1
16624,28315,309,15358,edaplus.info творог полезн опасн свойств творо...,0.416179,0.437015,0.446254,0.456509,0.458961,0.471898,...,0.591068,0.596888,0.602179,0.608353,0.614358,0.620099,0.624341,0.629072,0.635117,1
16625,28316,309,17287,mail.ru ответ mail ru чем полез творог творог ...,0.130777,0.201235,0.312665,0.335397,0.340588,0.379820,...,0.591068,0.596888,0.602179,0.608353,0.614358,0.620099,0.624341,0.629072,0.635117,1


In [72]:
df_test[['pair_id', 'target']].to_csv('new_prediction.csv', index=False)