In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
from multiprocessing.dummy import Pool as ThreadPool
import time
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics import f1_score

import os

from tqdm import tqdm

import random

from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bulatral42/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [173]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


In [174]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))
type(train_data)

pandas.core.frame.DataFrame

In [175]:
def get_topk_inter_dist(texts, k=25):
    X = np.zeros(shape=(len(texts), len(texts)), dtype=float)
    for i in range(len(texts)):
        words_i = set(texts[i].lower().strip().split())
        for j in range(i + 1, len(texts)):
            words_j = set(texts[j].lower().strip().split())
            X[i, j] = X[j, i] = len(words_i & words_j) / (1 + len(words_i | words_j))
    return np.sort(X, axis=1)[:, :-k-1:-1]

In [176]:
def get_topk_tfidf_cosine_dist(texts, ngrams=(1, 1), k=25):
    vectorizer = TfidfVectorizer(ngram_range=ngrams)
    X = vectorizer.fit_transform(texts).toarray()
    return np.sort(pairwise_distances(X, X, metric='cosine'), axis=1)[:, 1:k+1]

In [195]:
def get_topk_d2v_sim(texts, k=25):
    tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) 
                   for i, doc in enumerate(texts)]
    
    model_d2v = Doc2Vec(vector_size=50, alpha=0.025, min_count=2)
    model_d2v.build_vocab(tagged_data)

    for epoch in range(50):
        model_d2v.train(tagged_data,
                        total_examples=model_d2v.corpus_count,
                        epochs=model_d2v.epochs)

    X = np.zeros((len(texts), 50))

    for i in range(X.shape[0]):
        X[i] = model_d2v.docvecs[i]
    
    return np.sort(pairwise_distances(X, X, metric='cosine'), axis=1)[:, 1:k+1]

In [178]:
train_data

Unnamed: 0,pair_id,group_id,doc_id,target
0,1,1,15731,0
1,2,1,14829,0
2,3,1,15764,0
3,4,1,17669,0
4,5,1,14852,0
...,...,...,...,...
11685,11686,129,26672,0
11686,11687,129,25838,0
11687,11688,129,25703,0
11688,11689,129,27885,0


In [179]:
#pair_id, group_id, doc_id, target
def get_features(data, k=25):
    X = np.ndarray(shape=(0, 3 * k))
    y = []
    for gr_id in tqdm(data.group_id.unique()):
        texts = []
        if 'target' in data.columns:

            for _, _, doc_id, target in data[data.group_id == gr_id].values:
                y.append(target)
                with open('data_title_h16/' + str(doc_id) + '.dat.txt', mode='r') as doc:
                    texts.append(doc.read())
        else:
            for _, _, doc_id in data[data.group_id == gr_id].values:
                with open('data_title_h16/' + str(doc_id) + '.dat.txt', mode='r') as doc:
                    texts.append(doc.read())
        feat_gr = np.hstack((get_topk_inter_dist(texts, k=k), 
                             get_topk_tfidf_cosine_dist(texts, k=k), 
                             get_topk_d2v_sim(texts, k=k)))
        X = np.vstack((X, feat_gr))
    if 'target' in data.columns:
        return X, np.asarray(y)
    else:
        return X

In [180]:
shuff_ids = list(train_data.group_id)
random.shuffle(shuff_ids)
train_groups, test_groups = shuff_ids[:110], shuff_ids[110:]

In [181]:
train = train_data.loc[train_data.group_id.isin(train_groups)]

In [182]:
train

Unnamed: 0,pair_id,group_id,doc_id,target
196,197,3,3633,0
197,198,3,1806,0
198,199,3,3259,1
199,200,3,3394,0
200,201,3,3005,0
...,...,...,...,...
11594,11595,128,8583,1
11595,11596,128,10253,1
11596,11597,128,7654,1
11597,11598,128,10836,1


In [183]:
X_train, y_train = get_features(train)
X_val, y_val = get_features(train_data.loc[train_data.group_id.isin(test_groups)])

100%|██████████| 69/69 [07:30<00:00,  6.53s/it]
100%|██████████| 129/129 [12:40<00:00,  5.90s/it]


In [184]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedKFold, KFold

In [185]:
X_train.shape

(6369, 75)

In [186]:
clf = lgb.LGBMClassifier()

In [187]:
clf.fit(X_train, y_train)

LGBMClassifier()

In [188]:
y_pred = clf.predict(X_val)

In [189]:
print('Validation score: {}'.format(f1_score(y_val, y_pred)))

Validation score: 0.7857252730349177


In [191]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


In [192]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))
type(train_data)

pandas.core.frame.DataFrame

In [193]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))
test_data.head()

Unnamed: 0,pair_id,group_id,doc_id
0,11691,130,6710
1,11692,130,4030
2,11693,130,5561
3,11694,130,4055
4,11695,130,4247


In [196]:
X_train, y_train = get_features(train_data)
X_val = get_features(test_data)

100%|██████████| 129/129 [05:14<00:00,  2.44s/it]
100%|██████████| 180/180 [07:04<00:00,  2.36s/it]


In [197]:
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [198]:
y_pred = clf.predict(X_val)

In [199]:
data = {'pair_id': np.asarray(test_data.pair_id), 'target': y_pred}
df = pd.DataFrame(data=data)
df = df.set_index(keys=['pair_id'])
df.to_csv('submitBoostHeaders.csv')
df.head()

Unnamed: 0_level_0,target
pair_id,Unnamed: 1_level_1
11691,1
11692,0
11693,1
11694,1
11695,0


In [200]:
print((y_pred == 1).astype(int).sum())
print((y_pred == 0).astype(int).sum())

print((y_train == 1).astype(int).sum())
print((y_train == 0).astype(int).sum())

4050
12577
3361
8329


скор на лидерборде: 0.69271