In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, make_scorer, pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from xgboost import XGBClassifier

In [2]:
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


In [3]:
doc_to_title[0] = ''
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
sorted_doc_to_title = dict(sorted(doc_to_title.items()))
doc_tfidf = vectorizer.fit_transform(sorted_doc_to_title.values())
doc_tfidf.shape

(28027, 38695)

In [4]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))
    
X_train = []
y_train = []
groups_train = []
index = 0
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    y_train.extend([x[2] for x in docs])
    doc_ids = [x[0] for x in docs]
    for j, dist in enumerate(pairwise_distances(doc_tfidf[doc_ids], metric='cosine')):      
        groups_train.append(new_group)
        X_train.append(sorted(dist)[1:11])
        index += 1
        
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
groups_train = np.asarray(groups_train)
print(X_train.shape, y_train.shape, groups_train.shape)

(11690, 10) (11690,) (11690,)


In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [6]:
grid_param = {'learning_rate': [0.01, 0.05, 0.1],
              'max_depth': range(1, 5),
              'n_estimators': [10, 50, 100],
              'eval_metric'  : ['error', 'logloss', 'auc']}

xgb = XGBClassifier(use_label_encoder=False, 
                    objective='binary:logistic',
                    seed=42)

gs_xgb = GridSearchCV(xgb, grid_param, scoring=make_scorer(f1_score), 
                      cv=GroupKFold(n_splits=3), verbose=1)

gs_xgb.fit(X_train_scaled, y_train, groups=groups_train)

best_xgb = gs_xgb.best_estimator_
gs_xgb.best_params_

Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'eval_metric': 'error',
 'learning_rate': 0.05,
 'max_depth': 1,
 'n_estimators': 10}

In [7]:
best_xgb = gs_xgb.best_estimator_
score = cross_val_score(best_xgb, X_train_scaled,
                y_train, groups=groups_train,
                scoring=make_scorer(f1_score))
print(score)
score.mean()

[0.70118343 0.71865443 0.73463485 0.67464473 0.78109798]


0.7220430859374584

In [10]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))

X_test = []
groups_test = []
index = 0
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]        
    doc_ids = [x[0] for x in docs]
    for j, dist in enumerate(pairwise_distances(doc_tfidf[doc_ids], metric='cosine')):
        groups_test.append(new_group)
        X_test.append(sorted(dist)[1:11])
        index += 1
X_test = np.asarray(X_test)
groups_test = np.asarray(groups_test)
print(X_test.shape, groups_test.shape)

(16627, 10) (16627,)


In [11]:
best_xgb.fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
y_pred = best_xgb.predict(X_test_scaled)

In [13]:
subm = pd.read_csv('sample_submission.csv')
subm.target = y_pred
subm.to_csv('submission.csv', index=False)