In [58]:
# data science
import math
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
# utils, tokenization and preprocessing
from gensim import utils
from random import shuffle
import pymorphy2
from scipy.special import expit
from scipy.stats import logistic
from sklearn.metrics import roc_auc_score
import codecs
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [2]:
# Load the labelled training pairs and the unlabelled test pairs (both UTF-8 CSV files).
train_df, test_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ("data/train_task1_latest.csv", "data/sdsj_A_test.csv")
)

In [3]:
# Preview the first rows of the raw training frame.
train_df.head()

Unnamed: 0,paragraph_id,question_id,paragraph,question,target
0,1094,46273,"В отличие от рыб, земноводные (амфибии) и прес...",С какого года Русское Царство перешло на летои...,0.0
1,7414,19164,В 1049 году Балдуину V удалось отнять у Герман...,Кто упомянул о его первых разногласиях со Штей...,0.0
2,6744,39767,Стремление достичь предельных значений ёмкости...,Как называется имеющая мировое значение эпоха ...,0.0
3,7300,36318,Первый практически пригодный двухтактный газов...,Что усугублялось из-за международного давления...,0.0
4,7077,41534,Требуя от художника углубленного изучения изоб...,Какой характер носят пророчества Леонардо да В...,0.0


## Data Preprocessing

In [4]:
# Shared pymorphy2 analyzer; tokenize() uses it to reduce each token to its normal form.
morph = pymorphy2.MorphAnalyzer()
# CPU count; used as the worker-pool size in parallelize_df and as Doc2Vec's `workers`.
cores = multiprocessing.cpu_count()
# Number of chunks parallelize_df splits a DataFrame into (three chunks per core).
num_partitions = 3 * cores

In [70]:
# Probably it is better not to tokenize questions
def tokenize(string):
    """Turn ``string`` into a list of lemmas.

    The text is split with gensim's ``utils.simple_preprocess``; every token is
    then replaced by the ``normal_form`` of its first ``morph.parse`` result.
    Lemmas that are not longer than one character are discarded.
    """
    lemmas = (
        morph.parse(word)[0].normal_form
        for word in utils.simple_preprocess(string)
    )
    return [lemma for lemma in lemmas if len(lemma) > 1]

def parallelize_df(df, func):
    """Apply ``func`` to row-chunks of ``df`` in a process pool and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``num_partitions`` consecutive row chunks
        (some chunks are empty when ``df`` has fewer rows than partitions).
    func : callable
        Receives one chunk (a DataFrame) and must return a DataFrame. It is sent
        to worker processes, so it has to be picklable.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises
    ------
    Whatever ``func`` raises in a worker is re-raised here by ``pool.map``; the
    pool is still closed and joined in that case.
    """
    # Split row *positions* and slice with .iloc instead of handing the DataFrame
    # itself to np.array_split; the chunk boundaries are the same, but this does
    # not depend on NumPy being able to treat a DataFrame as an array.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = multiprocessing.Pool(cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even if func failed.
        pool.close()
        pool.join()
    return pd.concat(results)

def tokenize_df(df):
    """Add a ``tokens`` column holding ``tokenize`` applied to the text column.

    Every column whose name does not end in ``"id"`` is treated as text. The
    result is always written to the single column ``"tokens"``, so when several
    text columns exist the last one wins. ``df`` is modified in place and
    returned.
    """
    # Snapshot the column names first so the freshly added column is not visited.
    text_columns = [name for name in df.columns if not name.endswith("id")]
    for name in text_columns:
        df["tokens"] = df[name].apply(tokenize)
    return df

def tokenize_sent_df(df):
    """Add whole-text and per-sentence token columns to ``df``.

    For every column whose name does not end in ``"id"``:

    * ``tokens`` receives ``tokenize`` applied to the full text;
    * ``sent_tokens`` receives one token list per sentence, where sentences come
      from the module-level ``tokenizer`` (which must exist when this runs).

    Both outputs use fixed column names, so with several text columns the last
    one wins. ``df`` is modified in place and returned.
    """
    def split_and_tokenize(text):
        return [tokenize(sentence) for sentence in tokenizer.tokenize(text)]

    text_columns = [name for name in df.columns if not name.endswith("id")]
    for name in text_columns:
        df["tokens"] = df[name].apply(tokenize)
        df["sent_tokens"] = df[name].apply(split_and_tokenize)
    return df


In [71]:
# Read in the training corpus for the sentence tokenizer.
plain_file = "data/russian.plain"
# Text-mode open() already normalises line endings, so the legacy "U" mode flag
# (rejected by open() on Python 3.11+) is not needed; the `with` block makes
# sure the file handle is closed after reading.
with open(plain_file, encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Train the Punkt sentence tokenizer on that corpus; tokenize_sent_df uses the
# resulting module-level `tokenizer` to split paragraphs into sentences.
tokenizer = PunktSentenceTokenizer()
tokenizer.train(text)

<nltk.tokenize.punkt.PunktParameters at 0x7f6644b567b8>

In [72]:
# Reduce both datasets to one row per paragraph_id / question_id (first occurrence
# kept), holding only the id and its text.
train_df_p, test_df_p = (
    frame[["paragraph_id", "paragraph"]].drop_duplicates(subset="paragraph_id")
    for frame in (train_df, test_df)
)
train_df_q, test_df_q = (
    frame[["question_id", "question"]].drop_duplicates(subset="question_id")
    for frame in (train_df, test_df)
)

In [73]:
# Tokenize in parallel. Paragraph frames go through tokenize_sent_df (adds both
# `tokens` and `sent_tokens`); question frames go through tokenize_df (`tokens` only).
# Each name is rebound to the tokenized result, so re-running this cell re-tokenizes
# the already processed frames.
%time train_df_p = parallelize_df(train_df_p, tokenize_sent_df)
%time train_df_q = parallelize_df(train_df_q, tokenize_df)
%time test_df_p = parallelize_df(test_df_p, tokenize_sent_df)
%time test_df_q = parallelize_df(test_df_q, tokenize_df)

CPU times: user 3.01 s, sys: 2.11 s, total: 5.12 s
Wall time: 29.2 s
CPU times: user 190 ms, sys: 330 ms, total: 520 ms
Wall time: 5.2 s
CPU times: user 300 ms, sys: 290 ms, total: 590 ms
Wall time: 5.51 s
CPU times: user 410 ms, sys: 360 ms, total: 770 ms
Wall time: 9.59 s


In [8]:
# One TaggedDocument per unique paragraph/question. The tag is
# "<train|test>_<p|q>_<id>", the same scheme label_train_df / label_test_df use
# to look the document vectors up again.
train_corpus_p = [
    TaggedDocument(tokens, ["train_p_" + str(paragraph_id)])
    for paragraph_id, tokens in zip(train_df_p["paragraph_id"], train_df_p["tokens"])
]
train_corpus_q = [
    TaggedDocument(tokens, ["train_q_" + str(question_id)])
    for question_id, tokens in zip(train_df_q["question_id"], train_df_q["tokens"])
]
# The original line called the undefined name `Taggeds_corpusDocument`, which
# raises NameError; it must be TaggedDocument like the other three corpora.
test_corpus_p = [
    TaggedDocument(tokens, ["test_p_" + str(paragraph_id)])
    for paragraph_id, tokens in zip(test_df_p["paragraph_id"], test_df_p["tokens"])
]
test_corpus_q = [
    TaggedDocument(tokens, ["test_q_" + str(question_id)])
    for question_id, tokens in zip(test_df_q["question_id"], test_df_q["tokens"])
]

In [106]:
# Every sentence of every train/test paragraph becomes its own tagged document.
# The tag is built from id(sentence), i.e. the identity of the token list object,
# so it is unique within this session but differs from one kernel run to the next.
s_corpus = [
    TaggedDocument(sentence, ["s_%d" % id(sentence)])
    for frame in (train_df_p, test_df_p)
    for sentences in frame["sent_tokens"].tolist()
    for sentence in sentences
]

In [109]:
# Full training corpus: paragraphs, questions and single sentences from both splits.
corpus = list(train_corpus_p)
corpus.extend(train_corpus_q)
corpus.extend(test_corpus_p)
corpus.extend(test_corpus_q)
corpus.extend(s_corpus)
# In-place shuffle; no random seed is set in the visible cells, so the order
# changes between runs.
shuffle(corpus)

## Doc2Vec

In [110]:
# Per the gensim Doc2Vec docs, dm=0 selects the PV-DBOW algorithm and dbow_words=1
# trains word vectors alongside the document vectors. `size` and `iter` are the
# keyword names used by gensim releases before 4.0.
# NOTE(review): iter=50 is set here, but the training cell passes epochs=10
# explicitly - confirm which epoch count is intended.
doc2vec = Doc2Vec(dm=0, dbow_words=1, size=300, window=150, min_count=5, iter=50, workers=cores) 
# Build the vocabulary from the shuffled corpus before training.
%time doc2vec.build_vocab(corpus) 

CPU times: user 9.4 s, sys: 850 ms, total: 10.2 s
Wall time: 9.93 s


In [123]:
# Train on the full corpus for 10 epochs (passed explicitly; the constructor's
# iter=50 is a different value - NOTE(review): confirm the intended epoch count).
%time doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=10)

CPU times: user 41min 49s, sys: 44.7 s, total: 42min 34s
Wall time: 11min


26473864

In [228]:
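# Optional checkpoint: uncomment the first line to save the trained model, or the
# second line to reload a saved model instead of retraining.
# doc2vec.save("data/doc2vec")
# doc2vec = Doc2Vec.load("data/doc2vec")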

## Testing

In [12]:
# dict for test dataset: "p-<paragraph_id>" / "q-<question_id>" -> token list
id2tokens = {
    "p-" + str(paragraph_id): tokens
    for paragraph_id, tokens in zip(test_df_p["paragraph_id"], test_df_p["tokens"])
}
id2tokens.update(
    ("q-" + str(question_id), tokens)
    for question_id, tokens in zip(test_df_q["question_id"], test_df_q["tokens"])
)

In [13]:
def _label_df(df, split):
    """Write the paragraph/question doc-vector similarity into ``df["prediction"]``.

    ``split`` is the tag prefix (``"train"`` or ``"test"``); the tags looked up in
    the module-level ``doc2vec`` model are ``"<split>_p_<paragraph_id>"`` and
    ``"<split>_q_<question_id>"``. ``df`` is modified in place and returned.
    """
    # Zip the two id columns instead of df.apply(axis=1): no per-row Series is
    # built (so ids keep their column dtype when formatted), and an empty chunk
    # simply yields an empty prediction column.
    df["prediction"] = [
        doc2vec.docvecs.similarity(
            "{}_p_{}".format(split, paragraph_id),
            "{}_q_{}".format(split, question_id),
        )
        for paragraph_id, question_id in zip(df["paragraph_id"], df["question_id"])
    ]
    return df


def label_test_df(df):
    """Add a ``prediction`` column to a test-set frame using the ``test_*`` doc tags.

    Expects ``paragraph_id`` and ``question_id`` columns; returns the same
    (mutated) DataFrame. The underlying ``similarity`` lookup raises if a tag is
    not present in the model.
    """
    return _label_df(df, "test")


def label_train_df(df):
    """Add a ``prediction`` column to a training-set frame using the ``train_*`` doc tags.

    Expects ``paragraph_id`` and ``question_id`` columns; returns the same
    (mutated) DataFrame. The underlying ``similarity`` lookup raises if a tag is
    not present in the model.
    """
    return _label_df(df, "train")

In [124]:
# Score every training pair in parallel; train_df is rebound to the frame that
# now carries the extra `prediction` column.
%time train_df = parallelize_df(train_df, label_train_df)

CPU times: user 4.87 s, sys: 5.09 s, total: 9.96 s
Wall time: 11.5 s


In [125]:
# Preview the training frame with the new `prediction` column.
train_df.head()

Unnamed: 0,paragraph_id,question_id,paragraph,question,target,prediction
0,1094,46273,"В отличие от рыб, земноводные (амфибии) и прес...",С какого года Русское Царство перешло на летои...,0.0,0.215388
1,7414,19164,В 1049 году Балдуину V удалось отнять у Герман...,Кто упомянул о его первых разногласиях со Штей...,0.0,0.228882
2,6744,39767,Стремление достичь предельных значений ёмкости...,Как называется имеющая мировое значение эпоха ...,0.0,0.11111
3,7300,36318,Первый практически пригодный двухтактный газов...,Что усугублялось из-за международного давления...,0.0,0.276297
4,7077,41534,Требуя от художника углубленного изучения изоб...,Какой характер носят пророчества Леонардо да В...,0.0,0.54173


In [127]:
# ROC AUC of the similarity score against the binary target, on the training pairs.
roc_auc_score(
    y_true=train_df["target"].values,
    y_score=train_df["prediction"].values,
)

0.96583871011112299

In [118]:
# NOTE(review): same expression as the previous cell. Its execution count (118) is
# lower than that of the training cell (123), so the value displayed below comes
# from an earlier run - consider removing this cell or re-running the notebook in order.
roc_auc_score(train_df["target"].tolist(), train_df["prediction"].tolist())

0.96460445215589641

In [198]:
# Score every test pair in parallel; test_df is rebound to the frame that now
# carries the `prediction` column.
%time test_df = parallelize_df(test_df, label_test_df)

CPU times: user 2.2 s, sys: 1.02 s, total: 3.22 s
Wall time: 23.5 s


In [None]:
# Preview the scored test frame.
test_df.head()

In [225]:
# Write the submission file: one row per test pair with its similarity score, no index column.
test_df.to_csv(
    "data/prediction.csv",
    columns=['paragraph_id', 'question_id', 'prediction'],
    index=False,
)