In [1]:
import os
import pandas as pd
import numpy as np
from enum import Enum
import itertools

In [2]:
import nltk
# Directory where the 'punkt' sentence-tokenizer data is stored.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
NLTK_DATA_DIR = '/run/media/root/Windows/Users/agnes/tmp'

# nltk.download() only writes into download_dir; it does not add that directory
# to the resource search path. Register it so the tokenizer can locate the data.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download('punkt', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to
[nltk_data]     /run/media/root/Windows/Users/agnes/tmp...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Directory that holds the input file 'queries_od.csv' read further down.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_dir = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco'

In [4]:
# Split the data into train(+dev) and test:

# - test keeps the same format as the data loaded in this notebook
# - train(+dev) is built in one of the following two ways:

# 1 (for sentence-level encoding)
# - use only "original" documents
# - split the 'doc' content into sentences
# - all sentences from a positive doc are labeled as positive
# - all sentences from a negative doc are labeled as negative
# - (optionally) duplicate positive examples so that there are enough of them

# 2 (for entire-doc encoding)
# - use only "original" documents
# - (optionally) duplicate positive examples so that there are enough of them

# the output can have one of the following column layouts:
# normal (regression):  qid, sent_1 (query), rel, sent_2 (doc)
# triplet:              qid, query, doc_positive, doc_negative


In [5]:
class DocType(Enum):
    """Granularity at which a document is paired with its query."""
    # the document is split into sentences; each sentence becomes its own row
    sentence = 0
    # the whole document text is used as a single row
    entire_doc = 1  

class LossType(Enum):
    """Layout of the generated training rows."""
    # labeled pairs: one row per (query text, doc text) with a 'rel' label
    regression = 0
    # triplets: one row per (query, positive doc text, negative doc text)
    triplet = 1
    

In [6]:
def get_combis_sentences(qid, query_sentences, docs_sentences, rel_label):
    """Pair every query sentence with every sentence of every document.

    Parameters
    ----------
    qid
        Query identifier, copied into the 'qid' column of every row.
    query_sentences : iterable of str
        Sentences of the query.
    docs_sentences : iterable of iterables of str
        One inner sequence of sentences per document.
    rel_label
        Label copied into the 'rel' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ['qid', 'sent_1', 'rel', 'sent_2'], one row per
        (query sentence, document sentence) pair. Rows are ordered by query
        sentence, then document, then sentence within the document. The frame
        is empty (but has the columns) when there is nothing to pair.
    """
    # Collect plain records and build the frame once. Growing a frame with
    # DataFrame.append copied it on every row and the method no longer exists
    # in pandas >= 2.0.
    records = [
        {'qid': qid, 'sent_1': sent_1, 'rel': rel_label, 'sent_2': sent_2}
        for sent_1, doc_sentences in itertools.product(query_sentences, docs_sentences)
        for sent_2 in doc_sentences
    ]
    return pd.DataFrame(records, columns=['qid', 'sent_1', 'rel', 'sent_2'])


def get_combis_docs(qid, query, docs, rel_label):
    """Pair the full query text with each full document text.

    Parameters
    ----------
    qid
        Query identifier, copied into the 'qid' column of every row.
    query : str
        Query text, stored in 'sent_1'.
    docs : iterable of str
        Document texts; each one is stored in 'sent_2' of its own row.
    rel_label
        Label copied into the 'rel' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ['qid', 'sent_1', 'rel', 'sent_2'], one row per document in
        iteration order. Empty (but with the columns) when `docs` is empty.
    """
    # Build all rows first and create the frame once. DataFrame.append copied
    # the frame for every row and was removed in pandas 2.0.
    records = [
        {'qid': qid, 'sent_1': query, 'rel': rel_label, 'sent_2': doc}
        for doc in docs
    ]
    return pd.DataFrame(records, columns=['qid', 'sent_1', 'rel', 'sent_2'])


def make_triplets(qid, query, positives, negatives):
    """Build one (query, positive, negative) row per positive/negative pair.

    Parameters
    ----------
    qid
        Query identifier, copied into the 'qid' column of every row.
    query : str
        Query text, copied into the 'query' column of every row.
    positives : iterable of str
        Texts stored in 'doc_positive'.
    negatives : iterable of str
        Texts stored in 'doc_negative'.

    Returns
    -------
    pandas.DataFrame
        Columns ['qid', 'query', 'doc_positive', 'doc_negative'] holding the
        Cartesian product of `positives` and `negatives` (positives vary
        slowest). Empty (but with the columns) when either input is empty.
    """
    # Build all rows first and create the frame once. DataFrame.append copied
    # the frame for every row and was removed in pandas 2.0.
    records = [
        {'qid': qid, 'query': query, 'doc_positive': positive, 'doc_negative': negative}
        for positive, negative in itertools.product(positives, negatives)
    ]
    return pd.DataFrame(records, columns=['qid', 'query', 'doc_positive', 'doc_negative'])


def make_train_data(queries_df, rel_label=1, adjust_sample_bias=True,
                    doc_type=DocType.sentence, limit=None, shuffle=False,
                    loss_type=LossType.regression, random_state=None):
    """Turn per-query document rows into training rows.

    Only rows whose 'type' column equals 'original' are used. For each query
    the documents whose 'rel' equals `rel_label` are treated as relevant and
    all others as irrelevant (label 0).

    Parameters
    ----------
    queries_df : pandas.DataFrame
        Must provide the columns 'qid', 'query', 'rel', 'type' and 'doc'.
    rel_label
        Value of 'rel' that marks a relevant document.
    adjust_sample_bias : bool
        If true, the relevant rows of each query are repeated
        round(n_irrelevant / n_relevant) times (at least once) so that both
        classes are roughly balanced.
    doc_type : DocType
        `DocType.sentence` pairs query sentences with document sentences;
        `DocType.entire_doc` pairs the whole query with whole documents.
    limit : int or None
        Stop after this many queries have been processed.
    shuffle : bool
        If true, return the rows in random order.
    loss_type : LossType
        `LossType.regression` yields columns ['qid', 'sent_1', 'rel', 'sent_2'];
        `LossType.triplet` yields ['qid', 'query', 'doc_positive', 'doc_negative'].
    random_state
        Passed to `DataFrame.sample` when `shuffle` is true; the default
        (None) keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        All generated rows. Before shuffling, the index is a fresh RangeIndex
        rather than repeated per-query labels.

    Raises
    ------
    ValueError
        If `loss_type` or `doc_type` is not a supported enum member.
    """
    if loss_type == LossType.regression:
        columns = ['qid', 'sent_1', 'rel', 'sent_2']
    elif loss_type == LossType.triplet:
        columns = ['qid', 'query', 'doc_positive', 'doc_negative']
    else:
        raise ValueError('unknown LossType: {}'.format(loss_type))

    if doc_type not in (DocType.sentence, DocType.entire_doc):
        raise ValueError('unknown DocType: {}'.format(doc_type))

    queries_df = queries_df.loc[queries_df['type'] == 'original']

    # Collect the qids after filtering: a query without any 'original' row
    # would otherwise produce an empty group and fail on iloc[0] below.
    all_qids = sorted(set(queries_df['qid']))

    # Per-query frames are gathered here and concatenated once at the end;
    # appending to a growing frame is quadratic and DataFrame.append was
    # removed in pandas 2.0.
    frames = []

    for i, qid in enumerate(all_qids):
        qid_df = queries_df[queries_df['qid'] == qid].reset_index(drop=True)
        query = qid_df.iloc[0]["query"]

        rel_docs_mask = qid_df['rel'].values == rel_label

        if doc_type == DocType.sentence:
            query_sentences = nltk.tokenize.sent_tokenize(query)
            docs_sentences = qid_df['doc'].apply(nltk.tokenize.sent_tokenize).values
            relevant_combis = get_combis_sentences(qid, query_sentences,
                                                   docs_sentences[rel_docs_mask], rel_label)
            irrelevant_combis = get_combis_sentences(qid, query_sentences,
                                                     docs_sentences[~rel_docs_mask], 0)
        else:
            relevant_combis = get_combis_docs(qid, query, qid_df['doc'][rel_docs_mask], rel_label)
            irrelevant_combis = get_combis_docs(qid, query, qid_df['doc'][~rel_docs_mask], 0)

        # Oversample the relevant rows so both labels are about equally
        # frequent. Skipped when there is nothing to repeat (avoids a division
        # by zero); the factor is clamped to 1 so relevant rows are never
        # dropped and pd.concat never receives an empty list.
        # NOTE(review): in triplet mode the oversampled rows are what get paired
        # with the negatives, so every triplet is repeated scale_factor times.
        if adjust_sample_bias and len(relevant_combis) > 0:
            scale_factor = max(1, int(np.around(len(irrelevant_combis) / len(relevant_combis))))
            relevant_combis = pd.concat([relevant_combis] * scale_factor)

        if loss_type == LossType.triplet:
            frames.append(make_triplets(qid, query, relevant_combis['sent_2'],
                                        irrelevant_combis['sent_2']))
        else:
            frames.extend([relevant_combis, irrelevant_combis])

        if limit is not None and (i+1) >= limit:
            break
        if (i+1) % 50 == 0:
            print('processed query {:d} of {:d}'.format(i+1, len(all_qids)))

    # Empty per-query frames carry no rows; dropping them keeps pd.concat from
    # having to reconcile their placeholder dtypes.
    frames = [frame for frame in frames if len(frame) > 0]
    if frames:
        new_df = pd.concat(frames, ignore_index=True)
    else:
        new_df = pd.DataFrame(columns=columns)

    if shuffle:
        return new_df.sample(frac=1, random_state=random_state)
    return new_df


In [7]:
# Path of the input CSV inside data_dir.
fn = os.path.join(data_dir, 'queries_od.csv')

# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably
# an index that was written along with the CSV - consider index_col=0; confirm.
df = pd.read_csv(fn)

# Show the first three rows as a quick sanity check of the loaded columns.
df.head(3)

Unnamed: 0,qid,query,rel,type,doc
0,0,are cnn ratings falling,0,original,Mustard greens are also a good food choice for...
1,0,are cnn ratings falling,0,original,The only concessions Jay obtained was a surren...
2,0,are cnn ratings falling,0,original,Allen: Constitution Prevails Over President's ...


In [8]:
# Output directory for the generated training CSVs. Derived from data_dir so
# the base path is defined in one place only (same resulting path as before).
TRAIN_DATA_DIR = os.path.join(data_dir, 'train_data')

# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(TRAIN_DATA_DIR, exist_ok=True)

In [10]:
# Labeled (query, whole document) pairs, saved as CSV in TRAIN_DATA_DIR.
train_data_entiredoc_regression = make_train_data(
    df,
    doc_type=DocType.entire_doc,
    loss_type=LossType.regression,
)

train_data_entiredoc_regression.to_csv(
    os.path.join(TRAIN_DATA_DIR, 'queries_od_entiredoc_regression.csv'),
    index=None,
)

processed query 50 of 308
processed query 100 of 308
processed query 150 of 308
processed query 200 of 308
processed query 250 of 308
processed query 300 of 308


In [11]:
# Labeled (query sentence, document sentence) pairs, saved as CSV in TRAIN_DATA_DIR.
train_data_sentences_regression = make_train_data(
    df,
    doc_type=DocType.sentence,
    loss_type=LossType.regression,
)

train_data_sentences_regression.to_csv(
    os.path.join(TRAIN_DATA_DIR, 'queries_od_sentences_regression.csv'),
    index=None,
)

processed query 50 of 308
processed query 100 of 308
processed query 150 of 308
processed query 200 of 308
processed query 250 of 308
processed query 300 of 308


In [None]:
# (query, positive document, negative document) triplets, saved as CSV in TRAIN_DATA_DIR.
# NOTE(review): this cell has no execution count or recorded output.
train_data_entiredoc_triplet = make_train_data(
    df,
    doc_type=DocType.entire_doc,
    loss_type=LossType.triplet,
)

train_data_entiredoc_triplet.to_csv(
    os.path.join(TRAIN_DATA_DIR, 'queries_od_entiredoc_triplet.csv'),
    index=None,
)

In [None]:
# (query, positive sentence, negative sentence) triplets, saved as CSV in TRAIN_DATA_DIR.
# NOTE(review): this cell has no execution count or recorded output.
train_data_sentences_triplet = make_train_data(
    df,
    doc_type=DocType.sentence,
    loss_type=LossType.triplet,
)

train_data_sentences_triplet.to_csv(
    os.path.join(TRAIN_DATA_DIR, 'queries_od_sentences_triplet.csv'),
    index=None,
)

In [69]:
# Sentence-level pairs using the default loss_type (LossType.regression); the
# result is kept in memory only and is not written to disk in this cell.
# NOTE(review): these are the same arguments as the sentence/regression call
# earlier in the notebook, and the execution count (69) is out of sequence -
# this cell looks like a leftover; confirm whether it is still needed.
train_data_sentence_pairs = make_train_data(df, doc_type=DocType.sentence)

processed query 50 of 308
processed query 100 of 308
processed query 150 of 308
processed query 200 of 308
processed query 250 of 308
processed query 300 of 308
