In [18]:
import os
import pandas as pd
import numpy as np
from enum import Enum
import itertools
import random
from sklearn.model_selection import train_test_split

In [3]:
import nltk

# Directory the NLTK resources are stored in.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
NLTK_DATA_DIR = '/run/media/root/Windows/Users/agnes/tmp'

# nltk.download() only stores the files; lookups go through nltk.data.path,
# so register the custom directory, otherwise sent_tokenize() is not
# guaranteed to find the 'punkt' model that is downloaded here.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download('punkt', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to
[nltk_data]     /run/media/root/Windows/Users/agnes/tmp...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Split the data into train(+dev) and test sets:
#
# - test keeps the same format as the input data used here
# - train(+dev) is generated in one of two ways:

# 1 (for sentence-level encoding)
# - only "original" documents are used
# - the 'doc' content is split into sentences
# - all sentences from a positive doc are labeled as positive
# - all sentences from a negative doc are labeled as negative
# - (optionally) positive examples are duplicated so that there are enough of them

# 2 (for entire-document encoding)
# - only "original" documents are used
# - (optionally) positive examples are duplicated so that there are enough of them

# The output can have one of the following layouts:
# normal:   qid, sent_1 (query), rel, sent_2 (doc)
# triplet:  qid, query, doc_positive, doc_negative


In [5]:
class DocType(Enum):
    """Granularity at which documents are paired with a query in make_train_data."""
    # every document is split into sentences and each sentence becomes its own example
    sentence = 0
    # every document is used as one unsplit text
    entire_doc = 1  

class LossType(Enum):
    """Layout of the rows produced by make_train_data."""
    # labeled pairs with the columns qid, sent_1, rel, sent_2
    regression = 0
    # rows with the columns qid, query, doc_positive, doc_negative
    triplet = 1
    

In [6]:
def get_combis_sentences(qid, query_sentences, docs_sentences, rel_label):
    """Pair every query sentence with every sentence of every given document.

    Args:
        qid: query id stored in the 'qid' column of every row.
        query_sentences: iterable of query sentences; each becomes 'sent_1'.
        docs_sentences: iterable of documents, each given as an iterable of
            its sentences; each sentence becomes 'sent_2'.
        rel_label: label stored in the 'rel' column of every row.

    Returns:
        DataFrame with the columns qid, sent_1, rel, sent_2 holding one row
        per (query sentence, document sentence) pair, ordered by query
        sentence, then document, then sentence. Empty (but with the same
        columns) when there is nothing to pair.
    """
    # Collect plain records and build the frame once: growing a DataFrame row
    # by row copies it on every step, and DataFrame.append was removed in
    # pandas 2.0.
    records = [
        {'qid': qid, 'sent_1': sent_1, 'rel': rel_label, 'sent_2': sent_2}
        for sent_1, doc_sentences in itertools.product(query_sentences, docs_sentences)
        for sent_2 in doc_sentences
    ]
    return pd.DataFrame(records, columns=['qid', 'sent_1', 'rel', 'sent_2'])


def get_combis_docs(qid, query, docs, rel_label):
    """Pair the whole query text with each of the given documents.

    Args:
        qid: query id stored in the 'qid' column of every row.
        query: query text stored in the 'sent_1' column of every row.
        docs: iterable of document texts; each becomes 'sent_2'.
        rel_label: label stored in the 'rel' column of every row.

    Returns:
        DataFrame with the columns qid, sent_1, rel, sent_2 holding one row
        per document, in input order. Empty (but with the same columns) when
        `docs` is empty.
    """
    # Build the frame once from plain records: row-by-row DataFrame.append is
    # quadratic and was removed in pandas 2.0.
    records = [
        {'qid': qid, 'sent_1': query, 'rel': rel_label, 'sent_2': doc}
        for doc in docs
    ]
    return pd.DataFrame(records, columns=['qid', 'sent_1', 'rel', 'sent_2'])


def make_triplets(qid, query, positives, negatives, max_triplets=50):
    """Build randomly chosen (query, positive, negative) triplets for one query.

    All (positive, negative) combinations are formed, shuffled with the
    `random` module, and the first `max_triplets` of them are kept.

    Args:
        qid: query id stored in the 'qid' column of every row.
        query: query text stored in the 'query' column of every row.
        positives: iterable of positive texts ('doc_positive').
        negatives: iterable of negative texts ('doc_negative').
        max_triplets: upper bound on the number of returned rows.

    Returns:
        DataFrame with the columns qid, query, doc_positive, doc_negative and
        at most `max_triplets` rows. Empty (but with the same columns) when
        either input is empty.
    """
    combinations = list(itertools.product(positives, negatives))
    random.shuffle(combinations)

    # Slice instead of breaking out of the loop after the append: the old
    # `i >= max_triplets` check ran too late and let one extra row through.
    selected = combinations[:max(max_triplets, 0)]

    # Build the frame once from plain records: row-by-row DataFrame.append is
    # quadratic and was removed in pandas 2.0.
    records = [
        {'qid': qid, 'query': query,
         'doc_positive': doc_positive, 'doc_negative': doc_negative}
        for doc_positive, doc_negative in selected
    ]
    return pd.DataFrame(records,
                        columns=['qid', 'query', 'doc_positive', 'doc_negative'])


def make_train_data(queries_df, rel_label=1, adjust_sample_bias=True,
                    doc_type=DocType.sentence, limit=None, shuffle=False,
                    loss_type=LossType.regression,
                    sample_factor=1,
                    max_triplets_per_query=50):
    """Build training rows from the "original" documents of every query.

    Only rows whose 'type' column equals 'original' are used. For each query,
    the documents whose 'rel' equals `rel_label` are treated as relevant and
    all others as irrelevant (label 0).

    Args:
        queries_df: DataFrame with the columns qid, query, rel, type and doc.
        rel_label: value of 'rel' that marks a relevant document; also used
            as the label of the relevant rows.
        adjust_sample_bias: if True, repeat the relevant rows of a query so
            that their number roughly matches the irrelevant ones.
        doc_type: DocType.sentence pairs query sentences with document
            sentences; DocType.entire_doc pairs the query with whole documents.
        limit: if given, stop after this many queries (at least one query is
            always processed).
        shuffle: if True, return the rows in random order.
        loss_type: LossType.regression yields labeled pairs
            (qid, sent_1, rel, sent_2); LossType.triplet yields
            (qid, query, doc_positive, doc_negative).
        sample_factor: if below 1, keep only this random fraction of the rows
            generated for each query.
        max_triplets_per_query: upper bound passed to make_triplets.

    Returns:
        DataFrame with the columns of the chosen `loss_type` and a fresh
        0..n-1 index (shuffled when `shuffle` is True).

    Raises:
        ValueError: if `loss_type` or `doc_type` is not a known member.
    """
    # Validate the options up front so that a bad argument fails immediately
    # rather than somewhere inside the per-query loop.
    if loss_type == LossType.regression:
        out_columns = ['qid', 'sent_1', 'rel', 'sent_2']
    elif loss_type == LossType.triplet:
        out_columns = ['qid', 'query', 'doc_positive', 'doc_negative']
    else:
        raise ValueError('unknown LossType: {}'.format(loss_type))
    if doc_type not in (DocType.sentence, DocType.entire_doc):
        raise ValueError('unknown DocType: {}'.format(doc_type))

    # Filter first and derive the query ids from the filtered frame: a query
    # without any "original" row would otherwise yield an empty group and
    # fail on `iloc[0]`.
    queries_df = queries_df.loc[queries_df['type'] == 'original']
    n_queries = queries_df['qid'].nunique()

    frames = []
    # groupby visits the queries in sorted qid order and avoids re-scanning
    # the whole frame once per query.
    for i, (qid, qid_df) in enumerate(queries_df.groupby('qid', sort=True)):
        qid_df = qid_df.reset_index(drop=True)
        query = qid_df.iloc[0]["query"]

        rel_docs_mask = qid_df['rel'].values == rel_label

        if doc_type == DocType.sentence:
            query_sentences = nltk.tokenize.sent_tokenize(query)
            docs_sentences = qid_df['doc'].apply(nltk.tokenize.sent_tokenize).values
            relevant_combis = get_combis_sentences(qid, query_sentences,
                                                   docs_sentences[rel_docs_mask], rel_label)
            irrelevant_combis = get_combis_sentences(qid, query_sentences,
                                                     docs_sentences[~rel_docs_mask], 0)
        else:
            relevant_combis = get_combis_docs(qid, query, qid_df['doc'][rel_docs_mask], rel_label)
            irrelevant_combis = get_combis_docs(qid, query, qid_df['doc'][~rel_docs_mask], 0)

        # Repeat the relevant rows so there are about as many as irrelevant
        # ones. Guard against queries without relevant rows (division by
        # zero) and against a factor that rounds down to 0 (which would try
        # to concatenate an empty list).
        # NOTE(review): for triplets this only adds repeated
        # (positive, negative) pairs to the pool make_triplets draws from.
        if adjust_sample_bias and len(relevant_combis) > 0:
            scale_factor = int(np.around(len(irrelevant_combis) / len(relevant_combis)))
            if scale_factor > 1:
                relevant_combis = pd.concat([relevant_combis] * scale_factor)

        if loss_type == LossType.triplet:
            data_to_add = make_triplets(qid, query, relevant_combis['sent_2'],
                                        irrelevant_combis['sent_2'],
                                        max_triplets=max_triplets_per_query)
        else:
            # Always a single frame, so the sampling below works for both
            # loss types.
            data_to_add = pd.concat([relevant_combis, irrelevant_combis])

        if sample_factor < 1:
            # `frac=` is required for a fraction; a bare float would be taken
            # as the row count `n` and rejected.
            data_to_add = data_to_add.sample(frac=sample_factor)

        if len(data_to_add) > 0:
            frames.append(data_to_add)

        if limit is not None and (i+1) >= limit:
            break
        if (i+1) % 50 == 0:
            print('processed query {:d} of {:d}'.format(i+1, n_queries))

    # Concatenate once at the end instead of appending inside the loop.
    if frames:
        new_df = pd.concat(frames, ignore_index=True)[out_columns]
    else:
        new_df = pd.DataFrame(columns=out_columns)

    if shuffle:
        return new_df.sample(frac=1)
    return new_df


In [65]:
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere
data_dir = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco'

# input CSV; the columns used later in this notebook are qid, query, rel, type and doc
fn = os.path.join(data_dir, 'queries3_od.csv')

df = pd.read_csv(fn)

# preview the first three rows
df.head(3)

Unnamed: 0,qid,query,rel,type,doc
0,0,aaa a common cause of a skid is,0,original,"6. When diamonds are what you want, only the f..."
1,0,aaa a common cause of a skid is,0,original,There are probably 50-80 grams of sugar in a c...
2,0,aaa a common cause of a skid is,0,original,price ceiling: A legally determined maximum pr...


In [66]:
# Directory the generated training CSVs and the split file are written to.
# NOTE(review): absolute, machine-specific path; no cell visible here creates it.
TRAIN_DATA_DIR = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_data'

In [70]:
# Input file name without directory and extension; prefixes every output file name.
input_basename = os.path.basename(fn)
ofn_prefix = os.path.splitext(input_basename)[0]
ofn_prefix

'queries3_od'

In [71]:
# Regression pairs built from whole documents; written to TRAIN_DATA_DIR as CSV.
train_data_entiredoc_regression = make_train_data(
    df,
    doc_type=DocType.entire_doc,
    loss_type=LossType.regression,
)

entiredoc_regression_path = os.path.join(
    TRAIN_DATA_DIR, f'{ofn_prefix}_entiredoc_regression.csv')
train_data_entiredoc_regression.to_csv(entiredoc_regression_path, index=None)

processed query 50 of 20117
processed query 100 of 20117
processed query 150 of 20117
processed query 200 of 20117
processed query 250 of 20117
processed query 300 of 20117
processed query 350 of 20117
processed query 400 of 20117
processed query 450 of 20117
processed query 500 of 20117
processed query 550 of 20117
processed query 600 of 20117
processed query 650 of 20117
processed query 700 of 20117
processed query 750 of 20117
processed query 800 of 20117
processed query 850 of 20117
processed query 900 of 20117
processed query 950 of 20117
processed query 1000 of 20117
processed query 1050 of 20117
processed query 1100 of 20117
processed query 1150 of 20117
processed query 1200 of 20117
processed query 1250 of 20117
processed query 1300 of 20117
processed query 1350 of 20117
processed query 1400 of 20117
processed query 1450 of 20117
processed query 1500 of 20117
processed query 1550 of 20117
processed query 1600 of 20117
processed query 1650 of 20117
processed query 1700 of 20117


processed query 13650 of 20117
processed query 13700 of 20117
processed query 13750 of 20117
processed query 13800 of 20117
processed query 13850 of 20117
processed query 13900 of 20117
processed query 13950 of 20117
processed query 14000 of 20117
processed query 14050 of 20117
processed query 14100 of 20117
processed query 14150 of 20117
processed query 14200 of 20117
processed query 14250 of 20117
processed query 14300 of 20117
processed query 14350 of 20117
processed query 14400 of 20117
processed query 14450 of 20117
processed query 14500 of 20117
processed query 14550 of 20117
processed query 14600 of 20117
processed query 14650 of 20117
processed query 14700 of 20117
processed query 14750 of 20117
processed query 14800 of 20117
processed query 14850 of 20117
processed query 14900 of 20117
processed query 14950 of 20117
processed query 15000 of 20117
processed query 15050 of 20117
processed query 15100 of 20117
processed query 15150 of 20117
processed query 15200 of 20117
processe

In [None]:
# Regression pairs built sentence by sentence; written to TRAIN_DATA_DIR as CSV.
train_data_sentences_regression = make_train_data(
    df,
    doc_type=DocType.sentence,
    loss_type=LossType.regression,
)

sentences_regression_path = os.path.join(
    TRAIN_DATA_DIR, f'{ofn_prefix}_sentences_regression.csv')
train_data_sentences_regression.to_csv(sentences_regression_path, index=None)

processed query 50 of 20117
processed query 100 of 20117
processed query 150 of 20117
processed query 200 of 20117
processed query 250 of 20117
processed query 300 of 20117
processed query 350 of 20117
processed query 400 of 20117
processed query 450 of 20117
processed query 500 of 20117
processed query 550 of 20117
processed query 600 of 20117
processed query 650 of 20117
processed query 700 of 20117
processed query 750 of 20117
processed query 800 of 20117
processed query 850 of 20117
processed query 900 of 20117
processed query 950 of 20117
processed query 1000 of 20117
processed query 1050 of 20117
processed query 1100 of 20117
processed query 1150 of 20117
processed query 1200 of 20117
processed query 1250 of 20117
processed query 1300 of 20117
processed query 1350 of 20117
processed query 1400 of 20117
processed query 1450 of 20117
processed query 1500 of 20117
processed query 1550 of 20117
processed query 1600 of 20117
processed query 1650 of 20117
processed query 1700 of 20117


In [None]:
# Triplets built from whole documents (capped per query); written to TRAIN_DATA_DIR as CSV.
train_data_entiredoc_triplet = make_train_data(
    df,
    doc_type=DocType.entire_doc,
    loss_type=LossType.triplet,
    max_triplets_per_query=30,
)

entiredoc_triplet_path = os.path.join(
    TRAIN_DATA_DIR, f'{ofn_prefix}_entiredoc_triplet.csv')
train_data_entiredoc_triplet.to_csv(entiredoc_triplet_path, index=None)

In [None]:
# Triplets built sentence by sentence (capped per query); written to TRAIN_DATA_DIR as CSV.
train_data_sentences_triplet = make_train_data(
    df,
    doc_type=DocType.sentence,
    loss_type=LossType.triplet,
    max_triplets_per_query=30,
)

sentences_triplet_path = os.path.join(
    TRAIN_DATA_DIR, f'{ofn_prefix}_sentences_triplet.csv')
train_data_sentences_triplet.to_csv(sentences_triplet_path, index=None)

In [53]:
# split data into train, dev and test sets
##########################################

# Sort the ids (as make_train_data does) so that the order handed to
# train_test_split - and with it the seeded split - does not depend on set
# iteration order.
all_qids = sorted(set(df['qid']))

# hold out 400 queries for test, then 150 of the remaining ones for dev
qids_train, qids_test = train_test_split(all_qids, test_size=400, random_state=1234)

qids_train, qids_dev = train_test_split(qids_train, test_size=150, random_state=1234)

print('nr train: {:d}, nr. dev: {:d}, nr. test: {:d}'.format(
    len(qids_train), len(qids_dev), len(qids_test)))


nr train: 19567, nr. dev: 150, nr. test: 400


In [54]:
# Record which split each query id belongs to and persist that mapping as CSV.
split_labels = (['train'] * len(qids_train)
                + ['dev'] * len(qids_dev)
                + ['test'] * len(qids_test))
split_df = pd.DataFrame({
    'qid': qids_train + qids_dev + qids_test,
    'type': split_labels,
})

split_ofname = os.path.join(TRAIN_DATA_DIR, f'{ofn_prefix}_split.csv')
split_df.to_csv(split_ofname, index=None)

print('saved train/dev/test split to "{:s}"'.format(split_ofname))

saved train/dev/test split to "/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_data/queries3_od_split.csv"


In [1]:
# NOTE(review): scratch arithmetic; what 31 and 1000 stand for is not recorded here - confirm or remove
31*1000

31000

In [3]:
# 31000 expressed as a percentage of 488047
# NOTE(review): presumably 488047 is a row count of some data set - confirm or remove
31000 / 488047 * 100

6.351847260612195