In [1]:
import os

In [2]:
from enum import Enum

In [3]:
import pandas as pd
import numpy as np

In [4]:
import nltk

# Directory that holds the NLTK tokenizer models used by sent_tokenize.
NLTK_DATA_DIR = '/run/media/root/Windows/Users/agnes/tmp'

# The models are downloaded into a non-default location, so that location has
# to be on NLTK's search path as well; otherwise the lookup can fail even
# though the download itself succeeded.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download('punkt', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to
[nltk_data]     /run/media/root/Windows/Users/agnes/tmp...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import random

In [6]:
'''
We're going to make:

output df:
qid, query, rel (relevance), type, doc (document)


type can be:
   original
   degree_2
   degree_3
   degree_4
   degree_4_split
   degree_8
   degree_8_split
   
source of irrelevant docs can be:
    query_docs
    otherquery_docs

for now, we use query_docs.
'''

"\nWe're going to make:\n\noutput df:\nquery, document, relevance, type  \n\n\ntype can be:\n   original    \n   degree_2\n   degree_3\n   degree_4\n   degree_4_split  \n   \nsource of irrelevant docs can be:\n    query_docs\n    otherquery_docs\n\nfor now, we use query_docs.\n"

In [7]:
def preprocess_text(txt):
    '''
    Replace common mis-decoded UTF-8 punctuation sequences with ASCII.

    Curly single and double quotes all become a plain apostrophe, and
    em/en dashes become a hyphen. Any other text is returned untouched.
    '''
    replacements = (
        ("â\x80\x99", "'"),   # right single quote
        ("â\x80\x98", "'"),   # left single quote
        ("â\x80\x94", "-"),   # em dash
        ("â\x80\x93", "-"),   # en dash
        ("â\x80\x9C", "'"),   # left double quote
        ("â\x80\x9D", "'"),   # right double quote
    )
    for garbled, plain in replacements:
        txt = txt.replace(garbled, plain)
    return txt

In [8]:
class LazyMsMarcoReader():
    '''
    Reads the files of a directory lazily, several files at a time.

    Every regular file in the directory is parsed as a header-less,
    tab-separated file with the three columns 'query', 'rel' and 'irrel'.
    Rows with a missing value are dropped and every cell is passed through
    preprocess_text.
    '''

    def __init__(self, directory, batch_size=10):
        '''
        directory:  folder whose regular files are read
        batch_size: number of files concatenated into one yielded DataFrame
        '''
        self.dir = directory
        self.batch_size = batch_size
        # One-shot directory iterator: after get_batch() has consumed it,
        # a second call to get_batch() yields nothing.
        self.sd_object = os.scandir(directory)
        # Number of files read so far.
        self.counter = 0
        
    def get_batch(self):
        '''
        Generator yielding one concatenated DataFrame per batch_size files.

        The final batch may contain fewer than batch_size files; it is
        yielded as well, so no input file is silently discarded.
        '''
        dfs = []
        for f in self.sd_object:
            if not f.is_file():
                continue
            # dtype=str keeps numeric-looking cells as text so that
            # preprocess_text can always call str.replace on them.
            df = pd.read_csv(os.path.join(self.dir, f.name), sep='\t', 
                             header=None, names=['query', 'rel', 'irrel'],
                             dtype=str)
            df = df.dropna()
            # Element-wise cleaning, spelled with apply/map so it does not
            # rely on the deprecated DataFrame.applymap.
            df = df.apply(lambda column: column.map(preprocess_text))
            dfs.append(df)
            self.counter += 1
            if len(dfs) >= self.batch_size:
                yield pd.concat(dfs)
                dfs = []

        # Flush the files left over when the file count is not a multiple
        # of batch_size.
        if dfs:
            yield pd.concat(dfs)

In [9]:
def generate_degree_n_docs(rel_doc, irrel_docs, n, amount_to_generate=3):
    '''
    Build synthetic documents that mix one relevant document with noise.

    Each generated document is the space-joined, randomly ordered
    combination of rel_doc and n-1 distinct entries of irrel_docs.

    rel_doc:            the relevant document (string)
    irrel_docs:         indexable collection of irrelevant documents (strings)
    n:                  total number of documents merged into each result
    amount_to_generate: number of merged documents to produce

    Returns a list of amount_to_generate strings, or False when n < 2 or
    n exceeds the number of irrelevant documents.
    '''
    if n > len(irrel_docs) or n < 2:
        return False
    
    docs_list = []
    for _ in range(amount_to_generate):
        # Sample over ALL irrelevant documents; sampling from
        # range(len(irrel_docs) - 1) would never pick the last one.
        random_nums = random.sample(range(len(irrel_docs)), n - 1)
        doc_list = [rel_doc] + [irrel_docs[random_num] for random_num in random_nums]
        random.shuffle(doc_list)
        docs_list.append(' '.join(doc_list))
    return docs_list


In [10]:
def generate_degree_n_docs_split(rel_doc, irrel_docs, n, amount_to_generate=3):    
    '''
    Like generate_degree_n_docs, but the relevant document is cut in two.

    rel_doc is split into sentences and divided at the middle sentence. Both
    halves are placed, in their original order, on two different even
    positions of an (n+1)-slot layout, so at least one irrelevant document
    always sits between them. The remaining n-1 slots are filled with
    distinct random entries of irrel_docs. The slots are joined with spaces.

    rel_doc:            the relevant document (string)
    irrel_docs:         indexable collection of irrelevant documents (strings)
    n:                  degree; n-1 irrelevant documents are mixed in
    amount_to_generate: number of merged documents to produce

    Returns a list of amount_to_generate strings, or False when n < 2,
    n exceeds the number of irrelevant documents, or rel_doc has fewer than
    two sentences.
    '''
    if n > len(irrel_docs) or n < 2:
        return False
    
    sent_rel_split = nltk.tokenize.sent_tokenize(rel_doc)
    if len(sent_rel_split) < 2:
        return False
    
    split_point = len(sent_rel_split) // 2
    sent_rel_pt1 = ' '.join(sent_rel_split[:split_point])
    sent_rel_pt2 = ' '.join(sent_rel_split[split_point:])

    docs_list = []
    for _ in range(amount_to_generate):
        doc_list = [''] * (n + 1)
        # Two distinct even positions; sorting keeps part 1 before part 2.
        index_pt_1, index_pt_2 = sorted(random.sample(range(0, n + 1, 2), 2))
        doc_list[index_pt_1] = sent_rel_pt1
        doc_list[index_pt_2] = sent_rel_pt2
        
        other_indices = [slot for slot in range(n + 1)
                         if slot not in (index_pt_1, index_pt_2)]
        random.shuffle(other_indices)

        # Sample over ALL irrelevant documents; sampling from
        # range(len(irrel_docs) - 1) would never pick the last one.
        # The guard at the top guarantees n - 1 <= len(irrel_docs).
        random_nums = random.sample(range(len(irrel_docs)), n - 1)
        for slot, random_num in zip(other_indices, random_nums):
            doc_list[slot] = irrel_docs[random_num]
            
        docs_list.append(' '.join(doc_list))
    return docs_list

In [11]:
def make_extra_data_for_query(qid, query, doc_rel, docs_irrel, 
                              nr_per_degree=3,
                              columns=['qid', 'query', 'rel', 'type', 'doc']):
    '''
    Assemble all rows (original and generated) belonging to one query.

    The result holds, in this order: every irrelevant document (rel 0, type
    'original'), the relevant document (rel 1, type 'original') and then
    nr_per_degree generated documents (rel 1) for each of the types
    degree_2, degree_3, degree_4, degree_4_split, degree_8, degree_8_split.

    Returns a DataFrame with the given columns, or an empty DataFrame when
    any of the generators could not produce documents.
    '''
    # (type label, generator, degree), in the order the rows are emitted.
    recipes = [
        ('degree_2', generate_degree_n_docs, 2),
        ('degree_3', generate_degree_n_docs, 3),
        ('degree_4', generate_degree_n_docs, 4),
        ('degree_4_split', generate_degree_n_docs_split, 4),
        ('degree_8', generate_degree_n_docs, 8),
        ('degree_8_split', generate_degree_n_docs_split, 8),
    ]

    # Run every generator before checking for failures.
    generated = [
        (label, build(doc_rel, docs_irrel, degree, amount_to_generate=nr_per_degree))
        for label, build, degree in recipes
    ]
    if any(not variants for _, variants in generated):
        return pd.DataFrame()

    nr_irrel = len(docs_irrel)
    rels = [0] * nr_irrel + [1] + [1] * (nr_per_degree * 6)

    docs = docs_irrel + [doc_rel]
    types = ['original'] * nr_irrel + ['original']
    for label, variants in generated:
        docs = docs + variants
        types = types + [label] * nr_per_degree

    nr_rows = len(types)
    combined = list(zip([qid] * nr_rows, [query] * nr_rows, rels, types, docs))

    return pd.DataFrame(combined, columns=columns)

In [12]:
class IrrelevantSource(Enum):
    '''
    Selects the pool from which a query's irrelevant documents are taken.
    '''
    # take the 'irrel' documents listed on the rows of the query itself
    same_query = 0
    # take the 'irrel' documents listed on the rows of the other queries in the same batch
    other_query = 1  

In [13]:
def _combine_query_frames(frames, columns):
    '''Concatenate the per-query frames once; empty input gives an empty frame with the given columns.'''
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def generate_extended_data(input_data_dir,
                           max_amount_queries=None,
                           reader_batch_size=10, 
                           nr_orig_irrel_docs=15,
                           nr_per_degree=4,
                           irrelevant_source=IrrelevantSource.same_query):
    '''
    Build the extended data set from a directory of query files.

    input_data_dir:     directory read through LazyMsMarcoReader
    max_amount_queries: stop after this many queries were added (None = no limit)
    reader_batch_size:  number of files the reader loads per batch
    nr_orig_irrel_docs: number of irrelevant documents sampled per query;
                        queries with fewer candidates are skipped
    nr_per_degree:      number of generated documents per type
    irrelevant_source:  IrrelevantSource member choosing whether irrelevant
                        documents come from the query itself or from the
                        other queries of the same batch

    Returns a DataFrame with the columns qid, query, rel, type, doc. Only the
    first relevant document of a query is used. qids are assigned
    consecutively to the queries that were actually added.

    The per-query frames are collected in a list and concatenated once at
    the end. As a consequence the result has a fresh RangeIndex and the
    qid / rel columns keep their integer dtype.
    '''
    output_columns = ['qid', 'query', 'rel', 'type', 'doc']
    folder_reader = LazyMsMarcoReader(input_data_dir, batch_size=reader_batch_size)

    query_frames = []
    qid_counter = 0

    for df in folder_reader.get_batch():
        # unique() keeps first-appearance order, so the qid assignment does
        # not depend on set iteration order.
        for query in df['query'].unique():
            belongs_to_query = df['query'] == query
            query_df = df[belongs_to_query]
            doc_rel = query_df['rel'].iloc[0]

            if irrelevant_source == IrrelevantSource.same_query:
                docs_irrel = query_df['irrel'].values.tolist()
            else:
                docs_irrel = df.loc[~belongs_to_query, 'irrel'].values.tolist()

            # Not enough irrelevant documents to sample from: skip the query.
            if len(docs_irrel) < nr_orig_irrel_docs:
                continue
            docs_irrel = random.sample(docs_irrel, nr_orig_irrel_docs)

            extra_data_for_query = make_extra_data_for_query(qid_counter, query, doc_rel, docs_irrel,
                                                             nr_per_degree=nr_per_degree)

            # An empty frame means one of the generators failed for this query.
            if len(extra_data_for_query) == 0:
                continue
            query_frames.append(extra_data_for_query)

            qid_counter += 1
            if max_amount_queries is not None and qid_counter >= max_amount_queries:
                return _combine_query_frames(query_frames, output_columns)
            if qid_counter % 100 == 0:
                print('Processed {:d} queries of maximum {:}.'.format(qid_counter, max_amount_queries))
    print('Done. Processed {:d} queries.'.format(qid_counter))
    return _combine_query_frames(query_frames, output_columns)

In [18]:
data_dir = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/queries4'

# The generators draw from the global `random` module, so seed it to make a
# re-run repeatable for the same input files read in the same order.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

df = generate_extended_data(data_dir, 
                            irrelevant_source=IrrelevantSource.same_query,
                            nr_orig_irrel_docs=8,
                            reader_batch_size=10)

# The file is written next to the input directory, without the index column.
output_file_path = os.path.join(data_dir, '../queries4.csv')
df.to_csv(output_file_path, index=False)
print('wrote to "{:s}"'.format(output_file_path))

Processed 100 queries of maximum None.
Processed 200 queries of maximum None.
Processed 300 queries of maximum None.
Processed 400 queries of maximum None.
Processed 500 queries of maximum None.
Processed 600 queries of maximum None.
Processed 700 queries of maximum None.
Processed 800 queries of maximum None.
Processed 900 queries of maximum None.
Processed 1000 queries of maximum None.
Processed 1100 queries of maximum None.
Processed 1200 queries of maximum None.
Processed 1300 queries of maximum None.
Processed 1400 queries of maximum None.
Processed 1500 queries of maximum None.
Processed 1600 queries of maximum None.
Processed 1700 queries of maximum None.
Processed 1800 queries of maximum None.
Processed 1900 queries of maximum None.
Processed 2000 queries of maximum None.
Processed 2100 queries of maximum None.
Processed 2200 queries of maximum None.
Processed 2300 queries of maximum None.
Processed 2400 queries of maximum None.
Processed 2500 queries of maximum None.
Processed

In [15]:
# Sanity check on the queries with qid < 100: per document type, print the
# average number of whitespace-separated words and of sentences.

test_df = df[df['qid'] < 100]

all_types = sorted(test_df['type'].unique())

report_line = 'type: {:s}\tavg. nr. words: {:2.2f}, avg nr. sentences: {:2.2f}'

for ttype in all_types:
    subset = test_df.loc[test_df['type'] == ttype]
    docs = subset['doc'].values
    word_counts = [len(doc.split()) for doc in docs]
    sentence_counts = [len(nltk.tokenize.sent_tokenize(doc)) for doc in docs]
    print(report_line.format(ttype, np.mean(word_counts), np.mean(sentence_counts)))

type: degree_2	avg. nr. words: 120.16, avg nr. sentences: 7.15
type: degree_3	avg. nr. words: 178.86, avg nr. sentences: 10.73
type: degree_4	avg. nr. words: 235.87, avg nr. sentences: 14.12
type: degree_4_split	avg. nr. words: 239.16, avg nr. sentences: 14.27
type: degree_8	avg. nr. words: 474.66, avg nr. sentences: 28.19
type: degree_8_split	avg. nr. words: 474.68, avg nr. sentences: 28.15
type: original	avg. nr. words: 58.74, avg nr. sentences: 3.59


In [16]:
# Rough estimate of the number of tokens: whitespace-separated words per document.
df['sequence_length'] = df['doc'].str.split().str.len()

# Summarise only the new column, so the table does not change with the
# dtypes of the other columns (e.g. qid / rel being numeric or object).
df.groupby(by='type')[['sequence_length']].describe()

Unnamed: 0_level_0,sequence_length,sequence_length,sequence_length,sequence_length,sequence_length,sequence_length,sequence_length,sequence_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
degree_2,121828.0,120.735841,34.965585,28.0,94.0,115.0,142.0,337.0
degree_3,121828.0,180.817144,43.868438,47.0,148.0,175.0,208.0,429.0
degree_4,121828.0,240.782119,52.09666,92.0,202.0,235.0,273.0,537.0
degree_4_split,121828.0,240.663698,52.123395,85.0,202.0,235.0,273.0,541.0
degree_8,121828.0,480.62475,81.435538,241.0,423.0,472.0,529.0,932.0
degree_8_split,121828.0,480.63345,81.436908,241.0,423.0,472.0,529.0,932.0
original,274113.0,60.068913,24.712923,1.0,43.0,53.0,74.0,229.0


In [17]:
# Distribution of query lengths, measured in whitespace-separated words.
query_word_counts = df['query'].map(lambda query_text: len(query_text.split()))
query_word_counts.describe()

count    1.005081e+06
mean     5.873625e+00
std      2.376121e+00
min      2.000000e+00
25%      4.000000e+00
50%      6.000000e+00
75%      7.000000e+00
max      3.000000e+01
Name: query, dtype: float64