In [89]:
import pandas as pd
import json

In [90]:
data_path = '../data/single-source-all.csv'
manual_path = '../data/manual_anno_done.csv'

In [91]:
wiki_dir = '../data/wiki-pages/'

# read wiki index from json dataframe and define wiki lookup function
with open('../data/wiki_index.json') as f:
    index_data = json.load(f)
    index_items = list(index_data.items())
    wiki_index = pd.DataFrame(index_items, columns=['filename', 'first_entry'])
    wiki_index['file_id'] = wiki_index['filename'].apply(lambda x: int(x.split('.')[0][5:]))


def wiki_file_lookup(wiki_name, wiki_filename):
    # Name to lowercase
    wiki_name = wiki_name.lower()

    # Check for object with id = wiki_name in file. If it exists, return it. If not, return None
    wiki_path = wiki_dir + wiki_filename
    wiki_df = pd.read_json(wiki_path, lines=True)

    # Id colum to lowercase
    wiki_df['id'] = wiki_df['id'].str.lower()
    wiki_df = wiki_df[wiki_df['id'] == wiki_name]

    if len(wiki_df) > 0:
        return wiki_df.iloc[0]['text']
    else:
        return None

def wiki_lookup(wiki_name, lookaround = 0):
    # Loop through first_entry column of wiki_index. If previous entry comes before wiki_name and next entry comes after alphabetically, set seed_id to previous entry
    seed_id = 0
    for i, row in wiki_index.iterrows():
        if row['first_entry'] > wiki_name:
            break
        seed_id = row['file_id']

    min_id = wiki_index['file_id'].min()
    max_id = wiki_index['file_id'].max()

    # Look in file with id = seed_id for wiki_name. If it exists, return it.
    entry = wiki_file_lookup(wiki_name, 'wiki-{:0>3}.jsonl'.format(seed_id))

    if entry:
        return entry

    for window in range(1, lookaround + 1):
        if seed_id - window >= min_id:
            entry = wiki_file_lookup(wiki_name, 'wiki-{:0>3}.jsonl'.format(seed_id - window))
            if entry:
                return entry
        if seed_id + window <= max_id:
            entry = entry or wiki_file_lookup(wiki_name, 'wiki-{:0>3}.jsonl'.format(seed_id + window))
            if entry:
                return entry

    return None 

In [92]:
# read data
df = pd.read_csv(data_path, sep=',', header=0)

# take first 1000 rows
df = df.iloc[:1000]

# read manually annotated data
df_manual = pd.read_csv(manual_path, sep=';', header=0)

# merge manual annotations into data
merged_df = df.merge(df_manual, on='id', how='left', suffixes=('', '_manual'))
df.loc[merged_df['evidence_article_manual'].notnull(), 'evidence_article'] = merged_df['evidence_article_manual']

In [93]:
# add wiki text to data in batches of batch_size, save each batch to file
batch_size = 100

for i in range(0, len(df), batch_size):
    print(i)
    batch_df = df.iloc[i:i+batch_size]
    batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))
    batch_df.to_csv('../data/df_with_text_{}.csv'.format(i), sep=';', header=True, index=False)

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


300


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


400


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


600


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


700


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))


900


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_df['evidence_text'] = batch_df['evidence_article'].apply(lambda x: wiki_lookup(x, 2))
