Skip to content

Commit

Permalink
Add utils script to convert pandas df (title, content) to SQuAD #13
Browse files Browse the repository at this point in the history
  • Loading branch information
fmikaelian committed Feb 15, 2019
1 parent 15fb12a commit 487ad18
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 0 deletions.
6 changes: 6 additions & 0 deletions examples/run_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Example: convert a CSV with 'title' and 'content' columns to a SQuAD-format JSON file."""
import importlib

import pandas as pd

# The package directory is named 'reading-comprehension'. A hyphen is not a
# valid character in an `import` statement (it parses as a minus sign and is a
# SyntaxError), so the module is loaded from its dotted path given as a string.
# NOTE(review): this assumes the repository root is on sys.path -- confirm.
df2squad = importlib.import_module(
    'reading-comprehension.utils.converter').df2squad

df = pd.read_csv('data.csv')

# Writes 'custom-train-v2.0.json' into the current directory and also returns
# the same data as a dict.
json_data = df2squad(df=df, version='v2.0', output_dir='./')
38 changes: 38 additions & 0 deletions reading-comprehension/utils/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import json
import os

def df2squad(df, version='v2.0', output_dir=None):
    """
    Converts a pandas dataframe with columns ['title', 'content'] to a json file with SQuAD format.

    Every row becomes one entry of ``data``. The row's content has tab,
    non-breaking-space and carriage-return characters removed, is split on
    newlines, and each non-empty line becomes a paragraph ``context`` with an
    empty ``qas`` list. A row whose content is not a string (for example NaN
    from an empty CSV cell) gets an empty ``paragraphs`` list.

    Arguments:
        df {pd.DataFrame} -- a pandas dataframe with columns ['title', 'content']
        version {string} -- the SQuAD dataset version format

    Keyword Arguments:
        output_dir {string} -- Directory in which 'custom-train-<version>.json'
            is written; it is created if missing. Nothing is written when
            this is None or empty. (default: {None})

    Returns:
        json_data -- A dict with SQuAD format:
            {'version': ..., 'data': [{'title': ..., 'paragraphs': [...]}]}
    """

    # One translation table removes all unwanted characters in a single pass.
    strip_table = str.maketrans('', '', '\t\xa0\r')

    json_data = {'version': version, 'data': []}

    for _, row in df.iterrows():
        content = row['content']
        if isinstance(content, str):
            lines = content.translate(strip_table).split('\n')
        else:
            # Missing cells are read as float NaN, which has no string methods.
            lines = []
        paragraphs = [{'context': line, 'qas': []} for line in lines if line]
        json_data['data'].append({'title': row['title'],
                                  'paragraphs': paragraphs})

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        file_name = 'custom-train-{}.json'.format(version)
        with open(os.path.join(output_dir, file_name), 'w',
                  encoding='utf-8') as outfile:
            json.dump(json_data, outfile)

    return json_data

0 comments on commit 487ad18

Please sign in to comment.