# IDK : Incomplete Data Knowledge base question answering

Construction of the IDK dataset, introduced in the paper *Question Answering when Knowledge Bases are Incomplete*.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/camillepradel/idk/blob/master/IDK_build_dataset.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/camillepradel/idk/blob/master/IDK_build_dataset.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Citing
If you use our dataset, please cite our paper:

```
@inproceedings{pradel-2020-idk,
    title = "Question Answering when Knowledge Bases are Incomplete",
    author = "Pradel, Camille and Sileo, Damien and Rodrigo, Álvaro and Peñas, Anselmo and Agirre, Eneko",
    booktitle = "Proceedings of the Eleventh International Conference of the CLEF Association (CLEF 2020)",
    year = "2020"
}
```
and the original Spider paper:
```
@inproceedings{Yu&al.18c,
  title     = {Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task},
  author    = {Tao Yu and Rui Zhang and Kai Yang and Michihiro Yasunaga and Dongxu Wang and Zifan Li and James Ma and Irene Li and Qingning Yao and Shanelle Roman and Zilin Zhang and Dragomir Radev},
  booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  address   = "Brussels, Belgium",
  publisher = "Association for Computational Linguistics",
  year      = 2018
}
```

## Setup

In [None]:
import os
import pandas as pd
import sqlite3
import sqlparse
import random
import json
from tqdm.notebook import tqdm
import shutil
from enum import Enum

In [None]:
# Everything is downloaded and generated below the notebook's working directory.
base_path = os.getcwd()

# Spider repository (preprocessing scripts) and original Spider dataset.
spider_repo_path = f"{base_path}/spider"
spider_data_path = f"{base_path}/dataset"
original_databases_path = f"{spider_data_path}/database"
dev_path = f"{spider_data_path}/dev.json"
train_path = f"{spider_data_path}/train_spider.json"
dataset_tables = f"{spider_data_path}/tables.json"

# Altered (IDK) dataset; it mirrors the layout of the original one.
altered_spider_data_path = f"{base_path}/altered_dataset"
altered_databases_path = f"{altered_spider_data_path}/database"
altered_dev_path = f"{altered_spider_data_path}/dev.json"
altered_train_path = f"{altered_spider_data_path}/train_spider.json"
altered_dataset_tables = f"{altered_spider_data_path}/tables.json"

# Variants with spaces backslash-escaped, for use in `!` / `%` shell commands.
# Each one is derived from its Python counterpart so the two cannot drift apart.
spider_repo_path_bash = spider_repo_path.replace(' ', '\\ ')
spider_data_path_bash = spider_data_path.replace(' ', '\\ ')
altered_spider_data_path_bash = altered_spider_data_path.replace(' ', '\\ ')
original_databases_path_bash = original_databases_path.replace(' ', '\\ ')
altered_databases_path_bash = altered_databases_path.replace(' ', '\\ ')
altered_dev_path_bash = altered_dev_path.replace(' ', '\\ ')
dataset_tables_bash = dataset_tables.replace(' ', '\\ ')
altered_dataset_tables_bash = altered_dataset_tables.replace(' ', '\\ ')

In [None]:
# download the original Spider dataset
# more info about the dataset at https://yale-lily.github.io/spider
# the dataset is hosted on a public Google Drive: https://drive.google.com/uc?export=download&id=11icoH_EA-NYb0OrPTdehRWm_d7-DIzWX
# we use the method from https://www.matthuisman.nz/2019/01/download-google-drive-files-wget-curl.html to download it
%cd $base_path

# first request: store the session cookies and extract the "confirm" token from the response
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=11icoH_EA-NYb0OrPTdehRWm_d7-DIzWX' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt
# second request: send the cookies back together with the token and save the archive
!curl -L -b cookies.txt -o spider.zip 'https://docs.google.com/uc?export=download&id=11icoH_EA-NYb0OrPTdehRWm_d7-DIzWX&confirm='$(<confirm.txt)
!rm -f confirm.txt cookies.txt

# unpack the archive, then move the content of its spider/ folder directly into the dataset folder
!unzip -q spider.zip -d $spider_data_path_bash
%mv $spider_data_path_bash/spider/* $spider_data_path_bash/
%rm -r $spider_data_path_bash/spider
%rm spider.zip

In [None]:
# clone the Spider repository and reset it to a fixed commit hash, so that the
# preprocessing scripts (and the patch applied to them below) are reproducible
!git clone https://github.com/taoyds/spider.git $spider_repo_path_bash
%cd $spider_repo_path_bash
!git reset --hard 2b663fb9e77e079dd468086cbc16802fc149b36e

In [None]:
#@title
# apply the necessary edits to spider/preprocess/get_tables.py:
#  - remove unused nltk import
#  - fix tab problems
#  - catch "expected" exception (KeyError for databases missing from the previous tables file)
#  - add a 'column_uninterpreted_types' entry per column (presumably the raw declared SQL type - TODO confirm against get_tables.py)
#  - treat every column whose PRAGMA table_info pk field is >= 1 as a primary key (instead of == 1 only)
# the edits are kept as a unified diff in patch_file_content, which is written to disk and applied with git
%cd $spider_repo_path_bash
patch_file_content = """diff --git a/preprocess/get_tables.py b/preprocess/get_tables.py
index a8b36f0..7a7156a 100644
--- a/preprocess/get_tables.py
+++ b/preprocess/get_tables.py
@@ -4,7 +4,7 @@ import json
 import sqlite3
 from os import listdir, makedirs
 from os.path import isfile, isdir, join, split, exists, splitext
-from nltk import word_tokenize, tokenize
+# from nltk import word_tokenize, tokenize
 import traceback
 
 EXIST = {"atis", "geo", "advising", "yelp", "restaurants", "imdb", "academic"}
@@ -26,7 +26,7 @@ def convert_fk_index(data):
             if ref_cid and cid:
                 fk_holder.append([cid, ref_cid])
         except:
-""" + '\t' + """    traceback.print_exc()
+            traceback.print_exc()
             print "table_names_original: ", data['table_names_original']
             print "finding tab name: ", tn, ref_tn
             sys.exit()
@@ -46,6 +46,7 @@ def dump_db_json_schema(db, f):
          'column_names_original': [(-1, '*')],
          'column_names': [(-1, '*')],
          'column_types': ['text'],
+         'column_uninterpreted_types': ['IGNORE'],
          'primary_keys': [],
          'foreign_keys': []}
 
@@ -56,7 +57,7 @@ def dump_db_json_schema(db, f):
         data['table_names'].append(table_name.lower().replace("_", ' '))
         fks = conn.execute("PRAGMA foreign_key_list('{}') ".format(table_name)).fetchall()
         #print("db:{} table:{} fks:{}".format(f,table_name,fks))
-""" + '\t' + """fk_holder.extend([[(table_name, fk[3]), (fk[2], fk[4])] for fk in fks])
+        fk_holder.extend([[(table_name, fk[3]), (fk[2], fk[4])] for fk in fks])
         cur = conn.execute("PRAGMA table_info('{}') ".format(table_name))
         for j, col in enumerate(cur.fetchall()):
             data['column_names_original'].append((i, col[1]))
@@ -75,7 +76,9 @@ def dump_db_json_schema(db, f):
             else:
                 data['column_types'].append('others')
 
-            if col[5] == 1:
+            data['column_uninterpreted_types'].append(col_type)
+
+            if col[5] >= 1:
                 data['primary_keys'].append(len(data['column_names'])-1)
 
     data["foreign_keys"] = fk_holder
@@ -111,15 +116,19 @@ if __name__ == '__main__':
         db = join(input_dir, df, f)
         print '\\nreading new db: ', df
         table = dump_db_json_schema(db, df)
-        prev_tab_num = len(ex_tabs[df]["table_names"])
-        prev_col_num = len(ex_tabs[df]["column_names"])
-        cur_tab_num = len(table["table_names"])
-        cur_col_num = len(table["column_names"])
-        if df in ex_tabs.keys() and prev_tab_num == cur_tab_num and prev_col_num == cur_col_num and prev_tab_num != 0 and len(ex_tabs[df]["column_names"]) > 1:
-            table["table_names"] = ex_tabs[df]["table_names"]
-            table["column_names"] = ex_tabs[df]["column_names"]
-        else:
-            print "\\n----------------------------------problem db: ", df
+        try:
+            prev_tab_num = len(ex_tabs[df]["table_names"])
+            prev_col_num = len(ex_tabs[df]["column_names"])
+            cur_tab_num = len(table["table_names"])
+            cur_col_num = len(table["column_names"])
+            if df in ex_tabs.keys() and prev_tab_num == cur_tab_num and prev_col_num == cur_col_num and prev_tab_num != 0 and len(ex_tabs[df]["column_names"]) > 1:
+                table["table_names"] = ex_tabs[df]["table_names"]
+                table["column_names"] = ex_tabs[df]["column_names"]
+            else:
+                print "\\n----------------------------------problem db: ", df
+        except KeyError:
+            # considered db was not known by previous tables file; this is not a problem
+            pass
         tables.append(table)
     print "final db num: ", len(tables)
     with open(output_file, 'wt') as out:
"""
# write the diff into the repository folder (current directory) and apply it to the working tree
with open("get_tables.patch", "w") as f: 
  f.write(patch_file_content)
!git apply get_tables.patch

In [None]:
%mkdir $altered_spider_data_path_bash

## Alter databases from Spider dataset

```
for each database
  randomly delete a fraction of about COLUMN_DELETE_RATE of the non-primary-key columns
  randomly delete ROW_DELETE_RATE of rows (TODO)
```

In [None]:
# regenerate tables.json for original dataset
%cd $spider_repo_path_bash
!echo "[]" > $spider_data_path_bash/tables_empty.json
!python2 preprocess/get_tables.py $original_databases_path_bash $dataset_tables $spider_data_path_bash/tables_empty.json

In [None]:
# threshold compared with random.random() for each non primary key column: a column is deleted when the draw is below it
COLUMN_DELETE_RATE = 0.2

In [None]:
def get_tables_dict(tables_file_path):
  """Load a tables JSON file and index its entries by database id.

  Args:
    tables_file_path: path of a JSON file holding a list of database
      descriptions, each of them a dict with a 'db_id' key.

  Returns:
    dict mapping each 'db_id' to its complete description; when an id occurs
    more than once, the last entry wins.
  """
  with open(tables_file_path) as tables_file:
    return {db_description['db_id']: db_description
            for db_description in json.load(tables_file)}

# descriptions of the original (unaltered) databases, indexed by db_id
tables_dict = get_tables_dict(dataset_tables)

In [None]:
def get_table_names(db_name, tables_dict):
  """Return the 'table_names_original' list of database `db_name`."""
  db_description = tables_dict[db_name]
  return db_description['table_names_original']

def get_primary_keys_columns_names_and_types(db_name, table_name, tables_dict):
  """Describe the primary key columns of table `table_name`.

  Args:
    db_name: id of the database in `tables_dict`.
    table_name: original name of the table.
    tables_dict: database descriptions indexed by database id.

  Returns:
    list with one dict per primary key column of the table, in the order of
    'column_names_original'. Each dict has the keys 'table_name',
    'column_name', 'type', 'uninterpreted_type', 'foreign_key' (always None
    here) and 'primary_key' (always True here).
  """
  db_description = tables_dict[db_name]
  descriptions = []
  for column_index, column in enumerate(db_description['column_names_original']):
    table_index = column[0]
    # entries with a negative table index (such as '*') belong to no table
    if table_index < 0 or db_description['table_names_original'][table_index] != table_name:
      continue
    if column_index not in db_description['primary_keys']:
      continue
    descriptions.append({
        'table_name': table_name,
        'column_name': column[1],
        'type': db_description['column_types'][column_index],
        'uninterpreted_type': db_description['column_uninterpreted_types'][column_index],
        'foreign_key': None,
        'primary_key': True,
    })
  return descriptions

def get_column_name(table_dict, column_index):
  """Return the original name of the column stored at `column_index`."""
  return table_dict['column_names_original'][column_index][1]

def get_table_and_column_names(table_dict, column_index):
  """Return the table and column names of the column stored at `column_index`.

  Returns:
    dict with the keys 'table' (original name of the owning table) and
    'column' (original name of the column).
  """
  return {
      'table': table_dict['table_names_original'][table_dict['column_names_original'][column_index][0]],
      'column': get_column_name(table_dict, column_index),
  }

def get_non_primary_keys_columns_descriptions(db_name, table_name, tables_dict):
  """Describe the columns of table `table_name` that are not primary keys.

  Args:
    db_name: id of the database in `tables_dict`.
    table_name: original name of the table.
    tables_dict: database descriptions indexed by database id.

  Returns:
    list with one dict per non primary key column of the table, in the order
    of 'column_names_original'. Each dict has the keys 'table_name',
    'column_name', 'type', 'uninterpreted_type', 'foreign_key' (a
    {'table', 'column'} dict naming the referenced column, or None) and
    'primary_key' (always False here).
  """
  table_dict = tables_dict[db_name]
  # Foreign keys are indexed by the *index* of the referencing column. Column
  # names are not unique across the tables of a database, so indexing by name
  # would attach a foreign key to every same-named column of other tables.
  # As before, when a column appears in several foreign keys the last one wins.
  foreign_keys = {fk[0]: get_table_and_column_names(table_dict, fk[1])
                  for fk in table_dict['foreign_keys']}
  column_descriptions = [
                   {
                     'table_name': table_name,
                     'column_name': column_name,
                     'type': table_dict['column_types'][idx],
                     'uninterpreted_type': table_dict['column_uninterpreted_types'][idx],
                     'foreign_key': foreign_keys.get(idx),
                     'primary_key': False,
                   }
                   for idx, (table_index, column_name) in enumerate(table_dict['column_names_original'])
                   if table_index>=0
                      and table_dict['table_names_original'][table_index]==table_name
                      and idx not in table_dict['primary_keys']
                 ]
  return column_descriptions

def delete_columns(db, table_name, columns_to_keep):
  """Rebuild table `table_name` so that it only contains `columns_to_keep`.

  The rows are copied to a temporary backup table, the table is dropped,
  re-created with the kept columns and refilled from the backup. The new
  table only declares column names, their declared types, one primary key
  clause and one foreign key clause per kept foreign key column; nothing else
  from the previous definition is re-declared.

  All table and column names are double-quoted in every statement, so names
  containing spaces or SQL reserved words are handled.

  Args:
    db: open sqlite3 connection.
    table_name: name of the table to rebuild.
    columns_to_keep: list of column descriptions (dicts with the keys
      'column_name', 'uninterpreted_type', 'primary_key' and 'foreign_key'),
      in the order the columns get in the rebuilt table.

  Raises:
    ValueError: if `columns_to_keep` is empty (a table needs at least one
      column); the database is left untouched in that case.
  """
  if not columns_to_keep:
    raise ValueError(f'cannot rebuild table {table_name!r} without any column')

  def quote(identifier):
    # double-quote an SQL identifier, doubling any embedded double quote
    return '"' + identifier.replace('"', '""') + '"'

  table = quote(table_name)
  backup = quote(f'{table_name}_backup')
  names = ', '.join(quote(col['column_name']) for col in columns_to_keep)
  names_and_types = ',\n'.join(
      f'{quote(col["column_name"])} {col["uninterpreted_type"]}'
      for col in columns_to_keep)
  primary_keys = ', '.join(
      quote(col['column_name'])
      for col in columns_to_keep
      if col['primary_key'])
  if primary_keys:
    primary_keys = f',\nprimary key({primary_keys})\n'
  foreign_keys = ',\n'.join(
      f'foreign key({quote(col["column_name"])}) references '
      f'{quote(col["foreign_key"]["table"])}({quote(col["foreign_key"]["column"])})'
      for col in columns_to_keep
      if col['foreign_key'] is not None)
  if foreign_keys:
    foreign_keys = f',\n{foreign_keys}\n'
  command = f"CREATE TEMPORARY TABLE {backup}({names_and_types});\n" \
          + f"INSERT INTO {backup} SELECT {names} FROM {table};\n" \
          + f"DROP TABLE {table};\n" \
          + f"CREATE TABLE {table} (\n" \
          + f"{names_and_types}" \
          + f"{primary_keys}" \
          + f"{foreign_keys}" \
          + f");\n" \
          + f"INSERT INTO {table} SELECT {names} FROM {backup};\n" \
          + f"DROP TABLE {backup};"
  db.executescript(command)
  db.commit()

def randomly_delete_columns(db, db_name, tables_dict):
  """Randomly delete non primary key columns from every table of a database.

  Each non primary key column is deleted when an independent
  `random.random()` draw is below COLUMN_DELETE_RATE; primary key columns are
  always kept. A table that would be left without any column is kept as is.

  NOTE: there is exactly one draw per column. Removing items from the list
  while iterating over it (the usual pitfall) skips the column following each
  deleted one; avoiding that changes which columns are selected for a given
  random seed.

  Args:
    db: open sqlite3 connection to the database to alter.
    db_name: id of the database in `tables_dict`.
    tables_dict: database descriptions indexed by database id.

  Returns:
    dict with a single key 'deleted_columns' holding the descriptions of all
    deleted columns.
  """
  delete_columns_report = {'deleted_columns': []}
  for table_name in get_table_names(db_name, tables_dict):
    candidates = get_non_primary_keys_columns_descriptions(db_name, table_name, tables_dict)
    kept, deleted = [], []
    for column in candidates:
      if random.random() < COLUMN_DELETE_RATE:
        deleted.append(column)
      else:
        kept.append(column)
    columns_to_keep = kept + get_primary_keys_columns_names_and_types(db_name, table_name, tables_dict)
    if not columns_to_keep:
      # a table cannot exist without columns: leave this one untouched and report nothing for it
      continue
    delete_columns(db, table_name, columns_to_keep)
    delete_columns_report['deleted_columns'].extend(deleted)
  return delete_columns_report

def write_schema(db, out_file_path):
  """Write the SQL dump of the database behind `db` to `out_file_path`.

  Each item yielded by `db.iterdump()` is written followed by a newline; an
  existing file is overwritten.
  """
  with open(out_file_path, 'w') as schema_file:
    schema_file.writelines(f'{statement}\n' for statement in db.iterdump())

def alter_database(db_name, database_folder_path, database_file_path, tables_dict):
  """Alter one database in place and document the alteration.

  Three files are written into `database_folder_path`:
    - original_schema.sql: SQL dump of the database before the alteration
    - schema.sql: SQL dump of the database after the alteration
    - alteration_report.json: description of the applied alterations

  Args:
    db_name: id of the database in `tables_dict`.
    database_folder_path: folder receiving the three files listed above.
    database_file_path: path of the sqlite file to alter.
    tables_dict: database descriptions indexed by database id.
  """
  original_schema_path = os.path.join(database_folder_path, 'original_schema.sql')
  altered_schema_path = os.path.join(database_folder_path, 'schema.sql')
  alteration_report_path = os.path.join(database_folder_path, 'alteration_report.json')

  db = sqlite3.connect(database_file_path)
  try:
    # the connection context manager commits on success and rolls back on
    # error, but does not close the connection; closing is done in `finally`
    with db:
      alterations_report = {}

      # write schema before alteration
      write_schema(db, original_schema_path)

      alterations_report['delete_columns'] = randomly_delete_columns(db, db_name, tables_dict)
      # TODO: randomly delete rows

      # write schema after alteration
      write_schema(db, altered_schema_path)

      # write alterations report
      with open(alteration_report_path, 'w') as outfile:
        json.dump(alterations_report, outfile, indent=4)
  finally:
    db.close()

In [None]:
# clone original databases to altered databases path before altering them
%rm -rf $altered_databases_path_bash
%cp -r $original_databases_path_bash $altered_databases_path_bash

random.seed(0)

with tqdm(os.listdir(original_databases_path)) as db_it:
  for database_folder in db_it:
    db_it.set_postfix_str(database_folder)
    database_folder_path = os.path.join(altered_databases_path, database_folder)
    database_file_path = os.path.join(database_folder_path, f'{database_folder}.sqlite')
    try:
      alter_database(database_folder, database_folder_path, database_file_path, tables_dict)
    except Exception as e:
      tqdm.write(f'Error occured while processing database {database_folder}: {e}')

## Identify questions which are affected by dataset alteration

```
for each dataset question and its SQL query translation
  define_answerable_status(SQL query, altered target database)

function define_answerable_status(SQL query, altered target database):
  find out whether at least one of the columns deleted from the target database was used in the SELECT clause of the SQL query
  find out whether at least one of the columns deleted from the target database was used in the WHERE clause of the SQL query
  find out whether at least one of the columns deleted from the target database was used in the HAVING clause of the SQL query
  find out whether at least one of the columns deleted from the target database was used in the FROM clause (join conditions) of the SQL query
  find out the impact of row deletion (TODO)
  if any deleted column was used in the SELECT, HAVING, WHERE or FROM clause of the SQL query:
    return False
  else:
    return True
```

In [None]:
# display the space-escaped path of the altered dataset folder used by the shell commands
altered_spider_data_path_bash

In [None]:
# generate tables.json for altered dataset
%cd $spider_repo_path_bash
!echo "[]" > $altered_spider_data_path_bash/tables_empty.json
!python2 preprocess/get_tables.py $altered_databases_path $altered_dataset_tables_bash $altered_spider_data_path_bash/tables_empty.json

In [None]:
# load both splits and remember which split each question comes from
df_dev = pd.read_json(dev_path)
df_dev["split"] = "dev"
df_train = pd.read_json(train_path)
df_train["split"] = "train"

# a single frame holding all questions, dev first, with a fresh 0..n-1 index
df = pd.concat([df_dev, df_train], ignore_index=True)

# one flag per inspected query part; filled in once the questions are labelled
for query_part in ("select", "where", "having", "from"):
  df[f"column_not_available_for_{query_part}"] = None

# schema descriptions before and after the alteration, indexed by db_id
tables_dict = get_tables_dict(dataset_tables)
altered_tables_dict = get_tables_dict(altered_dataset_tables)

In [None]:
# description of sql_dict format: https://github.com/taoyds/spider/blob/master/preprocess/parsed_sql_examples.sql

def get_column_full_names_from_indices(column_indices, db_tables):
    """Translate column indices into a set of (table name, column name) pairs.

    Args:
        column_indices: iterable of indices into
            db_tables['column_names_original'].
        db_tables: description of one database.

    Returns:
        set of (table name, column name) tuples. Columns named '*' are left
        out; a column with a negative table index gets '_' as table name.
    """
    full_names = set()
    for column_index in column_indices:
        column = db_tables['column_names_original'][column_index]
        if column[0] < 0:
            owner = '_'
        else:
            owner = db_tables['table_names_original'][column[0]]
        if column[1] != '*':
            full_names.add((owner, column[1]))
    return full_names

def get_column_ids_used_in_col_unit(col_unit):
    """Return the column id of a col_unit as a one-element list.

    `col_unit` must be a sequence of exactly three items; the column id is
    the middle one (presumably (agg_id, col_id, isDistinct) as in the Spider
    sql_dict format - confirm against the format description).
    """
    _first, column_id, _last = col_unit
    return [column_id]

def get_column_ids_used_in_val_unit(val_unit):
    """Collect the column ids of the two col_units of a val_unit.

    `val_unit` must be a sequence of exactly three items; the last two are
    col_units, and falsy ones (e.g. None) are ignored.

    Returns:
        set of column ids.
    """
    _first, left_col_unit, right_col_unit = val_unit
    column_ids = set()
    for col_unit in (left_col_unit, right_col_unit):
        if not col_unit:
            continue
        column_ids.update(get_column_ids_used_in_col_unit(col_unit))
    return column_ids

def get_column_ids_used_in_condition(condition):
    """Collect the column ids referenced by a condition.

    Only the list items of `condition` are inspected (anything else, such as
    the strings joining them, is skipped). Each inspected item must hold
    exactly five entries: the third is a val_unit and the last two are
    values. A value that is itself a list contributes its second entry as
    column id; values of any other type (including dicts) are ignored.

    Returns:
        set of column ids.
    """
    column_ids = set()
    for cond_unit in condition:
        if not isinstance(cond_unit, list):
            continue
        _first, _second, val_unit, value1, value2 = cond_unit
        column_ids |= get_column_ids_used_in_val_unit(val_unit)
        column_ids.update(value[1] for value in (value1, value2) if isinstance(value, list))
    return column_ids

def get_columns_used_in_select(sql_dict, db_tables):
    """Return the (table name, column name) pairs used in the 'select' entry of `sql_dict`.

    sql_dict['select'] must be a pair whose second item is a list of pairs;
    the second item of each of those pairs is a val_unit.
    """
    _first, selected_items = sql_dict['select']
    column_ids = set()
    for _item_first, val_unit in selected_items:
        column_ids |= get_column_ids_used_in_val_unit(val_unit)
    return get_column_full_names_from_indices(column_ids, db_tables)

def get_columns_used_in_where(sql_dict, db_tables):
    """Return the (table name, column name) pairs used in the 'where' condition of `sql_dict`."""
    column_ids = get_column_ids_used_in_condition(sql_dict['where'])
    return get_column_full_names_from_indices(column_ids, db_tables)

def get_columns_used_in_having(sql_dict, db_tables):
    """Return the (table name, column name) pairs used in the 'having' condition of `sql_dict`."""
    column_ids = get_column_ids_used_in_condition(sql_dict['having'])
    return get_column_full_names_from_indices(column_ids, db_tables)

def get_columns_used_in_from(sql_dict, db_tables):
    """Return the (table name, column name) pairs used in the conditions of the 'from' entry.

    Only sql_dict['from']['conds'] is inspected; the other entries of
    sql_dict['from'] are not looked at.
    """
    from_conditions = sql_dict['from']['conds']
    column_ids = get_column_ids_used_in_condition(from_conditions)
    return get_column_full_names_from_indices(column_ids, db_tables)

def get_columns_from_db(db_tables):
    """Return the (table name, column name) pairs of all columns of a database.

    The entry at index 0 of 'column_names_original' is skipped.
    """
    column_count = len(db_tables['column_names_original'])
    every_column_index = set(range(1, column_count))
    return get_column_full_names_from_indices(every_column_index, db_tables)

def get_columns_not_used_in_query(sql_dict, db_tables):
    """Return the columns of the database that the query does not use.

    A column counts as used when it appears in the select, where, having or
    from (conditions) part of `sql_dict`.

    Returns:
        set of (table name, column name) tuples.
    """
    columns = get_columns_from_db(db_tables)
    columns -= get_columns_used_in_select(sql_dict, db_tables)
    columns -= get_columns_used_in_where(sql_dict, db_tables)
    columns -= get_columns_used_in_having(sql_dict, db_tables)
    columns -= get_columns_used_in_from(sql_dict, db_tables)
    # `columns` already holds (table name, column name) pairs; it must not be
    # passed through get_column_full_names_from_indices, which expects indices
    return columns

# try the functions above on a few questions and print what they extract
for query_id in (1008, 1012, 1018, 1020, 1022):
    row = df.iloc[query_id]
    db_tables = tables_dict[row.db_id]
    print(query_id)
    print(row.question)
    print(row.query)
    print('columns_used_in_select', get_columns_used_in_select(row.sql, db_tables))
    print('columns_used_in_where', get_columns_used_in_where(row.sql, db_tables))
    print('columns_used_in_having', get_columns_used_in_having(row.sql, db_tables))
    print('columns_used_in_from', get_columns_used_in_from(row.sql, db_tables))
    print('get_columns_from_db', get_columns_from_db(db_tables))
    # disabled: print('columns_NOT_used_in_query', get_columns_not_used_in_query(row.sql, db_tables))
    print('\n')

In [None]:
def set_column_not_available(df, index, altered_tables_dict, columns_used, label_name):
  """Flag whether a question uses a column missing from its altered database.

  Writes True into df.loc[index, label_name] when at least one item of
  `columns_used` is not a column of the altered database, False otherwise.

  Args:
    df: questions dataframe, modified in place.
    index: index *label* of the question (as yielded by df.iterrows()).
    altered_tables_dict: altered database descriptions indexed by database id.
    columns_used: iterable of (table name, column name) pairs.
    label_name: name of the column of `df` receiving the flag.
  """
  # the row is read by label, like the write below, so that both address the
  # same row even when the index of df is not 0..n-1
  db_id = df.loc[index, 'db_id']
  columns_from_db = get_columns_from_db(altered_tables_dict[db_id])
  df.loc[index, label_name] = any(
      column_used not in columns_from_db for column_used in columns_used)

def set_column_not_available_for_select(df, index, tables_dict, altered_tables_dict):
  """Fill 'column_not_available_for_select' for the question labelled `index`.

  The columns used in the select part are resolved against the original
  schema (`tables_dict`) and looked up in the altered one.
  """
  # read by label: `index` is an index label, and the flag is written with .loc
  row = df.loc[index]
  columns_used_in_select = get_columns_used_in_select(row.sql, tables_dict[row.db_id])
  set_column_not_available(df, index, altered_tables_dict, columns_used_in_select, 'column_not_available_for_select')

def set_column_not_available_for_where(df, index, tables_dict, altered_tables_dict):
  """Fill 'column_not_available_for_where' for the question labelled `index`.

  The columns used in the where condition are resolved against the original
  schema (`tables_dict`) and looked up in the altered one.
  """
  # read by label: `index` is an index label, and the flag is written with .loc
  row = df.loc[index]
  columns_used_in_where = get_columns_used_in_where(row.sql, tables_dict[row.db_id])
  set_column_not_available(df, index, altered_tables_dict, columns_used_in_where, 'column_not_available_for_where')

def set_column_not_available_for_having(df, index, tables_dict, altered_tables_dict):
  """Fill 'column_not_available_for_having' for the question labelled `index`.

  The columns used in the having condition are resolved against the original
  schema (`tables_dict`) and looked up in the altered one.
  """
  # read by label: `index` is an index label, and the flag is written with .loc
  row = df.loc[index]
  columns_used_in_having = get_columns_used_in_having(row.sql, tables_dict[row.db_id])
  set_column_not_available(df, index, altered_tables_dict, columns_used_in_having, 'column_not_available_for_having')

def set_column_not_available_for_from(df, index, tables_dict, altered_tables_dict):
  """Fill 'column_not_available_for_from' for the question labelled `index`.

  The columns used in the conditions of the from part are resolved against
  the original schema (`tables_dict`) and looked up in the altered one.
  """
  # read by label: `index` is an index label, and the flag is written with .loc
  row = df.loc[index]
  columns_used_in_from = get_columns_used_in_from(row.sql, tables_dict[row.db_id])
  set_column_not_available(df, index, altered_tables_dict, columns_used_in_from, 'column_not_available_for_from')

# fill the four column_not_available_for_* flags of every question
# (only the index label is used; `row` itself is not read)
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
  set_column_not_available_for_select(df, index, tables_dict, altered_tables_dict)
  set_column_not_available_for_where(df, index, tables_dict, altered_tables_dict)
  set_column_not_available_for_having(df, index, tables_dict, altered_tables_dict)
  set_column_not_available_for_from(df, index, tables_dict, altered_tables_dict)

# a question is answerable when none of its column_not_available_* flags is set:
# the row-wise maximum of the flags is converted to bool and negated
df["answerability"] = ~df[[c for c in df.columns if "column_not_available_" in c]].max(axis=1).map(bool)

df

In [None]:
def read_alteration_report(db_id):
  """Load alteration_report.json from the folder of altered database `db_id`.

  Returns:
    the parsed JSON content of the report.
  """
  report_path = os.path.join(altered_databases_path, db_id, 'alteration_report.json')
  with open(report_path) as report_file:
    return json.load(report_file)

def print_consequence_of_alteration(i, df):
  """Print a readable summary of how the alteration affects question `i`.

  The summary shows the question, its SQL query, the columns deleted from the
  target database, the four column_not_available_* flags and the resulting
  answerability.

  Args:
    i: index label of the question in `df`.
    df: labelled questions dataframe.
  """
  db_id = df.loc[i, 'db_id']
  deleted_columns = read_alteration_report(db_id)["delete_columns"]["deleted_columns"]
  print(f'query index: {i}')
  print(f'NL question: {df.loc[i, "question"]}')
  print(f'SQL query: {df.loc[i, "query"]}')
  print(f'Alterations on target DB ({db_id}):')
  deleted_names = ", ".join(
      ".".join((col["table_name"], col["column_name"])) for col in deleted_columns)
  print(f' - deleted columns: {deleted_names}')
  print(f'Consequences:')
  print(f' - column_not_available_for_select: {df.loc[i, "column_not_available_for_select"]}')
  print(f' - column_not_available_for_where:  {df.loc[i, "column_not_available_for_where"]}')
  print(f' - column_not_available_for_having: {df.loc[i, "column_not_available_for_having"]}')
  print(f' - column_not_available_for_from:   {df.loc[i, "column_not_available_for_from"]}')
  if df.loc[i, "answerability"]:
    verdict = ""
  else:
    verdict = "NOT"
  print(f' -> {verdict} ANSWERABLE')

# show the consequences of the alteration for a handful of questions
indices = [0, 1, 8030, 8031, 8032]
for position in range(len(indices)):
  i = indices[position]
  print_consequence_of_alteration(i, df)
  print()

In [None]:
# ratio of answerable questions
# the mean of a boolean column is the share of True values; unlike
# value_counts()[True] it does not fail when no question is answerable
df.answerability.mean()

In [None]:
# write each split of the labelled dataset as an indented JSON list of records
for split_name, split_path in (("train", altered_train_path), ("dev", altered_dev_path)):
  with open(split_path, 'w') as outfile:
    split_records = json.loads(df[df["split"]==split_name].to_json(orient='records'))
    json.dump(split_records, outfile, indent=4)

In [None]:
# report the disk usage of the altered dataset folder
!du -hcs $altered_spider_data_path_bash

In [None]:
%cd $base_path
!7z -a idk_dataset.7z $altered_spider_data_path_bash

In [None]:
# report the size of the archive
!du -h idk_dataset.7z