<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/scifact_extract_arguments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract an Argument–Counterargument DataFrame from SciFact

## Imports

In [1]:
import os
import random
import subprocess
import zipfile
import shutil
import pickle
import pandas as pd
import numpy as np
from google.colab import userdata

### OSF Setup

In [2]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [3]:
import osfclient.cli

In [4]:
from osfclient.api import OSF
from osfclient.models import Project, Storage
from io import BytesIO

In [5]:
# Read the OSF username from Colab's secret store, keep it as a notebook-level
# name, and mirror it into the process environment.
OSF_USERNAME = userdata.get("OSF_USERNAME")
os.environ["OSF_USERNAME"] = OSF_USERNAME

In [6]:
# Read the OSF password from Colab's secret store, keep it as a notebook-level
# name, and mirror it into the process environment.
OSF_PASSWORD = userdata.get("OSF_PASSWORD")
os.environ["OSF_PASSWORD"] = OSF_PASSWORD

In [7]:
# Read the OSF access token from Colab's secret store, keep it as a
# notebook-level name, and mirror it into the process environment.
OSF_TOKEN = userdata.get("OSF_TOKEN")
os.environ["OSF_TOKEN"] = OSF_TOKEN

In [8]:
# Read the OSF project id from Colab's secret store, keep it as a
# notebook-level name, and mirror it into the process environment.
OSF_PROJECT_ID = userdata.get("OSF_PROJECT_ID")
os.environ["OSF_PROJECT_ID"] = OSF_PROJECT_ID

## Data

In [9]:
# Fetch the zipped SciFact corpus from OSF project `sakjg`. The next cell
# opens it as `scifact_corpus.zip` from the working directory.
!osf -p sakjg fetch osfstorage/corpora/scifact_corpus.zip

  0% 0.00/3.14M [00:00<?, ?bytes/s]100% 3.14M/3.14M [00:00<00:00, 108Mbytes/s]


In [10]:
# Unpack the downloaded corpus archive and report what ended up on disk.
corpus_file_path = 'scifact_corpus.zip'
output_folder_path = 'scifact-corpus'

# Make sure the destination exists before extracting into it and listing it.
os.makedirs(output_folder_path, exist_ok=True)
with zipfile.ZipFile(corpus_file_path, 'r') as corpus_archive:
  corpus_archive.extractall(path=output_folder_path)

extracted_files = os.listdir(output_folder_path)
print("Files extracted:", extracted_files)

Files extracted: ['scifact_corpus', '__MACOSX']


In [11]:
# Folder that holds the extracted SciFact JSONL files. The trailing slash is
# required: file paths are built by plain string interpolation, e.g.
# f'{CORPUS_FOLDER_PATH}claims_train.jsonl'.
CORPUS_FOLDER_PATH = "scifact-corpus/scifact_corpus/"

In [84]:
# Raw load of the training claims (JSON Lines). NOTE(review): the next cell
# repeats this exact read before filtering, so on a top-to-bottom run this
# cell is redundant.
claims_train = pd.read_json(f'{CORPUS_FOLDER_PATH}claims_train.jsonl', lines=True)

In [85]:
# Load the training claims and keep only those that cite evidence.
claims_train = pd.read_json(f'{CORPUS_FOLDER_PATH}claims_train.jsonl', lines=True)
# Claims whose `evidence` is an empty dict have nothing to extract; drop them.
claims_train = claims_train[claims_train['evidence'] != {}]
# Shuffle rows within each `claim` group. The fixed seed makes the resulting
# order identical across kernel restarts, so a Restart & Run All reproduces
# the same dataset.
claims_train = (
  claims_train
  .groupby('claim')
  .sample(frac=1, random_state=42)
  .reset_index(drop=True)  # 0..n-1 index; the later column-wise concat aligns on it
)

In [34]:
# Load the SciFact corpus (JSON Lines). The evidence-extraction cells below
# look documents up in it by `doc_id` and read their `abstract`.
corpus = pd.read_json(CORPUS_FOLDER_PATH + 'corpus.jsonl', lines=True)

In [86]:
# Turn every rationale of every training claim into its evidence text and
# split the texts into supporting vs. non-supporting lists. Both result lists
# have one entry per row of `claims_train`, in the same order.

# Build the doc_id -> abstract lookup once instead of scanning `corpus` with
# a boolean mask for every rationale. `drop_duplicates` keeps the first row
# per doc_id, which is the row a mask followed by `.iloc[0]` would select.
abstract_by_doc_id = (
  corpus
  .drop_duplicates(subset='doc_id')
  .set_index('doc_id')['abstract']
  .to_dict()
)

claims_pro_evidence = []
claims_counter_evidence = []
for evidence in claims_train['evidence']:
  pro_evidence = []
  counter_evidence = []
  for doc_id, rationales in evidence.items():
    for rationale in rationales:
      # Evidence keys are converted with int() to match the corpus `doc_id` values.
      abstract = abstract_by_doc_id[int(doc_id)]
      # `sentences` lists positions inside the abstract; join them into one passage.
      evidence_sentence = ' '.join(abstract[j] for j in rationale['sentences'])
      # Every label other than SUPPORT is treated as counter-evidence.
      if rationale['label'] == 'SUPPORT':
        pro_evidence.append(evidence_sentence)
      else:
        counter_evidence.append(evidence_sentence)
  claims_pro_evidence.append(pro_evidence)
  claims_counter_evidence.append(counter_evidence)

In [87]:
# Keep at most one passage per claim and stance: a random pick when any exist,
# otherwise an empty string as the "no evidence" placeholder.
# A dedicated seeded generator makes the picks reproducible across runs
# without touching the global `random` state.
evidence_picker = random.Random(42)
claims_pro_evidence_trimmed = [
  evidence_picker.choice(claim_list) if claim_list else ""
  for claim_list in claims_pro_evidence
]
claims_counter_evidence_trimmed = [
  evidence_picker.choice(claim_list) if claim_list else ""
  for claim_list in claims_counter_evidence
]

In [88]:
# Remove the `id`, `evidence` and `cited_doc_ids` columns from the training frame.
claims_train = claims_train.drop(['id', 'evidence', 'cited_doc_ids'], axis='columns')

In [89]:
# Attach the selected passages as `pro_evidence` / `counter_evidence` columns.
# `concat(axis=1)` aligns on the index, so this relies on `claims_train` still
# carrying the 0..n-1 index produced by the earlier `reset_index(drop=True)`.
claims_train = pd.concat(
  [
    claims_train,
    pd.DataFrame({
      'pro_evidence': claims_pro_evidence_trimmed,
      'counter_evidence': claims_counter_evidence_trimmed,
    }),
  ],
  axis=1,
)

In [90]:
# Collapse to a single row per distinct claim text by sampling one row from
# each `claim` group. The fixed seed makes the chosen row reproducible across
# kernel restarts.
claims_train = (
  claims_train
  .groupby('claim')
  .sample(1, random_state=42)
  .reset_index(drop=True)
)

In [119]:
# Length, in characters, of the longest training claim.
max(map(len, claims_train['claim']))

201

In [154]:
# Serialize the finished training frame to a local pickle file.
claims_train_file_path = 'scifact_train_arguments_dump.pkl'
with open(claims_train_file_path, 'wb') as dump_handle:
  pickle.dump(claims_train, dump_handle)
# Reached only when the dump above succeeded; the file is written locally here.
print(f"File uploaded to {claims_train_file_path}")

File uploaded to scifact_train_arguments_dump.pkl


In [146]:
# Load the dev-split claims (used as the test set here) and keep only those
# that cite evidence.
claims_test = pd.read_json(f'{CORPUS_FOLDER_PATH}claims_dev.jsonl', lines=True)
# Claims whose `evidence` is an empty dict have nothing to extract; drop them.
claims_test = claims_test[claims_test['evidence'] != {}]
# Shuffle rows within each `claim` group. The fixed seed makes the resulting
# order identical across kernel restarts.
claims_test = (
  claims_test
  .groupby('claim')
  .sample(frac=1, random_state=42)
  .reset_index(drop=True)  # 0..n-1 index; the later column-wise concat aligns on it
)

In [147]:
# Turn every rationale of every test claim into its evidence text and split
# the texts into supporting vs. non-supporting lists. Both result lists have
# one entry per row of `claims_test`, in the same order.

# Build the doc_id -> abstract lookup once instead of scanning `corpus` with
# a boolean mask for every rationale. `drop_duplicates` keeps the first row
# per doc_id, which is the row a mask followed by `.iloc[0]` would select.
abstract_by_doc_id = (
  corpus
  .drop_duplicates(subset='doc_id')
  .set_index('doc_id')['abstract']
  .to_dict()
)

claims_test_pro_evidence = []
claims_test_counter_evidence = []
for evidence in claims_test['evidence']:
  pro_evidence = []
  counter_evidence = []
  for doc_id, rationales in evidence.items():
    for rationale in rationales:
      # Evidence keys are converted with int() to match the corpus `doc_id` values.
      abstract = abstract_by_doc_id[int(doc_id)]
      # `sentences` lists positions inside the abstract; join them into one passage.
      evidence_sentence = ' '.join(abstract[j] for j in rationale['sentences'])
      # Every label other than SUPPORT is treated as counter-evidence.
      if rationale['label'] == 'SUPPORT':
        pro_evidence.append(evidence_sentence)
      else:
        counter_evidence.append(evidence_sentence)
  claims_test_pro_evidence.append(pro_evidence)
  claims_test_counter_evidence.append(counter_evidence)

In [149]:
# Keep at most one passage per test claim and stance: a random pick when any
# exist, otherwise an empty string as the "no evidence" placeholder.
# A dedicated seeded generator makes the picks reproducible across runs
# without touching the global `random` state.
evidence_picker = random.Random(42)
claims_test_pro_evidence_trimmed = [
  evidence_picker.choice(claim_list) if claim_list else ""
  for claim_list in claims_test_pro_evidence
]
claims_test_counter_evidence_trimmed = [
  evidence_picker.choice(claim_list) if claim_list else ""
  for claim_list in claims_test_counter_evidence
]

In [150]:
# Remove the `id`, `evidence` and `cited_doc_ids` columns from the test frame.
claims_test = claims_test.drop(['id', 'evidence', 'cited_doc_ids'], axis='columns')

In [151]:
# Attach the selected passages as `pro_evidence` / `counter_evidence` columns.
# `concat(axis=1)` aligns on the index, so this relies on `claims_test` still
# carrying the 0..n-1 index produced by the earlier `reset_index(drop=True)`.
claims_test = pd.concat(
  [
    claims_test,
    pd.DataFrame({
      'pro_evidence': claims_test_pro_evidence_trimmed,
      'counter_evidence': claims_test_counter_evidence_trimmed,
    }),
  ],
  axis=1,
)

In [155]:
# Serialize the finished test frame to a local pickle file.
claims_test_file_path = 'scifact_test_arguments_dump.pkl'
with open(claims_test_file_path, 'wb') as dump_handle:
  pickle.dump(claims_test, dump_handle)
# Reached only when the dump above succeeded; the file is written locally here.
print(f"File uploaded to {claims_test_file_path}")

File uploaded to scifact_test_arguments_dump.pkl


In [160]:
# Zip the `scifact-arguments-dump` folder and upload the archive to OSF.
train_arguments_dump_file_path = 'scifact-arguments-dump'
train_arguments_dump_zip_path = 'scifact-arguments-dump'

# `make_archive` zips the contents of a directory, and no earlier cell creates
# this folder, so on a fresh kernel the archive step would fail. Create it
# here and stage the pickles written by the dump cells.
# NOTE(review): assumes the archive is meant to contain both the train and
# test dumps — confirm against what is expected on OSF.
os.makedirs(train_arguments_dump_file_path, exist_ok=True)
for dump_name in ('scifact_train_arguments_dump.pkl', 'scifact_test_arguments_dump.pkl'):
  if os.path.isfile(dump_name):
    shutil.copy2(dump_name, train_arguments_dump_file_path)

shutil.make_archive(train_arguments_dump_zip_path, 'zip', train_arguments_dump_file_path)
print(f"Zip file created at: {train_arguments_dump_zip_path}")

# Pass the command as an argument list (no shell needed) and keep stderr so
# any problem reported by the client is visible.
result = subprocess.run(
  [
    "osf", "-p", "sakjg", "upload", "--force",
    f"{train_arguments_dump_zip_path}.zip",
    "data-dump/scifact_arguments_dump.zip",
  ],
  capture_output=True,
  text=True,
)
print(result.stderr)
# Only claim success when the client actually exited cleanly.
if result.returncode == 0:
  print(f"File: {train_arguments_dump_zip_path} uploaded at osfstorage")
else:
  print(f"Upload of {train_arguments_dump_zip_path}.zip failed with exit code {result.returncode}")

Zip file created at: scifact-arguments-dump

File: scifact-arguments-dump uploaded at osfstorage


In [157]:
# Zip the pickled test dump on its own and upload the archive to OSF.
test_arguments_dump_file_path = 'scifact_test_arguments_dump.pkl'
test_arguments_dump_zip_path = 'scifact_test_arguments_dump'

# `root_dir` must be a directory. To archive a single file, use the working
# directory as the root and name the file via `base_dir`; passing the pickle
# itself as `root_dir` either raises or yields an empty archive, depending on
# the Python version.
shutil.make_archive(
  test_arguments_dump_zip_path,
  'zip',
  root_dir=os.curdir,
  base_dir=test_arguments_dump_file_path,
)
print(f"Zip file created at: {test_arguments_dump_zip_path}")

# Pass the command as an argument list (no shell needed) and keep stderr so
# any problem reported by the client is visible.
result = subprocess.run(
  [
    "osf", "-p", "sakjg", "upload", "--force",
    f"{test_arguments_dump_zip_path}.zip",
    "data-dump/scifact_test_arguments_dump.zip",
  ],
  capture_output=True,
  text=True,
)
print(result.stderr)
# Only claim success when the client actually exited cleanly.
if result.returncode == 0:
  print(f"File: {test_arguments_dump_zip_path} uploaded at osfstorage")
else:
  print(f"Upload of {test_arguments_dump_zip_path}.zip failed with exit code {result.returncode}")

Zip file created at: scifact_test_arguments_dump

File: scifact_test_arguments_dump uploaded at osfstorage
