# Download class code from GitHub fork

In [1]:
import os
from getpass import getpass
import urllib

user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
repo_name = "aljubrmj/mj-nlp-fp.git"
# Example
# repo_name = "gregdurrett/nlp-qa-finalproj.git"
cmd_string = 'git clone https://{0}:{1}@github.com/{2}'.format(user, password, repo_name)

!{cmd_string}

User name: aljubrmj
Password: ··········
Cloning into 'mj-nlp-fp'...
remote: Enumerating objects: 53, done.[K
remote: Total 53 (delta 0), reused 0 (delta 0), pack-reused 53[K
Unpacking objects: 100% (53/53), done.


In [8]:
# Make the cloned repository the working directory so that the relative paths used
# by later cells (e.g. ./setup.sh) resolve against it.
%cd /content/mj-nlp-fp/

/content/mj-nlp-fp


# 2. Download and prepare data as per class project instructions

In [3]:
# Run the repository's setup script to download and prepare the class data.
# NOTE(review): the recorded output starts with "Illegal option -s" from /usr/bin/which,
# so the script appears to call `which -s`, which this environment rejects — confirm
# the script still completed every step it needs to.
!bash ./setup.sh

Illegal option -s
Usage: /usr/bin/which [-a] args
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 26.3M  100 26.3M    0     0  63.9M      0 --:--:-- --:--:-- --:--:-- 63.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 53.8M  100 53.8M    0     0  77.4M      0 --:--:-- --:--:-- --:--:-- 77.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3392k  100 3392k    0     0  13.2M      0 --:--:-- --:--:-- --:--:-- 13.2M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3069k  100 3069k    0     0  14.2M      0 --:--:-- --:--:-- --:--:-- 14.1M
  

In [10]:
import os
import json
import gzip
import pickle
from utils import *
import torch
from tqdm import tqdm

In [None]:
# Names of the class-provided dataset files (without the .jsonl.gz suffix).
dataset_names = ["squad_train", "squad_dev", "squad_adversarial_addonesent"]

# Maps dataset name -> (list of question IDs, list of normalized question strings).
class_datasets = {}
for dataset_name in dataset_names:
  meta, elems = load_dataset(f"/content/mj-nlp-fp/datasets/{dataset_name}.jsonl.gz")

  # The adversarial file is read through the 'qid' field; the other two use 'id'.
  id_key = 'qid' if dataset_name == 'squad_adversarial_addonesent' else 'id'

  class_ids, class_qs = [], []
  for elem in elems:
    for qa in elem['qas']:
      class_ids.append(qa[id_key])
      # Lower-case and strip so questions can be compared as plain strings later.
      class_qs.append(qa['question'].lower().strip())

  class_datasets[dataset_name] = (class_ids, class_qs)

In [31]:
# Report how many question IDs were collected from each class dataset.
for dataset_name in dataset_names:
  n_class_examples = len(class_datasets[dataset_name][0])
  print(f"Size of class dataset {dataset_name}: {n_class_examples}")

Size of class dataset squad_train: 86588
Size of class dataset squad_dev: 10507
Size of class dataset squad_adversarial_addonesent: 1787


## Download data from HuggingFace API

In [None]:
# !pip install git+https://github.com/huggingface/transformers
!pip install datasets

In [28]:
from datasets import load_dataset as load_dataset_hf
# Fetch the "squad" dataset and the "AddOneSent" configuration of "squad_adversarial".
hf_squad_datasets = load_dataset_hf("squad")
hf_squad_adverserial_datasets = load_dataset_hf("squad_adversarial", "AddOneSent")

# Pair each class dataset name with the HuggingFace split it is compared against.
hf_splits = {
    dataset_names[0]: hf_squad_datasets['train'],
    dataset_names[1]: hf_squad_datasets['validation'],
    dataset_names[2]: hf_squad_adverserial_datasets['validation'],
}

# Maps dataset name -> list of (id, normalized question) pairs, using the same
# lower-case + strip normalization as the class datasets.
hf_datasets = {
    name: [
        (example_id, question.lower().strip())
        for example_id, question in zip(split['id'], split['question'])
    ]
    for name, split in hf_splits.items()
}

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)


Downloading and preparing dataset squad_adversarial/AddOneSent (download: 5.72 MiB, generated: 1.78 MiB, post-processed: Unknown size, total: 7.50 MiB) to /root/.cache/huggingface/datasets/squad_adversarial/AddOneSent/1.1.0/e9df92c060f50eb529284b303c504bf4359ba37944faebe7a16a91b7d534e946...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset squad_adversarial downloaded and prepared to /root/.cache/huggingface/datasets/squad_adversarial/AddOneSent/1.1.0/e9df92c060f50eb529284b303c504bf4359ba37944faebe7a16a91b7d534e946. Subsequent calls will reuse this data.


In [33]:
# Report how many (id, question) pairs were collected from each HuggingFace dataset.
for dataset_name in dataset_names:
  n_hf_examples = len(hf_datasets[dataset_name])
  print(f"Size of HuggingFace dataset {dataset_name}: {n_hf_examples}")

Size of HuggingFace dataset squad_train: 87599
Size of HuggingFace dataset squad_dev: 10570
Size of HuggingFace dataset squad_adversarial_addonesent: 1787


# Compare ID and Question text in class SQuAD dataset versus the HuggingFace API SQuAD dataset

### Note: While the HuggingFace API provides more examples in the SQuAD dataset, we will keep only those examples that appear in the class dataset, to be consistent within this report and with other class projects

In [30]:
# For each dataset, collect every HuggingFace ID that is either absent from the class
# dataset or present there with a different (normalized) question text.
missing_ids = {dataset_name: [] for dataset_name in dataset_names}
for dataset_name in dataset_names:
  class_ids, class_qs = class_datasets[dataset_name]
  hf_ids_qs = hf_datasets[dataset_name]

  # Index the class questions by ID once, so each lookup below is a constant-time
  # dict access instead of a linear scan of the ID list. setdefault keeps the first
  # question seen for an ID, matching what class_ids.index(...) would select.
  class_q_by_id = {}
  for class_id, class_q in zip(class_ids, class_qs):
    class_q_by_id.setdefault(class_id, class_q)

  for hf_id, hf_q in hf_ids_qs:
    if hf_id not in class_q_by_id or hf_q != class_q_by_id[hf_id]:
      missing_ids[dataset_name].append(hf_id)

  print(f"Total Number of missing IDs in {dataset_name}: {len(missing_ids[dataset_name])}")

Total Number of missing IDs in squad_train: 1011
Total Number of missing IDs in squad_dev: 63
Total Number of missing IDs in squad_adversarial_addonesent: 0
