In [5]:
# Load IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import time, json
import sys
sys.path.append("../../")
import os
import torch
import numpy as np
from tqdm import tqdm

import logging
from src.utils import logging_utils
from src.utils import env_utils, experiment_utils
from src import functional
import wandb

# Send DEBUG-and-above records to stdout using the project's shared
# message and date formats, so log lines show up in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
)

# Notebook-level logger (named after the running module).
logger = logging.getLogger(__name__)

# Record the torch build and the CUDA version it was compiled against.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

2024-10-29 19:38:25 __main__ INFO     torch.__version__='2.5.0+cu124', torch.version.cuda='12.4'


In [32]:
from src.dataset_manager import DatasetManager
# Display the available datasets keyed by group name (the output shows
# groups such as 'geometry_of_truth' and 'relations').
DatasetManager.list_datasets_by_group()

{'geometry_of_truth': ['sp_en_trans',
  'cities',
  'neg_cities',
  'smaller_than',
  'larger_than',
  'common_claim_true_false',
  'companies_true_false'],
 'relations': ['commonsense/word_sentiment',
  'commonsense/fruit_outside_color',
  'commonsense/task_done_by_person',
  'commonsense/work_location',
  'commonsense/task_done_by_tool',
  'commonsense/substance_phase',
  'commonsense/object_superclass',
  'factual/country_capital_city',
  'factual/person_plays_pro_sport',
  'factual/country_language',
  'factual/country_largest_city',
  'factual/food_from_country',
  'factual/landmark_in_country',
  'factual/superhero_archnemesis',
  'factual/city_in_country',
  'factual/superhero_person',
  'factual/person_plays_instrument',
  'factual/country_currency',
  'factual/person_plays_position_in_sport',
  'linguistic/word_last_letter',
  'linguistic/verb_past_tense',
  'linguistic/word_first_letter',
  'linguistic/adj_superlative',
  'linguistic/adj_comparative',
  'linguistic/adj_antony

In [34]:
# Build a loader over the ("tense", "tense") dataset in batches of 5.
# The commented-out entries are alternative (group, name) pairs kept for
# quick switching between datasets.
# NOTE(review): the "tense" dataset presumably reads the tense_processed.json
# that a later cell in this notebook writes — confirm, and reorder the cells
# if so, so the notebook survives Restart & Run All.
dataloader = DatasetManager.from_named_datasets(
    # [("geometry_of_truth", "cities")],
    # [("sst2", "sst2")],
    # [("relations", 'factual/country_capital_city')],
    [("tense", "tense")],
    batch_size=5
)
# Pull the first batch and display it to inspect the sample structure.
batch = next(iter(dataloader))
batch

[ContextQASample(context='We have completed the marathon race.', questions=['# Can we classify this sentence as being in the future tense?', '# Is this statement in the future tense?', '# Would you say this is written in the past tense?', '# Does this text reflect the future tense?', '# Would you say this is written in the past tense?', '# Does this sentence convey the present tense?', '# Would you say this is written in the present tense?', '# Is the narrative presented in the future tense?', '# Does this sentence use the future tense?', '# Is the verb form in this sentence present?'], answers=['No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes']),
 ContextQASample(context='The researchers have been conducting experiments to test their hypothesis.', questions=['# Is this statement in the future tense?', '# Would you say this is written in the past tense?', '# Would you say this is written in the future tense?', '# Would you say this is written in the present tense?', '# Am I

In [9]:
import pandas as pd
# Directory holding the raw tense data, under the project's default data dir.
root = os.path.join(env_utils.DEFAULT_DATA_DIR, "tense")
# Read the labelled sentences (columns used later: "sentence" and "tense").
# NOTE(review): ISO-8859-1 is presumably required because the file is not
# valid UTF-8 — confirm against the data source.
# NOTE(review): the "Unnamed: 0" column in the output looks like a saved
# index; it is not used by later cells.
df = pd.read_csv(os.path.join(root, "tense.csv"), encoding='ISO-8859-1')
df.head()

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present


In [10]:
# Canonical tense classes; matching below returns the first one found.
classes = ["present", "past", "future"]


def trim_label_to_class(label):
    """Map a free-form label onto one of the canonical tense ``classes``.

    The label is stripped and lower-cased, then the first entry of
    ``classes`` that occurs in it as a substring is returned. This lets both
    a noisy CSV cell (e.g. ``" Past "``) and a file name that contains a
    class name resolve to that class.

    Args:
        label: Raw label text. It is converted with ``str`` before matching,
            so a non-string value (e.g. a missing CSV cell) fails with the
            descriptive error below rather than an ``AttributeError``.

    Returns:
        The matching entry of ``classes``.

    Raises:
        AssertionError: If no class name occurs in ``label``.
    """
    # Normalise once instead of on every loop iteration.
    normalized = str(label).strip().lower()
    for cls in classes:
        if cls in normalized:
            return cls
    # Raise explicitly: a bare `assert False` is stripped under `python -O`,
    # which would make this function silently return None.
    raise AssertionError(f"Could not find class in label: {label}")

# Pair each whitespace-trimmed sentence with its canonical tense class.
# Iterating the two columns directly avoids DataFrame.iterrows(), which
# builds a Series per row and yielded an index that was never used.
ds = [
    (sentence.strip(), trim_label_to_class(tense))
    for sentence, tense in zip(df["sentence"], df["tense"])
]
ds[:5]

[('I am eating breakfast', 'present'),
 ('She will go to the park', 'future'),
 ('They played soccer yesterday', 'past'),
 ('I will be going to the concert', 'future'),
 ('She is eating lunch now', 'present')]

In [11]:
import random

# Directory with one sentence-per-line file per tense; the class is recovered
# from each file's name via trim_label_to_class.
pos_root = os.path.join(env_utils.DEFAULT_DATA_DIR, "tense/tense_wise_sentence")
pos_ds = []

# os.listdir() returns entries in arbitrary order; sort for a stable load order.
for label_csv in sorted(os.listdir(pos_root)):
    label = trim_label_to_class(label_csv)
    # NOTE(review): opened with the platform-default encoding, whereas
    # tense.csv is read as ISO-8859-1 — confirm which encoding these files use.
    with open(os.path.join(pos_root, label_csv), "r") as f:
        for line in f:
            sentence = line.strip()
            # Skip blank lines so no empty-string "sentence" gets a label.
            if sentence:
                pos_ds.append((sentence, label))

# Seeded, local RNG: the preview below is reproducible across runs and the
# global `random` state is left untouched.
random.Random(0).shuffle(pos_ds)
pos_ds[:5]

[('We have gone on a road trip to a national park.', 'present'),
 ('I have been waiting for the train.', 'present'),
 ('The rain brought much-needed relief to the drought.', 'past'),
 ('We explored a cave on our adventure.', 'past'),
 ('She performed an impressive dance routine at the competition.', 'past')]

In [12]:
# Strip double quotes *before* de-duplicating, so sentences that differ only
# in quoting collapse into a single entry. Sort the result so it does not
# depend on set iteration order (string hashing is randomised per process).
# NOTE(review): a sentence that appears with two different labels is still
# kept as two entries — confirm whether such conflicts exist in the data.
full_ds = sorted(
    {(sentence.replace("\"", ""), label) for sentence, label in ds + pos_ds}
)
print(len(full_ds))

ds_processed = [
    {"sentence": sentence, "label": label}
    for sentence, label in full_ds
]

# Seeded, local RNG on top of the sorted list: the saved file is identical
# from run to run, and the global `random` state is left untouched.
random.Random(0).shuffle(ds_processed)

# Persist the processed dataset next to the raw CSV.
with open(os.path.join(root, "tense_processed.json"), "w") as f:
    json.dump(ds_processed, f)

6748


In [30]:
from src.dataset_manager import DatasetLoader, ContextQASample, NUM_QA_PER_SAMPLE, YES_TOKEN, NO_TOKEN
import os
from src.utils import env_utils
import json
import random
from src.dataset_manager import TenseDatasetLoader
    
# Instantiate the project's tense loader with its default arguments and load
# the dataset; the result is indexed in the next cell.
tense_loader = TenseDatasetLoader()
tense_ds = tense_loader.load()

In [31]:
# Inspect the raw attributes of one loaded sample (the output shows
# 'context', 'questions' and 'answers').
tense_ds[2].__dict__

{'context': 'By the end of the decade, renewable energy will be accessible to everyone.',
 'questions': ['# Is the action described here in the past tense?',
  '# Is the narrative presented in the future tense?',
  '# Is the narrative presented in the future tense?',
  '# Does this sentence use the future tense?',
  '# Would you identify this as an example of the future tense?',
  '# Am I correct in saying this is in the past tense?',
  '# Am I correct in saying this is in the present tense?',
  '# Would you say this is written in the present tense?',
  '# Is the action described here in the future tense?',
  '# Would you say this is written in the past tense?'],
 'answers': ['No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No']}