In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import time, json
import sys
sys.path.append("../../")
import os
import torch
import numpy as np
from tqdm import tqdm

import logging
from src.utils import logging_utils
from src.utils import env_utils, experiment_utils
from src import functional
import wandb

# Configure the root logger first: INFO and above go to stdout (so messages show
# up in the cell output), using the project's shared format and date format.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
)

# Notebook-level logger; its records propagate to the root handler set up above.
logger = logging.getLogger(__name__)

# Record the torch / CUDA versions this notebook was run with.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

  from .autonotebook import tqdm as notebook_tqdm


2024-10-30 11:14:30 __main__ INFO     torch.__version__='2.5.0+cu124', torch.version.cuda='12.4'


In [4]:
from src.models import ModelandTokenizer
# Load Llama-3.2-3B through the project's ModelandTokenizer wrapper; later cells
# reach the tokenizer via `mt.tokenizer`.
# In the recorded run the model was loaded as float16 on cuda:0 (about 6.1 GB).
mt = ModelandTokenizer(model_key="meta-llama/Llama-3.2-3B")

2024-10-30 11:15:32 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]

2024-10-30 11:15:35 src.models INFO     loaded model </home/local_arnab/Codes/00_MODEL/meta-llama/Llama-3.2-3B> | size: 6127.841 MB | dtype: torch.float16 | device: cuda:0





In [7]:
# Three prompts of different lengths, used to see how the tokenizer pads and
# truncates when every row is forced to the same length.
prompts = ["What is the capital of France?", "This is a", "Once upon a time"]

# Force every row to exactly 5 tokens: longer prompts are cut off, shorter ones
# are padded. In the recorded output the short prompt's mask starts with 0,
# i.e. the padding was added on the left.
inputs = mt.tokenizer(
    prompts,
    max_length=5,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# 1 marks a real token, 0 marks padding.
inputs.attention_mask

tensor([[1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])

In [9]:
mt.tokenizer.decode(inputs.input_ids[0])  # Decode the first prompt's token ids back to text (shows the effect of truncation)

'<|begin_of_text|>What is the capital'

In [3]:
from src.dataset_manager import DatasetManager
# Show every available dataset as a mapping of group name -> list of dataset names.
DatasetManager.list_datasets_by_group()

2024-10-29 23:10:26 numexpr.utils INFO     Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-10-29 23:10:26 numexpr.utils INFO     NumExpr defaulting to 8 threads.


2024-10-29 23:10:26 datasets INFO     PyTorch version 2.5.0 available.


{'geometry_of_truth': ['sp_en_trans',
  'cities',
  'neg_cities',
  'smaller_than',
  'larger_than',
  'common_claim_true_false',
  'companies_true_false'],
 'relations': ['commonsense/word_sentiment',
  'commonsense/fruit_outside_color',
  'commonsense/task_done_by_person',
  'commonsense/work_location',
  'commonsense/task_done_by_tool',
  'commonsense/substance_phase',
  'commonsense/object_superclass',
  'factual/country_capital_city',
  'factual/person_plays_pro_sport',
  'factual/country_language',
  'factual/country_largest_city',
  'factual/food_from_country',
  'factual/landmark_in_country',
  'factual/superhero_archnemesis',
  'factual/city_in_country',
  'factual/superhero_person',
  'factual/person_plays_instrument',
  'factual/country_currency',
  'factual/person_plays_position_in_sport',
  'linguistic/word_last_letter',
  'linguistic/verb_past_tense',
  'linguistic/word_first_letter',
  'linguistic/adj_superlative',
  'linguistic/adj_comparative',
  'linguistic/adj_antony

In [4]:
# Build a dataloader over one named dataset and pull a single batch to inspect.
# Each entry is a (group, dataset_name) pair as listed by
# DatasetManager.list_datasets_by_group(); the commented-out lines are
# alternative selections -- swap which line is uncommented to load another dataset.
dataloader = DatasetManager.from_named_datasets(
    # [("geometry_of_truth", "cities")],
    # [("sst2", "sst2")],
    # [("relations", 'factual/country_capital_city')],
    # [("tense", "tense")],
    # [("language_identification", "language_identification")],
    # [("singular_plural", "singular_plural")],
    [("ag_news", "ag_news")],
    batch_size=5
)
# First batch only; in the recorded output it displays as a list of ContextQASample objects.
batch = next(iter(dataloader))
batch

[ContextQASample(context="Spain, Lithuania Go 5-0 in Men's Hoops (AP)\n\nAP - Spain could have tanked its game against New Zealand, thereby eliminating European rival Serbia-Montenegro from medal contention. Instead, the Spaniards came to play Monday and won 88-84, knocking New Zealand out and keeping the defending world champions in medal contention.", questions=['# How relevant is this article to World News?', '# Would you classify this as an article about Business?', '# Does the content of this article pertain to Sports?', '# Does this writing concentrate on aspects of Business?', '# Would you say this piece focuses on Sports?', '# Does this text explore Science/Technology?', '# Is there a substantial focus on Sports in this article?', '# Can this be considered a Sports-related article?', '# Can this article be categorized under Sports?', '# Could this be summarized as an article about Sports?'], answers=['No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes']),
 ContextQA

In [40]:
from anthropic import Anthropic

def ask_claude(
    prompt: str,
    model: str = "claude-3-5-sonnet-20241022",
    max_tokens: int = 7000,
    temperature: float = 0,
    system: str = "You are a helpful assistant.",
    client=None,
) -> str:
    """Send a single-turn user prompt to Claude and return the reply text.

    Args:
        prompt: Text sent as the only user message.
        model: Anthropic model identifier passed to the API.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.
        system: System prompt for the conversation.
        client: Optional pre-built client exposing ``messages.create(...)``.
            When omitted, a new ``Anthropic`` client is created with the key
            read from the ``CLAUDE_KEY`` environment variable. Passing a client
            lets callers reuse one connection across many calls (or inject a
            stub in tests).

    Returns:
        The concatenated text of every text block in the reply, or ``""`` if
        the reply contains no text block.
    """
    if client is None:
        client = Anthropic(api_key=os.getenv("CLAUDE_KEY"))

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            }
        ],
    )

    # A reply that hit the token budget is cut off mid-output (e.g. a partial
    # CSV); surface that instead of returning it silently.
    if getattr(response, "stop_reason", None) == "max_tokens":
        logging.getLogger(__name__).warning(
            f"Claude reply was truncated at max_tokens={max_tokens}; output is incomplete."
        )

    # The reply is a list of content blocks; keep only the text ones rather
    # than assuming the first block exists and is text.
    return "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )

# Smoke test: send one short question through the helper to confirm the API call works.
ask_claude("What is the capital of France?")

2024-10-29 21:31:48 httpx INFO     HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


'The capital of France is Paris. It is also the largest city in France and one of the most populous cities in Europe. Paris is known for its iconic landmarks such as the Eiffel Tower, the Louvre Museum, Notre-Dame Cathedral, and the Arc de Triomphe.'

In [41]:
# Few-shot prompt for generating the singular/plural sentence data: a CSV header
# followed by six labelled rows shows the model the schema to continue
# (one sentence per row, labelled `single` or `multiple`).
# The header column is spelled `sentence` so the requested schema carries the
# correct column name.
prompt = """Give me a list of simple sentences in English that mentions either a single person or multiple people.
It should be in CSV format. Give me 300 examples.

sentence,n_subjects
Andy Garcia is an actor.,single
Andy and Gabrielle went to a party.,multiple
The children are playing in the park.,multiple
My grandparents are in town.,multiple
The pen name of Samuel Clemens is Mark Twain.,single
Mary's husband is a doctor.,single
"""
response = ask_claude(prompt)

2024-10-29 21:32:34 httpx INFO     HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


In [42]:
# Show the raw reply; in the recorded run it is a one-line preamble followed by the CSV rows.
print(response)

Here's a CSV list of 300 sentences with single or multiple subjects:

sentence,n_subjects
Andy Garcia is an actor.,single
Andy and Gabrielle went to a party.,multiple
The children are playing in the park.,multiple
My grandparents are in town.,multiple
The pen name of Samuel Clemens is Mark Twain.,single
Mary's husband is a doctor.,single
John is reading a book.,single
Sarah and Mike are dancing.,multiple
The teacher is writing on the board.,single
Students are taking their exam.,multiple
My sister lives in Paris.,single
Tom and Jerry are cartoon characters.,multiple
The baby is sleeping.,single
The twins are identical.,multiple
David plays the guitar.,single
Emma and Olivia are best friends.,multiple
The president gave a speech.,single
The team won the championship.,multiple
My brother works at the bank.,single
The couple is getting married.,multiple
Peter is tall.,single
Jack and Jill went up the hill.,multiple
The musician performed brilliantly.,single
The choir sang beautifully.,mul

In [43]:
from src.dataset_manager import DatasetLoader, ContextQASample, NUM_QA_PER_SAMPLE, YES_TOKEN, NO_TOKEN
import os
from src.utils import env_utils
import json
import random
from datasets import load_dataset
import pandas as pd

# from src.dataset_manager import TenseDatasetLoader
# from src.dataset_manager import LanguageIDDatasetLoader
from src.dataset_manager import SingularPluralDatasetLoader

# Instantiate the singular/plural dataset loader and load its samples.
# The next cell slices `ds` and displays ContextQASample objects.
loader = SingularPluralDatasetLoader()
ds = loader.load()

In [44]:
# Peek at the first five loaded samples (context, questions, and Yes/No answers).
ds[:5]

[ContextQASample(context='Andy Garcia is an actor.', questions=['# Is this text discussing more than one individual?', '# Does this statement indicate a plural subject?', '# Is this about more than one person?', '# Is this sentence referring to one individual?', '# Would you say this sentence mentions multiple people?', '# Is the reference here to one individual?', '# Can we interpret this as referring to a single entity?', '# Does this passage concern a single individual?', '# Does this passage concern a single individual?', '# Is the reference here to more than one individual?'], answers=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No']),
 ContextQASample(context='Andy and Gabrielle went to a party.', questions=['# Is this about one person?', '# Can we interpret this as referring to a single entity?', '# Is this about one person?', '# Am I correct in saying this is about multiple people?', '# Is this sentence referring to more than one individual?', '# Is the referenc

In [51]:
# Inspect the third sample of the current `batch` as a plain dict of its fields.
# NOTE(review): the recorded output is a singular/plural sample, while the
# dataloader cell as saved selects ag_news -- `batch` appears to come from an
# earlier run with a different dataset line uncommented; confirm before relying on it.
batch[2].__dict__

{'context': 'The carriers completed routes.',
 'questions': ['# Does this passage concern a single individual?',
  '# Is this statement about one entity?',
  '# Does this sentence focus on a single person?',
  '# Would you say this sentence mentions multiple people?',
  '# Is this narrative about a single subject?',
  '# Does this statement indicate a plural subject?',
  '# Is this text discussing more than one individual?',
  '# Does this text feature multiple individuals?',
  '# Does this statement indicate a singular subject?',
  '# Is the reference here to one individual?'],
 'answers': ['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']}

In [55]:
# Only the dataset group names (the keys of the group -> datasets mapping).
list(DatasetManager.list_datasets_by_group().keys())

['geometry_of_truth',
 'relations',
 'sst2',
 'md_gender',
 'ag_news',
 'ner',
 'tense',
 'language_identification',
 'singular_plural']