In [44]:
import together
import os
import tqdm
import smart_open
from dolma.core.paths import glob_path
import random
import json
import copy
import openai
from collections import Counter

random.seed(42)
# client = together.Together(api_key=os.environ.get("TOGETHER_API_KEY"))
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
base_path = "s3://ai2-llm/pretraining-data/sources/cccc/v2/documents/*/*.gz"
all_paths = list(glob_path(base_path))
random.shuffle(all_paths)

# response = client.chat.completions.create(
#     model="mistralai/Mixtral-8x7B-Instruct-v0.1",
#     messages=[{"role": "user", "content": "tell me about new york"}],
# )
# print(response.choices[0].message.content)

In [39]:
prompt = """
Given the following HTML snippet enclosed in ```quotes```, respond IMAGE if the Creative Common license in it refers to an image, TEXT if it refers to text. DO NOT return any text besides IMAGE or TEXT.

```html
{snippet}
```
"""

In [31]:
# Grabbing documents
documents = []
per_snapshot = 100
total_snapshots = 100

with tqdm.tqdm(desc="files", position=0) as tf, tqdm.tqdm(desc="lines", position=1) as tl:
    for j, path in enumerate(all_paths):
        with smart_open.open(path) as f:
            for i, line in enumerate(f):
                documents.append(json.loads(line))
                tl.update(1)
                if i >= per_snapshot:
                    break
        tf.update(1)
        if j >= total_snapshots:
            break

lines: 10201it [01:11, 141.75it/s]
files: 101it [01:11,  1.40it/s]


In [None]:
responses = []
# model_name = "meta-llama/Llama-3-70b-chat-hf"
model_name = "gpt-4o"
print(prompt)

for doc in tqdm.tqdm(documents):
    extracted_licenses = {
        license_name: license_snippet[0].strip()
        for license_name, license_snippet in doc["metadata"]["attribute_spans"].items()
        if "copyright" not in license_name and isinstance(license_snippet, list)
    }

    labeled_licenses = {}
    for license_name, license_snippet in extracted_licenses.items():
        response = client.chat.completions.create(
            model=model_name,
            temperature=0.2,
            messages=[{"role": "user", "content": prompt.format(snippet=license_snippet)}],
        )
        labeled_licenses[license_name] = {
            "label": response.choices[0].message.content,
            "snippet": license_snippet,
            "model": model_name,
        }

    doc_with_labels = copy.deepcopy(doc)
    doc_with_labels["metadata"]["labeled_licenses"] = labeled_licenses
    # print(json.dumps({'url': doc_with_labels['metadata']['warc_url'], 'licenses': labeled_licenses}, indent=2))
    responses.append(doc_with_labels)

In [46]:
license_counter = Counter()
for page in responses:
    license_counter.update([label["label"] for label in page["metadata"]["labeled_licenses"].values()])

print(license_counter)

Counter({'TEXT': 4268, 'IMAGE': 2398})
