In [1]:
import cohere
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm
import pandas as pd
import time
load_dotenv()

True

In [14]:
# The dataset file is read line by line: every line is parsed as its own JSON record.
with open('data/News_Category_Dataset_v3.json', 'r') as f:
    lines = list(f)
data = pd.DataFrame([json.loads(raw_record) for raw_record in lines])

# Keep only rows whose short_description is present and longer than 10 characters.
data = data.dropna(subset=['short_description'])
data = data.loc[data.short_description.map(len) > 10].copy()

In [15]:
path_examples = "data/train.json"
with open(path_examples, "r") as f:
    examples_raw = json.load(f)

# Flatten the {label: [text, ...]} mapping into one ClassifyExample per text.
cohere_examples = [
    cohere.ClassifyExample(text=text, label=label)
    for label, texts in examples_raw.items()
    for text in texts
]

In [16]:

def parse_response(response):
    """
    Flatten a classify response into a list of plain, JSON-serialisable records.

    The function iterates over ``response.classifications``. From every item it
    reads ``input``, ``prediction``, ``confidence`` and ``labels``, where
    ``labels`` is a mapping from label name to an object carrying a
    ``confidence`` attribute, e.g.:

        ClassifyResponseClassificationsItem(
            input='The role of credit scores in lending decisions is significant.\\n',
            prediction='Finance',
            confidence=0.4602186,
            labels={
                'Education': ClassifyResponseClassificationsItemLabelsValue(confidence=0.0662585),
                'Finance': ClassifyResponseClassificationsItemLabelsValue(confidence=0.4602186),
                ...
            },
            classification_type='single-label')

    Returns:
        A list with one dict per classification, with the fields:
            "input": the classified text,
            "prediction": the predicted label,
            "confidence_prediction": confidence of the predicted label,
            "labels": list of all label names,
            "confidence_labels": list of confidences, in the same order as "labels".
    """
    def to_record(item):
        # Keys and values are read from the same mapping, so both lists share one order.
        label_scores = item.labels
        return {
            "input": item.input,
            "prediction": item.prediction,
            "confidence_prediction": item.confidence,
            "labels": [name for name in label_scores],
            "confidence_labels": [score.confidence for score in label_scores.values()],
        }

    return [to_record(item) for item in response.classifications]



In [21]:


def _list_checkpoints(folder):
    """Return sorted ``(batch_index, file_name)`` pairs for the ``test_batch_<i>.json`` files in ``folder``."""
    prefix, suffix = "test_batch_", ".json"
    found = []
    for name in os.listdir(folder):
        # Ignore anything that is not one of our checkpoints (temp files, .DS_Store, ...).
        if name.startswith(prefix) and name.endswith(suffix):
            index_text = name[len(prefix):-len(suffix)]
            if index_text.isdecimal():
                found.append((int(index_text), name))
    # os.listdir gives no ordering guarantee; sort numerically so batch 10 follows batch 9.
    return sorted(found)


def cohere_classification(batch_size, rate_limit, checkpointing_folder_path, output_file_path, inputs, cohere_examples):
    """
    Classify ``inputs`` in batches with ``co.classify``, checkpoint every batch to
    disk, then merge all checkpoints into ``output_file_path``.

    Args:
        batch_size: Number of inputs sent per ``co.classify`` call.
        rate_limit: Maximum number of API calls per minute. When the average call
            rate since the last pause reaches this value, the function sleeps for
            60 seconds and restarts its counter.
        checkpointing_folder_path: Folder that receives one ``test_batch_<i>.json``
            file per processed batch (``i`` starts at 1). It is created if missing
            and may be given with or without a trailing path separator.
        output_file_path: JSON file that receives the concatenated records of all
            checkpoints, ordered by batch index.
        inputs: List of texts to classify.
        cohere_examples: Labelled examples forwarded to ``co.classify``.

    Returns:
        None. Results are written to the checkpoint folder and ``output_file_path``.

    Resuming:
        If the folder already holds checkpoints, processing restarts after the
        highest batch index found. Batches are re-derived from ``inputs`` and
        ``batch_size``, so both must match the interrupted run.
    """
    ## Make sure the folder exists and find the last completed batch (0 if none)
    os.makedirs(checkpointing_folder_path, exist_ok=True)
    existing_checkpoints = _list_checkpoints(checkpointing_folder_path)
    last_batch = existing_checkpoints[-1][0] if existing_checkpoints else 0
    if last_batch:
        print(f"Resuming from batch {last_batch}")

    ## Initialize the client
    co = cohere.Client(os.getenv("COHERE_API_KEY"))

    ## Split the inputs into batches
    batches_list = [inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)]

    ## Initialize the rate-limiting window
    current_nb_api_calls = 0
    start_time = time.time()

    # Checkpoint indices are 1-based: batches_list[k] is saved as test_batch_{k + 1}.json.
    for i, batch in enumerate(tqdm(batches_list[last_batch:]), start=last_batch + 1):
        ## API call
        response = co.classify(
            inputs=batch,
            examples=cohere_examples,
        )
        current_nb_api_calls += 1
        ## Parse the response
        records = parse_response(response)

        ## Save the data
        # Write to a temp file and rename it, so an interrupted write never leaves a
        # truncated checkpoint that a later resume would treat as completed.
        file_path = os.path.join(checkpointing_folder_path, f"test_batch_{i}.json")
        tmp_path = file_path + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(records, f)
        os.replace(tmp_path, file_path)

        ## Sleep if rate limit reached
        # Clamp the elapsed time: a coarse clock can report 0 s and would divide by zero.
        elapsed = max(time.time() - start_time, 1e-9)
        current_rate = 60 * current_nb_api_calls / elapsed
        if current_rate >= rate_limit:
            time_to_sleep = 60
            print(f"Rate limit reached, sleeping for {time_to_sleep} seconds")
            time.sleep(time_to_sleep)
            current_nb_api_calls = 0
            start_time = time.time()

    ## Gather all the results, in batch order
    all_results = []
    for _, file_name in _list_checkpoints(checkpointing_folder_path):
        with open(os.path.join(checkpointing_folder_path, file_name), "r") as f:
            all_results += json.load(f)

    ## Save the results
    with open(output_file_path, "w") as f:
        json.dump(all_results, f)

    

In [23]:
checkpointing_folder_path = "data/checkpoints/"
output_file_path = "data/train_from_news_dataset.json"

## Gather all the results
# os.listdir returns names in arbitrary order, so the checkpoint files are sorted by
# their numeric batch index (test_batch_<i>.json) to keep records in batch order.
# Files that do not follow that naming are skipped.
files = sorted(
    (
        name for name in os.listdir(checkpointing_folder_path)
        if name.startswith("test_batch_")
        and name.endswith(".json")
        and name[len("test_batch_"):-len(".json")].isdecimal()
    ),
    key=lambda name: int(name[len("test_batch_"):-len(".json")]),
)
all_results = []
for file in files:
    with open(os.path.join(checkpointing_folder_path, file), "r") as f:
        # Deliberately not bound to `data`: that name holds the news DataFrame
        # that the classification cell reads `short_description` from.
        all_results += json.load(f)

## Save the results
with open(output_file_path, "w") as f:
    json.dump(all_results, f)

In [24]:
# Total number of classified records gathered from the checkpoint files.
len(all_results)

49840

In [22]:

# Texts to classify: the short descriptions kept by the filtering step.
inputs = data["short_description"].tolist()

# Run settings.
batch_size = 80   # texts sent per classify call
rate_limit = 90   # API calls per minute before the function pauses
check_pointing_folder_path = "data/checkpoints/"
output_file_path = "data/train_from_news_dataset.json"

cohere_classification(
    batch_size=batch_size,
    rate_limit=rate_limit,
    checkpointing_folder_path=check_pointing_folder_path,
    output_file_path=output_file_path,
    inputs=inputs,
    cohere_examples=cohere_examples,
)

 26%|██▋       | 623/2354 [15:54<44:10,  1.53s/it]   


TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}

In [3]:
with open("data/test_all.json", "r") as f:
    data = json.load(f)

# Keep two fields of every record: the predicted label and the text it was predicted for.
labels = []
texts = []
for record in data:
    labels.append(record["prediction"])
    texts.append(record["input"])

df = pd.DataFrame({"label": labels, "text": texts})

In [5]:
# Export the label/text table as CSV; index=False leaves the row index out of the file.
df.to_csv("data/test_all.csv", index=False)