In [1]:
import os
import openai
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

# Set up your OpenAI API key
api_key = os.getenv('API_KEY')

print(api_key)
sample_size_train = 1000
sample_size_val = 200
sample_size_test = 200


  from .autonotebook import tqdm as notebook_tqdm


sk-proj-mIJ5Km0WKFJrzMmMEFltT3BlbkFJ8P7EYaTy1VWxyq4hAeHG


In [5]:
# Load CONLL Dataset
data = load_dataset('conll2003')

In [2]:
system_message = {
    "role": "system",
    "content": (
        "You are an expert in document classification. "
        "Classify the provided text into one of these categories by responding with only the category number: "
        "0 for World, 1 for Sport, 2 for Business, 3 for Technology, 4 for Other. "
        "Choose the most relevant category if the text fits into multiple categories. "
        "Do not provide any additional information or explanation in your response."
    )
}

In [20]:
def get_document_label(text):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[system_message,{"role": "user", "content": f"{text}"}],
            max_tokens = 1, # This is added to avoid a buggy non-digit response costing tokens 
            temperature = 0.0 # We want deterministic responses
        )
        label = response.choices[0].message['content']
        tokens_used = response['usage']['total_tokens']
        return label, tokens_used
    except Exception as e:
        print(f"Error in API call: {e}")
        return None, 0


In [18]:
def augment_dataset(dataset, type='train', sample_size=1000):
    dataset = dataset.shuffle(seed=42).select(range(sample_size))
    
    augmented_data = []
    total_tokens = 0

    for sentence in tqdm(dataset, desc=f"Augmenting the {type} set"):
        sentence_txt = " ".join(sentence['tokens'])
        print(sentence_txt)
        sentence_label, tokens_used = 0,1 #get_document_label(sentence_txt)
        total_tokens += tokens_used
        if sentence_label is not None:
            augmented_data.append({
                'tokens': sentence['tokens'],
                'ner_tags': sentence['ner_tags'],
                'sentence_label': sentence_label
            })
    
    print(f"Total tokens used for {type} set: {total_tokens}")
    return pd.DataFrame(augmented_data), total_tokens


In [19]:

# Augment datasets
train_augmented = augment_dataset(data['train'], 'train', sample_size=sample_size_train)  # Adjust sample size as needed
val_augmented = augment_dataset(data['validation'], 'validation', sample_size=sample_size_val)
test_augmented = augment_dataset(data['test'], 'test', sample_size=sample_size_test)

# Save augmented datasets
train_augmented, train_tokens = augment_dataset(data['train'], 'train', sample_size=sample_size_train)  # Adjust sample size as needed
val_augmented, val_tokens = augment_dataset(data['validation'], 'validation', sample_size=sample_size_val)
test_augmented, test_tokens = augment_dataset(data['test'], 'test', sample_size=sample_size_test)


print("Dataset augmentation complete. Files saved as CSV.")
print(f"Total tokens used = {train_tokens + val_tokens + test_tokens}, approximate price: {0.00050 * (train_tokens + val_tokens + test_tokens)/1000} assuming gpt-3.5-turbo-0125 on 02.07.2024")

Augmenting the train set: 100%|██████████| 1000/1000 [00:00<00:00, 6286.78it/s]


" Neither the National Socialists ( Nazis ) nor the communists dared to kidnap an American citizen , " he shouted , in an oblique reference to his extradition to Germany from Denmark . "
TUNIS 1996-08-22
Werder Bremen 3 0 1 2 4 6 1
Heavy fighting broke out between two rival Kurdish factions in northern Iraq at midnight Sunday and at least 29 people were killed , one of the groups said on Monday .
LONDON 1996-08-28
-- Jonathan Birt , London Newsroom +44 171 542 7717
New York Dow Jones industrial average -- 5,778.00 ( May 22/96 )
Officials from the Organisation for Security and Cooperation in Europe ( OSCE ) are considering the postponement following allegations of serious irregularities in the registration of Serb refugees .
It said the KDP was responsible for breaking the previous ceasefire by refusing to endorse it publicly .
Smirnov said the Olympic Committee might ask the government to take measures to protect the country 's best athletes , some of whom have already chosen to live a

Augmenting the validation set:   0%|          | 0/200 [00:00<?, ?it/s]

Pro-Moscow leaders in Chechnya have criticised Tim Guldimann , the Swiss diplomat who heads the OSCE Chechnya mission , saying he was biased toward Zelimkhan Yandarbiyev , president of the self-declared separatist government .
That followed a revised 0.7 percent decline in June orders .
SEOUL 1996-08-30
Notts County 3 1 1 1 2 2 4
Women 's 100 metres
The world 's costliest footballer Alan Shearer was named as the new England captain on Friday .
The September bond future on LIFFE was trading at 115.45 , down 0.13 from Thursday 's settlement price .
" They could cause serious damage as much as 500 meters ( yards ) away from wherever they were detonated , " the spokesman added .
Extras ( lb-3 nb-6 w-7 ) 16
W D L PCT GB
Singapore hanged a Thai farmer at Changi Prison on Friday for drug trafficking , the Central Narcotics Bureau ( CNB ) said .
" There is no doubt that Chechnya , according to OSCE principles , belongs to a state called Russia , " he said , pointing out that Russia was an OSCE

Augmenting the validation set: 100%|██████████| 200/200 [00:00<00:00, 5461.80it/s]


" Then , with cool heads , calmly and soberly we will sort out our relations , " Lebed said after the late-night signing ceremony in this settlement outside Chechnya 's eastern border .
CHICAGO 1996-08-30
It fires me up , makes me play my best tennis , " Tarango said .
Britain condemns Iraq involvement in Arbil attack .
Payrolls of manufacturing companies rose in July by $ 2.3 billion to an annual rate of $ 678 billion .
Kent will also need to keep their nerve against struggling Nottinghamshire who will enter the final day 137 ahead with four wickets left in a relatively low-scoring match at Tunbridge Wells .
President Clinton , Bob Dole and Ross Perot are hitting the road now that the partying is over , and people who have billions of dollars invested in stocks were bracing for political promises that could have an impact on their wealth .
China cities to ban disposable plastic containers .
Note - Figures are unadjusted , in billions of dollars .
Ince was clambering over a wall at the

Augmenting the test set: 100%|██████████| 200/200 [00:00<00:00, 6692.36it/s]


Hartford 4 BOSTON 2
S. Doull c subs ( M. Wasim ) b Waqar 1
Camilla Martin ( Denmark ) beat Wang Chen ( China ) 11-0 12-10
Swiss skiers occupied the other two places on the podium , Karin Kuster taking second with 160.55 narrowly ahead of Evelyne Leu with 160.36 .
Montpellier 20 3 9 8 17 24 18
Winds from the northeast at 10 to 15 knots ( 19 to 28 kilometers / 11 to 17 miles per hour ) .
Cambridge United 0 Woking 2
Crude petroleum 2,557 2,832 15,838 17,648
SOCCER - LEADING SCOTTISH PREMIER DIVISION SCORERS .
Mongolia 's state copyright official , Gundegma Jargalshaihan , said apologetically that he had just arrived from Ulan Bator and was not aware of the details of the digital agenda .
LONDON 1996-12-07
" Internet is a potential cash cow for copyright-based industries and we need roadmaps on the information superhighway , " said Marc Pearl , vice-president of the Information Technology Association of America , a trade association of U.S. network companies opposing the treaties .
Feb 97 

Augmenting the train set:  50%|████▉     | 495/1000 [00:00<00:00, 4943.63it/s]

" Neither the National Socialists ( Nazis ) nor the communists dared to kidnap an American citizen , " he shouted , in an oblique reference to his extradition to Germany from Denmark . "
TUNIS 1996-08-22
Werder Bremen 3 0 1 2 4 6 1
Heavy fighting broke out between two rival Kurdish factions in northern Iraq at midnight Sunday and at least 29 people were killed , one of the groups said on Monday .
LONDON 1996-08-28
-- Jonathan Birt , London Newsroom +44 171 542 7717
New York Dow Jones industrial average -- 5,778.00 ( May 22/96 )
Officials from the Organisation for Security and Cooperation in Europe ( OSCE ) are considering the postponement following allegations of serious irregularities in the registration of Serb refugees .
It said the KDP was responsible for breaking the previous ceasefire by refusing to endorse it publicly .
Smirnov said the Olympic Committee might ask the government to take measures to protect the country 's best athletes , some of whom have already chosen to live a

Augmenting the train set: 100%|██████████| 1000/1000 [00:00<00:00, 5157.94it/s]


Wrexham 2 0 2 0 5 5 2
Gencor swells profit despite setbacks .
State and federal agents on Thursday sifted through the rubble of two predominantly black Arkansas churches that burned within minutes of one another late Tuesday and early Wednesday .
( A. Brown 56 not out ) .
Struggling French first division side Nice on Thursday announced they were parting with coach Albert Emon after a string of poor results .
Naish said there was no need for Britain to carry out a planned cull of some 147,000 cattle to which it had reluctantly agreed to placate its European partners .
N. Hussain lbw b Mushtaq Ahmed 51
Ijaz Ahmed c Stewart b Mullally 61
3. Margaret Crowley ( Australia ) 2:02.40
Leading overall placings after three stages :
Bochum 1 ( Jack 66th minute ) Arminia Bielefeld 1 ( Molata 59th ) .
Still , the feisty Indonesian got off to a 3-0 lead in the tie-breaker before a pair of costly double faults gave Graf her chance to avoid a third set .
She is scheduled to step down in January after a

Augmenting the validation set: 100%|██████████| 200/200 [00:00<00:00, 4920.67it/s]


Pro-Moscow leaders in Chechnya have criticised Tim Guldimann , the Swiss diplomat who heads the OSCE Chechnya mission , saying he was biased toward Zelimkhan Yandarbiyev , president of the self-declared separatist government .
That followed a revised 0.7 percent decline in June orders .
SEOUL 1996-08-30
Notts County 3 1 1 1 2 2 4
Women 's 100 metres
The world 's costliest footballer Alan Shearer was named as the new England captain on Friday .
The September bond future on LIFFE was trading at 115.45 , down 0.13 from Thursday 's settlement price .
" They could cause serious damage as much as 500 meters ( yards ) away from wherever they were detonated , " the spokesman added .
Extras ( lb-3 nb-6 w-7 ) 16
W D L PCT GB
Singapore hanged a Thai farmer at Changi Prison on Friday for drug trafficking , the Central Narcotics Bureau ( CNB ) said .
" There is no doubt that Chechnya , according to OSCE principles , belongs to a state called Russia , " he said , pointing out that Russia was an OSCE

Augmenting the test set: 100%|██████████| 200/200 [00:00<00:00, 5237.12it/s]

Hartford 4 BOSTON 2
S. Doull c subs ( M. Wasim ) b Waqar 1
Camilla Martin ( Denmark ) beat Wang Chen ( China ) 11-0 12-10
Swiss skiers occupied the other two places on the podium , Karin Kuster taking second with 160.55 narrowly ahead of Evelyne Leu with 160.36 .
Montpellier 20 3 9 8 17 24 18
Winds from the northeast at 10 to 15 knots ( 19 to 28 kilometers / 11 to 17 miles per hour ) .
Cambridge United 0 Woking 2
Crude petroleum 2,557 2,832 15,838 17,648
SOCCER - LEADING SCOTTISH PREMIER DIVISION SCORERS .
Mongolia 's state copyright official , Gundegma Jargalshaihan , said apologetically that he had just arrived from Ulan Bator and was not aware of the details of the digital agenda .
LONDON 1996-12-07
" Internet is a potential cash cow for copyright-based industries and we need roadmaps on the information superhighway , " said Marc Pearl , vice-president of the Information Technology Association of America , a trade association of U.S. network companies opposing the treaties .
Feb 97 


