In [1]:
import pandas as pd

df_small = pd.read_csv('data.csv')

* For this to work, we need to process big batches (100 lines)
* So fine tuning must also use this format
* or can we make parallel api calls?

In [4]:
from openai import OpenAI
import os
import dotenv
import numpy as np
import concurrent.futures

import time

#load .env file
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def predict_batches(texts, model, batch_size=40):
    '''
    Take a list of strings, predict the labels and return a list of predictions 
    '''
    predictions = {}
    for i in range(0, len(texts), batch_size):
        print(i)
        print(i)
        print(i)
        batch = texts[i:i+batch_size]
        
        combined_string = '\n'.join([str(j+1) + '. ' + '$$$' +text + '$$$' for j, text in enumerate(batch)])
    
        completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system", 
                "content": """Du får udleveret mange tekststykke. Du skal vurdere, om hver tekst er skrevet af et menneske eller en AI. Du skal svare 0, hvis du tror, ​​det er skrevet af et menneske, og 1, hvis du tror, ​​det er skrevet af en AI. Kun ét ciffer per text er tilladt. Du vil få input som er afmærket med et tal og 3 dollartegn ($$$) i starten og slutningen. Der skal til mellem hvert svar være et linjeskift (\n).
                
                Eksempel:
                1. $$$Tekst 1 er en lang text ....
                den dækker flere linjer
                Og har alverdens tegn i sig
                $$$
                2. $$$Tekst 2$$$

                3. $$$Tekst 3$$$
                ....
                Og du skal svare med:
                1. 0
                2. 1
                3. 0
                ...."""},
            {"role": "user", "content": combined_string}
        ]
        )
        print(completion.choices[0].message.content.split('\n'))
        predictions_batch = completion.choices[0].message.content.split('\n')
        for j in range(len(batch)):
            try:
                num = int(predictions_batch[j].split('. ')[0])
            except:
                num += 1
                print('Error getting text number', i, 'with text:', texts[i])
            
            try:
                
                predictions[int(num)-1+i] = '. '.join(predictions_batch[j].split('. ')[1:])
            except:
                print('Error getting pred', i, 'with text:', texts[i])

    print(predictions)
        
    
    pred_clean = []
    for i in range(len(texts)):
        if i not in predictions.keys():
            pred_clean.append(-1)
            print('Missing prediction for text number', i, 'with text:', texts[i])

        else:
            try:
                pred_clean.append(int(predictions[i]))
            except:
                pred_clean.append(-1)
                print('Cleaning failed for text number', i, 'with text:', texts[i])
    # convert to int
    pred_clean = [int(x) for x in pred_clean]
    return pred_clean

def predict(text, model="ft:gpt-3.5-turbo-0613:personal::8KV9rODr"):
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Du får udleveret et tekststykke. Du skal vurdere, om teksten er skrevet af et menneske eller en AI. Du skal svare 0, hvis du tror, ​​det er skrevet af et menneske, og 1, hvis du tror, ​​det er skrevet af en AI. Kun ét ciffer er tilladt."},
                {"role": "user", "content": text}
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print("Rate limit reached, sleeping for 30 milliseconds")
        time.sleep(0.03)
        return predict(text, model)  # Retry the request

def predict_parallel(texts):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(predict, texts))
    return results

def clean_up(x):
    if x == '0' or x == 0:
        return 0

    if x == '1' or x == 1:
        return 1

    return 1

# Example texts to be analyzed
texts = df_small['text'].values.tolist()

pred = predict_parallel(texts)

pred_clean = [clean_up(x) for x in pred]
pred_clean

[0, 1, 0, 1, 1, 1, 0, 1, 1, 0]

## Fine-tuning GPT

In [29]:
import numpy as np
# load texts and labels
def text_edit(text):
    # remove first two lines
    text = '\n'.join(text.split('\n')[2:])

    # cut longer than 2000 characters
    text = text[:2000]

    # # remove everything After Kilde:
    # text = text.split('Kilde:')[0]

    # remove empty lines
    # text = '\n'.join([line for line in text.split('\n') if line != ''])

    # remove new lines
    text = ' '.join(text.split('\n'))

    # remove double spaces
    # text = ' '.join(text.split('  '))

    # remove emojis but keep danish letters
    # text = ''.join([i if ord(i) < 128 else ' ' for i in text])

   

    # Assuming 'text' is defined and contains sentences
    sentences = text.split('.')
    num_sentences = len(sentences)
    if num_sentences > 4:

        # Create an exponential distribution for probabilities
        # The exponential distribution should favor lower indices for rand_low and higher indices for rand_high
        probabilities = np.exp(np.linspace(0, 2, num_sentences-1))
        probabilities /= probabilities.sum()  # Normalize to make it a valid probability distribution

        # Choose rand_low and rand_high using the defined probabilities
        rand_low = np.random.choice(np.arange(num_sentences-1), p=probabilities[::-1])
        rand_high = np.random.choice(np.arange(rand_low+1, num_sentences), 
                        p=probabilities[rand_low:] / probabilities[rand_low:].sum())

        # Join the selected range of sentences
        text = '.'.join(sentences[rand_low:rand_high])

    return text

def load_data():

    import os

    data = {
        'human': [],
        'bot': []
    }

    data_sources = {
        'human' : ['data/heste-nettet-nyheder/'],
        'bot' : ['data/heste-nettet-nyheder-ai/gpt-3.5-turbo/', 'data/heste-nettet-nyheder-ai/gpt-4-0613/']
    }

    for source in data_sources:
        for path in data_sources[source]:
            for filename in os.listdir(path):
                with open(path + filename, 'r', encoding='utf-8') as f:
                    content = f.read()
                    text = text_edit(content)
                    data[source].append(text)

    # cut to same length
    min_len = min(len(data['human']), len(data['bot']))
    data['human'] = data['human'][:min_len]
    data['bot'] = data['bot'][:min_len]
    
    # my_texts = np.array(data['human'] + data['bot'])
    # my_labels = np.array([0]*len(data['human']) + [1]*len(data['bot'])) 
    
        

    return data['human'], data['bot']

data_human, data_machine = load_data()

# save to json
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
import json

with open('data.jsonl', 'w') as f:
    for text in data_human:
        json_line = json.dumps({"messages": [
            {"role": "system", "content": "Du får udleveret et tekststykke. Du skal vurdere, om teksten er skrevet af et menneske eller en AI. Du skal svare 0, hvis du tror, ​​det er skrevet af et menneske, og 1, hvis du tror, ​​det er skrevet af en AI. Kun ét ciffer er tilladt."}, 
            {"role": "user", "content": text},
            {"role": "assistant", "content": "0"}
        ]})
        f.write(json_line + '\n')

    for text in data_machine:
        json_line = json.dumps({"messages": [
            {"role": "system", "content": "Du får udleveret et tekststykke. Du skal vurdere, om teksten er skrevet af et menneske eller en AI. Du skal svare 0, hvis du tror, ​​det er skrevet af et menneske, og 1, hvis du tror, ​​det er skrevet af en AI. Kun ét ciffer er tilladt."}, 
            {"role": "user", "content": text},
            {"role": "assistant", "content": "1"}
        ]})
        f.write(json_line + '\n')

# # Upload to OpenAI
# from openai import OpenAI
# client = OpenAI()

# client.files.create(
#     file=open("data.jsonl", "rb"),
#     purpose="fine-tune"
# )


FileObject(id='file-BIu87zRI6FM0pZT1X5O4fU8t', bytes=882964, created_at=1699891778, filename='data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [30]:
# from openai import OpenAI
# client = OpenAI()

# client.fine_tuning.jobs.create(
#   training_file="file-BIu87zRI6FM0pZT1X5O4fU8t",
#     model="gpt-3.5-turbo-1106",
# )

FineTuningJob(id='ftjob-rwP7gCpnDRINR6XU76z4FY7i', created_at=1699891790, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-87Ypq9qoCRxP9EESHWe2tRdz', result_files=[], status='validating_files', trained_tokens=None, training_file='file-BIu87zRI6FM0pZT1X5O4fU8t', validation_file=None)

## Apply to val set

In [8]:
df = pd.read_csv('data/val.csv')
df.head()

Unnamed: 0,text
0,"I min optik er god forskning det samme som """"s..."
1,"Jeg synes, der er rigtig gode erstatninger på ..."
2,"Selvom der er tale om en sag, der har været i ..."
3,"Vi er ikke i en storkonflikt lige nu, men det ..."
4,Ja! Det er helt horribelt at der fortsat sidde...


In [9]:
# use fine-tuned model
model_finetuned= "ft:gpt-3.5-turbo-0613:personal::8KV9rODr"


pred = predict_parallel(df['text'].values.tolist())
pred_clean = [clean_up(x) for x in pred]
pred_clean

Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, sleeping for 30 milliseconds
Rate limit reached, 

[1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,


In [11]:
df['prediction_finetuned'] = pred_clean
df['prediction_finetuned'].value_counts()

prediction_finetuned
1    737
0    345
Name: count, dtype: int64

In [13]:
# save to npz


np.savez('prediction_finetuned.npz', pred=pred_clean)
# load df['pred']
labels = np.load('prediction_finetuned.npz', allow_pickle=True)
labels['pred'].tolist()

[1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
