# Test the connection and API key

Make sure it's possible to use the OpenAI API. For this to work, the environment variable OPENAI_API_KEY must be set to a valid API key which has available credits.

In [2]:
import os
from openai import AzureOpenAI


# create client object for accessing API
client = AzureOpenAI(
  azure_endpoint = "https://kk-gpt-development.openai.azure.com/",
  api_key=os.getenv("AZURE_OPENAI_KEY"),
  api_version="2024-02-15-preview"
)

# model = "natlibfi-gpt35-turbo-16k"
model = "natlibfi-gpt4-32k"

# test the API connection by making a simple request
response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say this is a test."},
    ],
    temperature=0, max_tokens=7)
print(response)
print(response.choices[0].message.content)

ChatCompletion(id='chatcmpl-8xZB8VpjjnfIogwqNwiFNfyT1ABWt', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='This is a test.', role='assistant', function_call=None, tool_calls=None), content_filter_results={'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, logprobs=None)], created=1709207270, model='gpt-4-32k', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=5, prompt_tokens=23, total_tokens=28), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}])
This is a test.


# Get examples to be given to the model context

In [8]:
import glob
import json


dataset_train_files = glob.glob("../../llm-dataset/*-train.jsonl")
train_records = []
for train_set_file in dataset_train_files:
    with open(train_set_file) as trainfile:
        train_records.extend(
            [json.loads(line) for line in trainfile])
    continue  # Read just one record from each file

examples = []
for rec in train_records:
    print(rec["content"]["pdfinfo"], rec['content']['pages'][0]['text'])
    examples.append((rec["content"]["pdfinfo"], rec['content']['pages'][0]['text']))


{'title': 'Title', 'author': 'Name', 'creationDate': "D:20220611120145+03'00'", 'modDate': "D:20220611120146+03'00'"} SUSANNE OLSSON and JONAS SVENSSON ‘One of the most important questions that human beings have to understand’ Salafism as Islamic deferentialist fundamentalism DOI: https://doi.org/10.30664/ar.112804 Attribution 4.0 International (CC BY 4.0) n the present article, the authors argue that the Adraoui 2020). While Salafism has been study of Salafism as a contemporary Islamic firmly established as a descriptive term in new religious movement could benefit from the scholarly study of Islam (WeissmannIan analytical perspective separating fundamen­ talism into the modes of inferentialism and def­ 2017), it also serves as a self-designation erentialism. The basics of these concepts are by members of this movement, denoting a outlined and discussed in relation to different self-perceived, strict and detailed imitation aspects of contemporary Salafism as well as in of the ways of 

In [4]:
task_prompt = """
Your task is to extract bibliographic metadata from the given publication.
Use Dublin Core metadata scheme. You will first be provided examples of the
publications and their metadata. Give only the metadata, no explanations.

"""
examples_prompt = ""
for ind, ex in enumerate(examples[:5]):
    examples_prompt += f"This is example document number {ind}:\n"
    examples_prompt += ex[1]
    examples_prompt += "\nThis the bibliographic metadata of example document number {ind}:\n"
    examples_prompt += ex[0]
    examples_prompt += "\n---\n"


system_prompt = task_prompt + examples_prompt
print(system_prompt)


Your task is to extract bibliographic metadata from the given publication.
Use Dublin Core metadata scheme. You will first be provided examples of the
publications and their metadata. Give only the metadata, no explanations.

This is example document number 0:
creationDate: D:20200302142610+02'00'
modDate: D:20200302142632+02'00'

Please note! This is a self-archived version of the original article.
Huom! Tämä on rinnakkaistallenne.
To cite this Article / Käytä viittauksessa alkuperäistä lähdettä:
Antila, H. & Lähteenmäki, E. (2019) Virtain metsä- ja liiketalouden opiskelijat pohtivat yhteistyömahdollisuuksia. TAMK-blogi, 12.11.2019.
URL: https://blogs.tuni.fi/tamkblogi/teema2/virtain-metsa-ja-liiketalouden-opiskelijat-pohtivat- yhteistyomahdollisuuksia/
TAMPEREEN AMMATTIKORKEAKOULU
Kuntokatu 3, 33520 Tampere www.tuni.fi/tamk | p. 0294 5222
Virtain metsä- ja liiketalouden opiskelijat pohtivat yhteistyömahdollisuuksia 12.11.2019 — antilhe “Jos me esittelisimme messuilla metsätaloutta -

In [5]:
# Try out the model on a random test set record

import random

def get_completions(text):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        temperature=0, max_tokens=700)
    return response.choices[0].message.content.strip()


dataset_test_files = glob.glob("../../llm-dataset/*-test.jsonl")
test_set_file = random.choice(dataset_test_files)

with open(test_set_file) as testfile:
    records = [json.loads(line) for line in testfile]
rec = random.choice(records)

print(f"Testing on {rec['id']} with PDF {rec['url']}")
print("---")
print("Curated metadata:")
print(rec["metadata"])
print("---")
print("Generated metadata:")
print(get_completions(rec["text"]))

Testing on https://taju.uniarts.fi/handle/10024/6500 with PDF https://taju.uniarts.fi/bitstream/handle/10024/6500/nbnfife201603098692.pdf
---
Curated metadata:
Author: Korhonen-Björkman, Heidi
Organization: Taideyliopiston Sibelius-Akatemia
Issued: 2016
ISBN (online): 978-952-68347-2-6
URN: URN:ISBN:978-952-68347-2-6
Language: swe
Publisher: Taideyliopiston Sibelius-Akatemia
ISBN (printed): 978-952-68347-1-9
Series name: Acta Musicologica Fennica
Number in series: 33
ISSN (printed): 0587-2448
Degree program: MuTri
Title: Musikerröster i Betsy Jolas musik : dialoger och spelerfarenheter i analys
COAR type: doctoral thesis
OKM type: G4 Monografiaväitöskirja
Thesis level: Monografiaväitöskirja
---
Generated metadata:


Author: Korhonen-Björkman, Heidi
Organization: Sibelius-Akademin vid Konstuniversitetet
Issued: 2016
ISBN (printed): 978-952-68347-1-9
ISBN (online): 978-952-68347-2-6
URN: URN:ISBN:978-952-68347-2-6
Language: swe
Publisher: Juvenes Print
ISSN (printed): 0587-2448
Series name: Acta Musicologica Fennica
Number in series: 33
Series year: 2016
Title: MUSIKERRÖSTER I BETSY JOLAS MUSIK – dialoger och spelerfarenheter i analys
COAR type: doctoral thesis
OKM type: G5 Doctoral dissertation (article)


In [6]:
%%time

import os.path

for test_file in dataset_test_files:
    output_file = "gpt3-" + os.path.basename(test_file)
    print(f"generating metadata for {test_file} into {output_file}")
    nrec = 0
    with open(test_file) as infile, open(output_file, "w") as outfile:
        for line in infile:
            rec = json.loads(line)
            generated_metadata = get_completions(rec["text"])
            outrec = {"id": rec["id"], "url": rec["url"], "ground_truth": rec["metadata"], "prediction": generated_metadata}
            json.dump(outrec, outfile)
            outfile.write("\n")
            nrec += 1
    print(f"completed {nrec} records")
    print()

generating metadata for ../../llm-dataset/mono-swe-test.jsonl into gpt3-mono-swe-test.jsonl


completed 8 records

generating metadata for ../../llm-dataset/docthes-swe-test.jsonl into gpt3-docthes-swe-test.jsonl
completed 5 records

generating metadata for ../../llm-dataset/thes-eng-test.jsonl into gpt3-thes-eng-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/thes-swe-test.jsonl into gpt3-thes-swe-test.jsonl
completed 16 records

generating metadata for ../../llm-dataset/thes-fin-test.jsonl into gpt3-thes-fin-test.jsonl
completed 21 records

generating metadata for ../../llm-dataset/docthes-fin-test.jsonl into gpt3-docthes-fin-test.jsonl
completed 9 records

generating metadata for ../../llm-dataset/serial-fin-test.jsonl into gpt3-serial-fin-test.jsonl
completed 18 records

generating metadata for ../../llm-dataset/serial-swe-test.jsonl into gpt3-serial-swe-test.jsonl
completed 14 records

generating metadata for ../../llm-dataset/mono-fin-test.jsonl into gpt3-mono-fin-test.jsonl
completed 17 records

generating metadata for ../../llm-dataset/serial-

In [7]:
# Convert the results FinGreyLit dataschema and save to file

from glob import glob
import json


records =[]
prediction_records_files = glob("gpt3-*.jsonl")

KEYS_MAP = {
    "Contributor":		"dc.contributor",
    "Author":		    "dc.contributor.author",
    "Supervisor":		"dc.contributor.degreeSupervisor",
    "Department":		"dc.contributor.department",
    "Editor":		    "dc.contributor.editor",
    "Faculty":		    "dc.contributor.faculty",
    "Opponent":		    "dc.contributor.opponent",
    "Organization":		"dc.contributor.organization",
    "Org. unit":		"dc.contributor.orgunit",
    "Reviewer":		    "dc.contributor.reviewer",
    "Issued":		    "dc.date.issued",
    "extent":		    "dc.format.extent",
    "Page range":		"dc.format.pagerange",
    "ISBN (printed)":	"dc.identifier.isbn",
    "ISBN (online)":	"dc.identifier.isbn",
    "URN":		        "dc.identifier.urn",
    "Language":		    "dc.language.iso",
    "Publisher":		"dc.publisher",
    "Publisher (online)":"dc.publisher",
    "Contractor":		"dc.relation.contractor",
    "DOI":		        "dc.relation.doi",
    "ISSN (online)":	"dc.relation.eissn",
    "risbn":		    "dc.relation.isbn",
    "Journal name":	"reladc.tion.ispartofjournal",
    "Series name":		"dc.relation.ispartofseries",
    "Issue":		    "dc.relation.issue",
    "Number in series":	"dc.relationnumberinseries",
    "ISSN (printed)":	"dc.relation.pissn",
    "Volume":		    "dc.relation.volume",
    "Series year":		"dc.series.year",
    "Degree program":	"dc.subject.degreeprogram",
    "Discipline":		"dc.subject.discipline",
    "Title":		    "dc.title",
    "Alternative title":"dc.title.alternative",
    "COAR type":		"dc.type.coar",
    "OKM type":		    "dc.type.okm",
    "Thesis level":		"dc.type.ontasot",
}

LIST_FIELDS = [
    "dc.contributor.author",
    "dc.identifier.isbn",
    "dc.relation.isbn",
    "dc.publisher",
]

def convert_to_scheme(metadata_str):
    field_lines = metadata_str.split('\n')
    out = dict.fromkeys(KEYS_MAP.values())  # ensure keys exist

    for fl in field_lines:
        try:
            key, value = fl.split(":", maxsplit=1)
            dc_key = KEYS_MAP[key.strip()]
        except (KeyError, ValueError):
            print(f"Invalid line: {fl}")
            continue
        value = value.strip()
        if dc_key in LIST_FIELDS:
            if out[dc_key] is None:
                out[dc_key] = []
            out[dc_key].append(value)
        else:
            out[dc_key] = value
    return out

prediction_records = []
for rec_file in prediction_records_files:
    print(rec_file)
    doctype = rec_file.split("-")[1]
    with open(rec_file, "rt") as rf:
        for line in rf:
            rec_in = json.loads(line)
            rec_out = {
                "rowid": rec_in["id"],
                "url": rec_in["url"],
                "doctype": doctype,
                }
            rec_out["ground_truth"] = convert_to_scheme(rec_in["ground_truth"])
            rec_out["prediction"] = convert_to_scheme(rec_in["prediction"])
            prediction_records.append(rec_out)


# write output to JSONL file
with open('test-records-' + model + '.jsonl', 'w') as outfile:
    for rec in prediction_records:
        json.dump(rec, outfile)
        outfile.write("\n")


gpt3-thes-swe-test.jsonl
Invalid line: Advisor: Österbacka, Eva
Invalid line: Advisor: Skoog, Matti
Invalid line: Degree programme: Business Information Technology
Invalid line: Academic Supervisor: Wik, Thor-Björn
Invalid line: Study program: Business Administration
Invalid line: Academic Supervisor: Kullman, Christer
Invalid line: Academic Supervisor: Ulenius, Katarina
Invalid line: Degree Programme: Bachelor of Health Care
Invalid line: Academic Supervisor: Malmberg, Bengt
gpt3-mono-eng-test.jsonl
Invalid line: This the bibliographic metadata of example document number {ind}:
Invalid line: This the bibliographic metadata of example document number {ind}:
Invalid line: This the bibliographic metadata of example document number {ind}:
Invalid line: Conference name: ARTECH 2021
Invalid line: Conference date: October 13–15, 2021
Invalid line: Conference place: Aveiro, Portugal
Invalid line: This the bibliographic metadata of example document number {ind}:
gpt3-docthes-swe-test.jsonl
gpt