In [1]:
import pandas as pd
import numpy as np
import os
import json

### Load new records

In [2]:
json_data = pd.read_json('nde_hub_src.clinepidb.json')

In [3]:
json_data.rename(columns={'name': 'Name', 'description': 'Description'}, inplace=True)

In [4]:
data = pd.read_csv('../datasets/nde_prod/ClinEpiDB.csv', lineterminator='\n')

In [5]:
data['_id'] = data['_id'].str.lower()

In [6]:
new_ids = json_data[~json_data['_id'].isin(data['_id'])]
new_ids

Unnamed: 0,_id,Name,Description
2,clinepidb_ds_03d6476c6d,NECTAR2 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
9,clinepidb_ds_17191d35b9,PRISM2 ICEMR Border Cohort,<![CDATA[\n <b>Related studies:</b> \n ...
13,clinepidb_ds_24e941b30f,PERCH Thailand Case Control,Nine sites in seven countries participated in ...
15,clinepidb_ds_2cf32ab2f0,PERCH Kenya Case Control,Nine sites in seven countries participated in ...
28,clinepidb_ds_59bedf2966,PERCH The Gambia Case Control,Nine sites in seven countries participated in ...
30,clinepidb_ds_624583e93e,LLINEUP2 Cluster Randomized Trial,<b>Related studies:</b> \n <blockquote>\n ...
36,clinepidb_ds_7b47eb7469,NECTAR1 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
39,clinepidb_ds_861ea66681,NECTAR4 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
40,clinepidb_ds_86424e44ce,PERCH South Africa Case Control,Nine sites in seven countries participated in ...
46,clinepidb_ds_a83d9defbb,Southern Africa ICEMR Cross-sectional,<b>Objectives:</b>\n <p />\n <blockq...


In [16]:
system_prompt = """
Generate a summary of a dataset entry based on its provided name and description. The summary should be concise, with a maximum of 170 words, and include a separate section with 1-2 sentences describing the method and experimental conditions used, if such details are available.

# Steps

    Identify Key Information: Extract the essential details from the dataset's name and description. Focus on the purpose, contents, and any significant features of the dataset.

    Describe Methods and Conditions (separate section): If available, include a brief description of the methods used to collect data and any relevant experimental conditions.

    Compose Summary: Organize the extracted information into a cohesive summary, ensuring it remains within the 170-word limit.

# Output Format

    A paragraph of up to 170 words.
    A separate section for methods and experimental conditions if available.

# Examples

Input:

    Name: "Urban Traffic Flow Dataset"
    Description: "This dataset includes hourly traffic flows recorded from multiple sensors installed in the city's main roads. Data were collected over a year, capturing peak and off-peak hours. Used for urban planning and congestion management."

Output: 

Methods and Experimental Conditions:
Data collection methods involve the use of state-of-the-art sensors strategically placed to ensure comprehensive coverage and accuracy.

Summary:
The Urban Traffic Flow Dataset comprises data from multiple sensors installed across the city's main roads, recording traffic flows hourly. It covers a full year, capturing patterns during both peak and off-peak hours. This dataset is particularly useful for urban planning and congestion management efforts.

Note: Real examples should include more detailed technical or methodological information if available, while keeping within the word limit.
"""

In [17]:
template = """
Name: \"{name}\"
Description: \"{description}\"
"""

In [18]:
queries = []
for idx, row in new_ids.iterrows():
    
    prompt = template.format(name=row['Name'], description=row['Description'])

    query = {
        "custom_id": f"request-{idx}-{row['_id']}", 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4o-mini", 
            "messages": [{"role": "system", "content": system_prompt},{"role": "user", "content": prompt}],
            "max_tokens": 2048,
        }
    }

    queries.append(query)

In [19]:
queries[0]

{'custom_id': 'request-2-clinepidb_ds_03d6476c6d',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4o-mini',
  'messages': [{'role': 'system',
    'content': '\nGenerate a summary of a dataset entry based on its provided name and description. The summary should be concise, with a maximum of 170 words, and include a separate section with 1-2 sentences describing the method and experimental conditions used, if such details are available.\n\n# Steps\n\n    Identify Key Information: Extract the essential details from the dataset\'s name and description. Focus on the purpose, contents, and any significant features of the dataset.\n\n    Describe Methods and Conditions (separate section): If available, include a brief description of the methods used to collect data and any relevant experimental conditions.\n\n    Compose Summary: Organize the extracted information into a cohesive summary, ensuring it remains within the 170-word limit.\n\n# Output Format\n\n    A pa

In [22]:
import math
num_files = math.ceil(len(queries) / 50000)

# Split the queries into multiple lists
query_lists = [queries[i:i+50000] for i in range(0, len(queries), 50000)]

# Write each list of queries to a separate file
for i, query_list in enumerate(query_lists):
    filename = f'summary_queries/queries_{i}.jsonl'
    with open(filename, 'w') as f:
        for query in query_list:
            json.dump(query, f)
            f.write('\n')


In [27]:
# Check if number of requessts are within the limit

In [23]:
# Check if token count is within the maximum limit

import tiktoken
enc = tiktoken.get_encoding("o200k_base")
tokens = []

for query in queries:
    content = query['body']['messages'][1]['content']
    token_count = len(enc.encode(content))
    tokens.append(token_count)

print(sum(tokens), max(tokens), sum(tokens)/len(tokens), sep='\n')

31899
3663
2126.6


In [24]:
## Batch API pricing

cost = (sum(tokens) / 1000000) * 0.075

print(f'Total Cost: ${cost}')

Total Cost: $0.0023924249999999997


In [8]:
from openai import OpenAI

In [9]:
with open("../config.json", "r") as config_file:
    config = json.load(config_file)

openai_api_key = config["api_keys"]["openai"]

client = OpenAI(api_key=openai_api_key)

In [27]:
filename = f'summary_queries/queries_0.jsonl'

batch_input_file = client.files.create(
  file=open(filename, "rb"),
  purpose="batch"
)

In [28]:
batch_input_file_id = batch_input_file.id

batch_info = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "NDE GPT Summary Generation (ClinEpiDB)"
    }
)

print(batch_info)

Batch(id='batch_671024538954819080c51cca701b6458', completion_window='24h', created_at=1729111123, endpoint='/v1/chat/completions', input_file_id='file-PG9wNlx2rJRAezm0rBDm24Y4', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729197523, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'NDE GPT Summary Generation (ClinEpiDB)'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [29]:
print(f"Batch ID: {batch_info.id}")
print(f"File ID: {batch_info.input_file_id}")

Batch ID: batch_671024538954819080c51cca701b6458
File ID: file-PG9wNlx2rJRAezm0rBDm24Y4


In [10]:
# Retrieve results

out_files = []
for batch in ['batch_671024538954819080c51cca701b6458']:
    out_file = client.batches.retrieve(batch).output_file_id
    out_files.append(out_file)

In [11]:
full_response = ''
for out_file in out_files:
    file_response = client.files.content(out_file)
    print(file_response.text)
    full_response += file_response.text

{"id": "batch_req_671026d8b8d88190be87cfb32593b6e6", "custom_id": "request-2-clinepidb_ds_03d6476c6d", "response": {"status_code": 200, "request_id": "6ff925ddfff1e8c780ebdce27b216d9e", "body": {"id": "chatcmpl-AJ55u5fl9NjgvqGZs02NMgyJfY1Q4", "object": "chat.completion", "created": 1729111178, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "**Methods and Experimental Conditions:**  \nThe NECTAR2 trial was a four-arm, single-blind, phase 2 randomized controlled trial conducted between October 29 and December 23, 2020, in Ouelessebougou and the Malaria Research and Training Centre in Bamako, Mali. The study involved a thorough eligibility screening of 1,091 individuals, with data collected on clinical and parasitological parameters at multiple follow-up points up to 28 days post-treatment.\n\n**Summary:**  \nThe NECTAR2 Randomized Controlled Trial investigated the efficacy and safety of single low doses of tafenoquine combined with

In [12]:
file_lines = full_response.split('\n')[:-1]
print('Number of outputs:', len(file_lines))

Number of outputs: 15


In [13]:
json_lines = [json.loads(line) for line in file_lines]

In [27]:
responses = []

for line in json_lines:
    model_out = line['response']['body']['choices'][0]['message']['content']
    _id = line['custom_id'].split('-')[-1]
    responses.append((_id, model_out))


In [28]:
responses

[('clinepidb_ds_03d6476c6d',
  '**Methods and Experimental Conditions:**  \nThe NECTAR2 trial was a four-arm, single-blind, phase 2 randomized controlled trial conducted between October 29 and December 23, 2020, in Ouelessebougou and the Malaria Research and Training Centre in Bamako, Mali. The study involved a thorough eligibility screening of 1,091 individuals, with data collected on clinical and parasitological parameters at multiple follow-up points up to 28 days post-treatment.\n\n**Summary:**  \nThe NECTAR2 Randomized Controlled Trial investigated the efficacy and safety of single low doses of tafenoquine combined with dihydroartemisinin-piperaquine to reduce gametocyte density and subsequent transmission of *Plasmodium falciparum* in a cohort of children and adults in Mali. The primary objective focused on assessing changes in infectivity 48 hours after treatment, while secondary objectives included evaluating safety, gametocyte carriage, and infectivity at various time points. 

In [37]:
df = pd.DataFrame(responses, columns=['_id', 'Summary'])

In [38]:
df.to_csv('clinepidb_missing_summaries.csv', index=False)

In [35]:
df = pd.read_csv('clinepidb_missing_summaries.csv')

In [41]:
# merged_df = df.merge(data[['Name', 'Description', '_id']], on='_id', how='left')
merged_df = df.merge(new_ids[['Name', 'Description', '_id']], on='_id', how='left')

In [42]:
merged_df.to_csv('clinepidb_missing_summaries.csv', index=False)

In [43]:
merged_df.iloc[1]['Name']

'PRISM2 ICEMR Border Cohort'

In [44]:
merged_df.iloc[1]['Description']

'<![CDATA[\n      <b>Related studies:</b> \n      <blockquote>\n      <ul>\n      <li><a target="_blank" href="https://clinepidb.org/ce/app/workspace/analyses/DS_51b40fe2e2/new">PRISM2 ICEMR Cohort</a></li>\n      <li><a target=_blank href="https://clinepidb.org/ce/app/workspace/analyses/DS_0ad509829e/new">PRISM ICEMR Cohort</a></li>\n      </ul>\n      </blockquote>\n      <p />\n      <b>Background:</b> \n      <p />\n      Uganda is emblematic of other high burden African countries, where progress in reducing the burden of malaria has been slow and difficult to sustain despite the scale up of proven control\n      interventions. Uganda was the first country to implement universal distribution of free long lasting insecticidal nets (LLINs) starting in 2014, with repeated campaigns every 3-4 years. Uganda \n      also has one of the largest indoor residual spraying of insecticide (IRS) programs, focusing on selected high transmission districts using different formulations of insectici

In [45]:
merged_df = pd.read_csv('clinepidb_missing_summaries.csv')

In [46]:
merged_df.head()


Unnamed: 0,_id,Summary,Name,Description
0,clinepidb_ds_03d6476c6d,**Methods and Experimental Conditions:** \nTh...,NECTAR2 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
1,clinepidb_ds_17191d35b9,Methods and Experimental Conditions:\nThe PRIS...,PRISM2 ICEMR Border Cohort,<![CDATA[\n <b>Related studies:</b> \n ...
2,clinepidb_ds_24e941b30f,Methods and Experimental Conditions: \nThe PE...,PERCH Thailand Case Control,Nine sites in seven countries participated in ...
3,clinepidb_ds_2cf32ab2f0,Methods and Experimental Conditions:\nData for...,PERCH Kenya Case Control,Nine sites in seven countries participated in ...
4,clinepidb_ds_59bedf2966,Methods and Experimental Conditions:\nThe data...,PERCH The Gambia Case Control,Nine sites in seven countries participated in ...


In [47]:
merged_df['Summary'] = merged_df['Summary'].str.replace('[*#]', '', regex=True)

In [48]:
merged_df

Unnamed: 0,_id,Summary,Name,Description
0,clinepidb_ds_03d6476c6d,Methods and Experimental Conditions: \nThe NE...,NECTAR2 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
1,clinepidb_ds_17191d35b9,Methods and Experimental Conditions:\nThe PRIS...,PRISM2 ICEMR Border Cohort,<![CDATA[\n <b>Related studies:</b> \n ...
2,clinepidb_ds_24e941b30f,Methods and Experimental Conditions: \nThe PE...,PERCH Thailand Case Control,Nine sites in seven countries participated in ...
3,clinepidb_ds_2cf32ab2f0,Methods and Experimental Conditions:\nData for...,PERCH Kenya Case Control,Nine sites in seven countries participated in ...
4,clinepidb_ds_59bedf2966,Methods and Experimental Conditions:\nThe data...,PERCH The Gambia Case Control,Nine sites in seven countries participated in ...
5,clinepidb_ds_624583e93e,Methods and Experimental Conditions: \nThe LL...,LLINEUP2 Cluster Randomized Trial,<b>Related studies:</b> \n <blockquote>\n ...
6,clinepidb_ds_7b47eb7469,Methods and Experimental Conditions:\nThe NECT...,NECTAR1 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
7,clinepidb_ds_861ea66681,Methods and Experimental Conditions:\nThe NECT...,NECTAR4 Randomized Controlled Trial,<b>Related studies:</b> \n <blockquote>\n ...
8,clinepidb_ds_86424e44ce,Methods and Experimental Conditions: \nThe PE...,PERCH South Africa Case Control,Nine sites in seven countries participated in ...
9,clinepidb_ds_a83d9defbb,Methods and Experimental Conditions:\nThe stud...,Southern Africa ICEMR Cross-sectional,<b>Objectives:</b>\n <p />\n <blockq...


In [49]:
import re

pattern = r'(Methods and Experimental Conditions:|Summary:)'

sections = re.split(pattern, merged_df.iloc[1]['Summary'])

sections = [section.strip() for section in sections if section.strip()]

content_sections = sections[1::2]

summary = content_sections[1] + ' ' + content_sections[0]

print(f"{summary}\n")

The PRISM2 ICEMR Border Cohort aims to investigate the resurgence of malaria in Tororo District, Uganda, following previous interventions with long-lasting insecticidal nets (LLINs) and indoor residual spraying (IRS). The study enrolled 667 residents and compared malaria incidence and prevalence between September 2020 and September 2023. Factors such as the impact of the shift back to organophosphate IRS (Actellic) in March 2023 were also evaluated. Initial data highlighted significant decreases in malaria metrics from prior interventions, but a resurgence was noted in 2020. This cohort study sheds light on the complex dynamics of malaria transmission and vector control efficacy in high-burden areas. The PRISM2 ICEMR Border Cohort is a dynamic prospective cohort study conducted in Tororo and Busia Districts, Uganda, from August 2020 to September 2023. Eligibility criteria included households with at least two members under five years of age, and participants underwent regular medical e

In [50]:
def process_summary(summary):
    pattern = r'(Methods and Experimental Conditions:|Summary:)'
    sections = re.split(pattern, summary)
    sections = [section.strip() for section in sections if section.strip()]
    content_sections = sections[1::2]
    if len(content_sections) == 2:
        return content_sections[1] + ' ' + content_sections[0]
    return summary  # Return the original summary if the expected sections are not found

# Apply the function to the entire Summary column
merged_df['Processed Summary'] = merged_df['Summary'].apply(process_summary)

# Print the DataFrame with the new column
merged_df[['Summary', 'Processed Summary']]

Unnamed: 0,Summary,Processed Summary
0,Methods and Experimental Conditions: \nThe NE...,The NECTAR2 Randomized Controlled Trial invest...
1,Methods and Experimental Conditions:\nThe PRIS...,The PRISM2 ICEMR Border Cohort aims to investi...
2,Methods and Experimental Conditions: \nThe PE...,The PERCH Thailand Case Control dataset encomp...
3,Methods and Experimental Conditions:\nData for...,"The ""PERCH Kenya Case Control"" dataset feature..."
4,Methods and Experimental Conditions:\nThe data...,"The ""PERCH The Gambia Case Control"" dataset pr..."
5,Methods and Experimental Conditions: \nThe LL...,The LLINEUP2 Cluster Randomized Trial aims to ...
6,Methods and Experimental Conditions:\nThe NECT...,The NECTAR1 trial investigated the effectivene...
7,Methods and Experimental Conditions:\nThe NECT...,The NECTAR4 Randomized Controlled Trial aimed ...
8,Methods and Experimental Conditions: \nThe PE...,The PERCH South Africa Case Control dataset in...
9,Methods and Experimental Conditions:\nThe stud...,"The ""Southern Africa ICEMR Cross-sectional"" da..."


In [51]:
merged_df.to_csv('clinepidb_missing_summaries_processed.csv', index=False)

In [52]:
print(merged_df.iloc[1]['Processed Summary'])

The PRISM2 ICEMR Border Cohort aims to investigate the resurgence of malaria in Tororo District, Uganda, following previous interventions with long-lasting insecticidal nets (LLINs) and indoor residual spraying (IRS). The study enrolled 667 residents and compared malaria incidence and prevalence between September 2020 and September 2023. Factors such as the impact of the shift back to organophosphate IRS (Actellic) in March 2023 were also evaluated. Initial data highlighted significant decreases in malaria metrics from prior interventions, but a resurgence was noted in 2020. This cohort study sheds light on the complex dynamics of malaria transmission and vector control efficacy in high-burden areas. The PRISM2 ICEMR Border Cohort is a dynamic prospective cohort study conducted in Tororo and Busia Districts, Uganda, from August 2020 to September 2023. Eligibility criteria included households with at least two members under five years of age, and participants underwent regular medical e