In [1]:
"""Basic utilities module"""
import requests
import csv
import re
from io import StringIO
import pandas as pd


def request_ct(url):
    """Performs a get request that provides a (somewhat) useful error message."""
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.HTTPError as ex:
        raise ex
    except ImportError:
        raise ImportError(
            "Couldn't retrieve the data, check your search expression or try again later."
        )
    else:
        return response

def json_handler(url):
    """Returns request in JSON (dict) format"""
    return request_ct(url).json()

def csv_handler(url):
    """Returns request in CSV (list of records) format"""

    response = request_ct(url)
    decoded_content = response.content.decode("utf-8")

    cr = csv.reader(decoded_content.splitlines(), delimiter=",")
    records = list(cr)

    return records

filepath = 'src/pytrialsV2/study_fields.csv'

def study_fields(file_path):
    """List of all study fields you can use in your query."""

    csv_fields = []
    json_fields = []
    with open(file_path, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            csv_fields.append(row["Column Name"])
            json_fields.append(row["Included Data Fields"].split("|"))

    return {
        "csv": csv_fields,
        "json": [item for sublist in json_fields for item in sublist],
    }

study_fields("src/pytrialsV2/study_fields.csv")

def get_full_studies(search_expr, max_studies=50, fmt="json"):
    if fmt == "json":
        format = "format=json"
        handler = json_handler
    elif fmt == "csv":
        format = "format=csv"
        handler = csv_handler
    else:
        raise ValueError("Format argument has to be 'json")

    if max_studies < 1:
        raise ValueError("The number of studies can only be greater than 0")

    all_studies = []
    pageToken = None
    while len(all_studies) < max_studies:
        req = f"studies?{format}&markupFormat=legacy&query.term={search_expr}&pageSize={max_studies}"
        if pageToken:
            req += f"&pageToken={pageToken}"
        if fmt == "json":
            response = json_handler(f"https://clinicaltrials.gov/api/v2/{req}")
            full_studies = response['studies']
            if 'nextPageToken' in response:
                pageToken = response['nextPageToken']
        else:  # fmt == "csv"
            response = requests.get(f"https://clinicaltrials.gov/api/v2/{req}")
            csv_reader = csv.reader(StringIO(response.text))
            full_studies = list(csv_reader)
            pageToken = response.headers.get('x-next-page-token')
        all_studies.extend(full_studies)
        if not pageToken:
            break
    return all_studies[:max_studies]

def get_study_fields(search_expr, fields, max_studies=50, fmt="csv"):
    if fmt == "json":
        format = "format=json"
        handler = json_handler
    elif fmt == "csv":
        format = "format=csv"
        handler = csv_handler
    else:
        raise ValueError("Format argument has to be either 'csv' or 'json")

    if not set(fields).issubset(study_fields("src/pytrialsV2/study_fields.csv")[fmt]):
        raise ValueError(
            "One of the fields is not valid!"
            "Check the study_fields attribute for a list of valid ones."
            "They are different depending on the return format, json or csv."
        )

    all_studies = []
    pageToken = None
    while len(all_studies) < max_studies:
        concat_fields = "|".join(fields)
        req = f"&query.term={search_expr}&markupFormat=legacy&fields={concat_fields}&pageSize={max_studies}"
        if pageToken:
            req += f"&pageToken={pageToken}"
        url = f"https://clinicaltrials.gov/api/v2/studies?{format}{req}"
        response = handler(url)
        if fmt == "json":
            response = json_handler(f"https://clinicaltrials.gov/api/v2/{format}{req}")
            full_studies = response['studies']
            if 'nextPageToken' in response:
                pageToken = response['nextPageToken']
        else:  # fmt == "csv"
            response = requests.get(f"https://clinicaltrials.gov/api/v2/{format}{req}")
            csv_reader = csv.reader(StringIO(response.text))
            full_studies = list(csv_reader)
            pageToken = response.headers.get('x-next-page-token')
        all_studies.extend(full_studies)
        if not pageToken:
            break

    return all_studies[:max_studies]




### Pull all data for all trials in last 5 years

In [18]:
import pandas as pd
from datetime import datetime, timedelta

# Calculate the start date (one year ago)
start_date = (datetime.now() - timedelta(days=int(1))).strftime('%Y-%m-%d')

today = datetime.now().strftime('%Y-%m-%d')

last_five_years = get_full_studies(search_expr=f"AREA[StartDate]RANGE[{start_date}, {today}]", max_studies=500000, fmt="json")

df = pd.DataFrame.from_records(last_five_years[1:], columns=last_five_years[0])

print(df.columns)

df.head()

Index(['protocolSection', 'derivedSection', 'hasResults'], dtype='object')


Unnamed: 0,protocolSection,derivedSection,hasResults
0,{'identificationModule': {'nctId': 'NCT0643567...,{'miscInfoModule': {'versionHolder': '2024-05-...,False
1,{'identificationModule': {'nctId': 'NCT0610027...,{'miscInfoModule': {'versionHolder': '2024-05-...,False
2,{'identificationModule': {'nctId': 'NCT0621808...,{'miscInfoModule': {'versionHolder': '2024-05-...,False
3,{'identificationModule': {'nctId': 'NCT0643107...,{'miscInfoModule': {'versionHolder': '2024-05-...,False
4,{'identificationModule': {'nctId': 'NCT0627776...,{'miscInfoModule': {'versionHolder': '2024-05-...,False


In [19]:
last_five_years

[{'protocolSection': {'identificationModule': {'nctId': 'NCT06432127',
    'orgStudyIdInfo': {'id': '256/2024'},
    'organization': {'fullName': 'Mahidol University', 'class': 'OTHER'},
    'briefTitle': 'Role of Ultrasound Guide Greater Occipital Nerve Block at Second Cervical Vertebra in Migraine Headache Prophylaxis',
    'officialTitle': 'Role of Ultrasound Guide Greater Occipital Nerve Block at Second Cervical Vertebra in Migraine Headache Prophylaxis in Patients With Failed Oral Prophylaxis Medication : A Randomized Controlled Study in Thailand'},
   'statusModule': {'statusVerifiedDate': '2024-05',
    'overallStatus': 'NOT_YET_RECRUITING',
    'expandedAccessInfo': {'hasExpandedAccess': False},
    'startDateStruct': {'date': '2024-05-30', 'type': 'ESTIMATED'},
    'primaryCompletionDateStruct': {'date': '2026-01', 'type': 'ESTIMATED'},
    'completionDateStruct': {'date': '2027-01', 'type': 'ESTIMATED'},
    'studyFirstSubmitDate': '2024-05-15',
    'studyFirstSubmitQcDate': 

Given the comprehensive list of fields available in the ClinicalTrials.gov dataset, we can derive valuable insights for a competitor benchmark analysis. We'll focus on key areas like:

- Sponsor Activity: Identify the most active sponsors, their areas of focus (therapeutic areas), and trends in their trial initiations over time.
- Study Designs and Phases: Analyze the distribution of study designs (e.g., randomized, observational) and phases (Phase 1, 2, 3) to understand industry trends and competitor strategies.
- Therapeutic Areas: Determine the most common therapeutic areas being investigated, highlighting potential areas of high competition or unmet medical needs.
- Collaboration Patterns: Examine collaborations between sponsors and other organizations (e.g., academic institutions) to identify potential partnership opportunities or competitive alliances.
- Geographic Distribution: Analyze the geographic locations of trial sites to understand where competitors are focusing their research efforts.

First it is crucial to identify what is meant by "Top Sponsor"

In the context of this task, "top sponsors" likely refers to the organizations or entities that have initiated (sponsored) the highest number of clinical studies/trials within the specified time frame (the last 5 years).  This metric could be used to identify the most active players in the clinical research landscape, and the insights gained could be valuable for a competitor benchmark analysis.

However, there could be some ambiguity in this term. Depending on the specific goals of the competitor benchmark analysis, "top sponsors" could also be interpreted in other ways:

- Sponsors with the largest trials: Instead of focusing on the number of trials, this interpretation would prioritize sponsors that have initiated trials with the largest number of participants. This could be a relevant metric if the goal is to understand which sponsors are investing in the most extensive research efforts.
- Sponsors with the most successful trials: This interpretation would focus on sponsors whose trials have achieved the most positive outcomes, such as FDA approval or publication in major medical journals. This could be a valuable metric if the goal is to identify sponsors with a track record of success in clinical research.

Study Populations

Therefore we identify 3 study populations:

- a) Sponsors with the highest number of trials
- b) Sponsors with the largest trials
- c) Sponsors with the most successful trials
