In [71]:
"""Basic utilities module"""
import requests
import csv
import re
from io import StringIO
import pandas as pd


def request_ct(url):
    """Performs a get request that provides a (somewhat) useful error message."""
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.HTTPError as ex:
        raise ex
    except ImportError:
        raise ImportError(
            "Couldn't retrieve the data, check your search expression or try again later."
        )
    else:
        return response

def json_handler(url):
    """Returns request in JSON (dict) format"""
    return request_ct(url).json()

def csv_handler(url):
    """Returns request in CSV (list of records) format"""

    response = request_ct(url)
    decoded_content = response.content.decode("utf-8")

    cr = csv.reader(decoded_content.splitlines(), delimiter=",")
    records = list(cr)

    return records

filepath = 'src/pytrialsV2/study_fields.csv'

def study_fields(file_path):
    """List of all study fields you can use in your query."""

    csv_fields = []
    json_fields = []
    with open(file_path, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            csv_fields.append(row["Column Name"])
            json_fields.append(row["Included Data Fields"].split("|"))

    return {
        "csv": csv_fields,
        "json": [item for sublist in json_fields for item in sublist],
    }

study_fields("src/pytrialsV2/study_fields.csv")

def get_full_studies(search_expr, max_studies=50, fmt="json"):
    if fmt == "json":
        format = "format=json"
        handler = json_handler
    elif fmt == "csv":
        format = "format=csv"
        handler = csv_handler
    else:
        raise ValueError("Format argument has to be 'json")

    if max_studies < 1:
        raise ValueError("The number of studies can only be greater than 0")

    all_studies = []
    pageToken = None
    while len(all_studies) < max_studies:
        req = f"studies?{format}&markupFormat=legacy&query.term={search_expr}&pageSize={max_studies}"
        if pageToken:
            req += f"&pageToken={pageToken}"
        if fmt == "json":
            response = json_handler(f"https://clinicaltrials.gov/api/v2/{req}")
            full_studies = response['studies']
            if 'nextPageToken' in response:
                pageToken = response['nextPageToken']
        else:  # fmt == "csv"
            response = requests.get(f"https://clinicaltrials.gov/api/v2/{req}")
            csv_reader = csv.reader(StringIO(response.text))
            full_studies = list(csv_reader)
            pageToken = response.headers.get('x-next-page-token')
        all_studies.extend(full_studies)
        if not pageToken:
            break
    return all_studies[:max_studies]

def get_study_fields(search_expr, fields, max_studies=50, fmt="csv"):
    if fmt == "json":
        format = "format=json"
        handler = json_handler
    elif fmt == "csv":
        format = "format=csv"
        handler = csv_handler
    else:
        raise ValueError("Format argument has to be either 'csv' or 'json")

    if not set(fields).issubset(study_fields("src/pytrialsV2/study_fields.csv")[fmt]):
        raise ValueError(
            "One of the fields is not valid!"
            "Check the study_fields attribute for a list of valid ones."
            "They are different depending on the return format, json or csv."
        )

    all_studies = []
    pageToken = None
    while len(all_studies) < max_studies:
        concat_fields = "|".join(fields)
        req = f"&query.term={search_expr}&markupFormat=legacy&fields={concat_fields}&pageSize={max_studies}"
        if pageToken:
            req += f"&pageToken={pageToken}"
        url = f"https://clinicaltrials.gov/api/v2/studies?{format}{req}"
        response = handler(url)
        if fmt == "json":
            response = json_handler(f"https://clinicaltrials.gov/api/v2/{req}")
            full_studies = response['studies']
            if 'nextPageToken' in response:
                pageToken = response['nextPageToken']
        else:  # fmt == "csv"
            response = requests.get(f"https://clinicaltrials.gov/api/v2/{req}")
            csv_reader = csv.reader(StringIO(response.text))
            full_studies = list(csv_reader)
            pageToken = response.headers.get('x-next-page-token')
        all_studies.extend(full_studies)
        if not pageToken:
            break

    return all_studies[:max_studies]


In [58]:
last_five_years = get_full_studies(search_expr="Coronavirus+COVID", max_studies=1001, fmt="csv")


In [57]:
import pandas as pd

df = pd.DataFrame.from_records(last_five_years[1:], columns=last_five_years[0])

print(df.columns)

df.head()

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents'],
      dtype='object')


Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,NCT05060991,Impact of Immunosuppression Adjustment on COVI...,https://clinicaltrials.gov/study/NCT05060991,ADIVKT,UNKNOWN,"Immunocompromised individuals, such as solid o...",NO,COVID-19|Immunosuppression|Vaccine Response Im...,DRUG: Reduction in antimetabolite immunosuppre...,Change in anti-SARS-CoV-2 IgG titer to SARS-Co...,...,Allocation: RANDOMIZED|Intervention Model: PAR...,1801321-1,2021-09-24,2022-12-24,2023-01-24,2021-09-29,,2022-02-08,"University of California, Davis, Sacramento, C...",
1,NCT04788394,Renal Involvement in Hospitalized Children Wit...,https://clinicaltrials.gov/study/NCT04788394,RIHCC,COMPLETED,Covid-19 is an important human and animal path...,NO,Acute Kidney Injury|Covid19|Renal Dysfunction|...,OTHER: data review|OTHER: data comparing,determine the prevalence of renal dysfunction ...,...,Observational Model: |Time Perspective: p,MRC01-21-089,2021-03-01,2021-12-31,2021-12-31,2021-03-09,,2023-07-25,"Hamad General corporation, Doha, 3050, Qatar",
2,NCT04404062,Serological and PCR Testing for COVID-19,https://clinicaltrials.gov/study/NCT04404062,,RECRUITING,Richmond Research Institute (RRI) is applying ...,NO,COVID-19,DIAGNOSTIC_TEST: Membrane-based immunoassay ki...,"Identification of carriers of SARS-CoV-2, To i...",...,Observational Model: |Time Perspective: p,C20010,2020-03-16,2024-12,2024-12,2020-05-27,,2023-03-30,"Richmond Pharmacology Ltd. 1a Newcomen St, Lon...",
3,NCT05752162,Perioperative Complications Linked to COVID-19...,https://clinicaltrials.gov/study/NCT05752162,,RECRUITING,A. Primary objective\n\nIdentifying risk facto...,NO,Postoperative Complications|COVID-19,,Identifying risk factors for peri-operative co...,...,Observational Model: |Time Perspective: p,20083/11.07.2022,2022-07-20,2022-07-30,2025-06-30,2023-03-02,,2023-03-02,"IUBCV Prof. Dr. CC Iliescu, Bucharest, Romania...",
4,NCT04929691,The CircumVent Project: A CPAP/O2 Helmet Solut...,https://clinicaltrials.gov/study/NCT04929691,CircumVent,UNKNOWN,The purpose of the CircumVent Project is to ev...,NO,SARS-CoV-2 Acute Respiratory Disease|COVID-19 ...,DEVICE: CPAP helmet|OTHER: Standard of care no...,"Respiratory rate, mild, 11-19/min; moderate, 2...",...,Allocation: NON_RANDOMIZED|Intervention Model:...,CDC Foundation Award 1085.1,2020-11-13,2021-12,2021-12,2021-06-18,,2021-06-23,"Nigerian Institute of Medical Research, Yaba, ...",


In [74]:
corona_fields = get_study_fields(search_expr="Coronavirus+COVID",
    fields=["NCT Number", "Conditions", "Study Title"],
    max_studies=1000,
    fmt="json")

ValueError: One of the fields is not valid!Check the study_fields attribute for a list of valid ones.They are different depending on the return format, json or csv.

In [81]:
corona_fields = get_study_fields(search_expr="Coronavirus+COVID",
    fields=['NCTId','BriefTitle','NCTId'],
    max_studies=1000,
    fmt="json")

HTTPError: 404 Client Error: Not Found for url: https://clinicaltrials.gov/api/v2/&query.term=Coronavirus+COVID&markupFormat=legacy&fields=NCTId%7CBriefTitle%7CNCTId&pageSize=1000

In [80]:
set(['NCTId','BriefTitle','NCTId']).issubset(study_fields("src/pytrialsV2/study_fields.csv")["json"])

True

In [79]:
study_fields("src/pytrialsV2/study_fields.csv")["json"]


['NCTId',
 'BriefTitle',
 'NCTId',
 'Acronym',
 'OverallStatus',
 'BriefSummary',
 'HasResults',
 'Condition',
 'InterventionType',
 'InterventionName',
 'PrimaryOutcomeMeasure',
 'PrimaryOutcomeDescription',
 'PrimaryOutcomeTimeFrame',
 'SecondaryOutcomeMeasure',
 'SecondaryOutcomeDescription',
 'SecondaryOutcomeTimeFrame',
 'OtherOutcomeMeasure',
 'OtherOutcomeDescription',
 'OtherOutcomeTimeFrame',
 'LeadSponsorName',
 'CollaboratorName',
 'Sex',
 'MinimumAge',
 'MaximumAge',
 'StdAge',
 'Phase',
 'EnrollmentCount',
 'LeadSponsorClass',
 'StudyType',
 'DesignAllocation',
 'DesignInterventionModel',
 'DesignMasking',
 'DesignWhoMasked',
 'DesignPrimaryPurpose',
 'OrgStudyId',
 'SecondaryId',
 'StartDate',
 'PrimaryCompletionDate',
 'CompletionDate',
 'StudyFirstPostDate',
 'ResultsFirstSubmitDate',
 'LastUpdatePostDate',
 'LocationFacility',
 'LocationCity',
 'LocationState',
 'LocationZip',
 'LocationCountry',
 'NCTId',
 'LargeDocLabel',
 'LargeDocFilename']

Given the comprehensive list of fields available in the ClinicalTrials.gov dataset, we can derive valuable insights for a competitor benchmark analysis. We'll focus on key areas like:

- Sponsor Activity: Identify the most active sponsors, their areas of focus (therapeutic areas), and trends in their trial initiations over time.
- Study Designs and Phases: Analyze the distribution of study designs (e.g., randomized, observational) and phases (Phase 1, 2, 3) to understand industry trends and competitor strategies.
- Therapeutic Areas: Determine the most common therapeutic areas being investigated, highlighting potential areas of high competition or unmet medical needs.
- Collaboration Patterns: Examine collaborations between sponsors and other organizations (e.g., academic institutions) to identify potential partnership opportunities or competitive alliances.
- Geographic Distribution: Analyze the geographic locations of trial sites to understand where competitors are focusing their research efforts.