In [37]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the other cells in this notebook read and write under
# "C:/Users/..." rather than under the mounted drive — confirm this is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import requests
import pandas as pd
from io import StringIO



In [2]:
def get_studies_with_nextPageToken(token, timeout=60):
  """Fetch one CSV page of studies from the ClinicalTrials.gov v2 API.

  The request always carries ``format=csv`` and ``aggFilters=docs:icf``.

  Args:
    token: Page token of the page to fetch. Pass "" to fetch the first page.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    On HTTP 200, ``(df, headers)``: the parsed page and the response headers.
    The first page is parsed with its header row; any other page is parsed
    with ``header=None`` and therefore has integer column labels.
    On any other status, the tuple ``("NO study fetched", "Empty")``.
  """
  url = "https://clinicaltrials.gov/api/v2/studies?format=csv&aggFilters=docs:icf"

  # Hand the token to requests via `params` so it is percent-encoded; gluing it
  # onto the URL would corrupt tokens containing characters such as '+' or '&'.
  params = {"pageToken": token} if token != "" else None

  # Without a timeout a stalled connection would block the notebook forever.
  result = requests.get(url, params=params, timeout=timeout)

  if result.status_code != 200:
    return "NO study fetched", "Empty"

  csv_text = StringIO(result.content.decode("utf-8"))
  if token == "":
    df = pd.read_csv(csv_text, sep=',')
  else:
    df = pd.read_csv(csv_text, sep=',', header=None)
  return df, result.headers


In [3]:
# Fetch the first page (empty token); its header row names the columns.
studies_with_icf, headers = get_studies_with_nextPageToken("")

# The fetch helper reports a non-200 response by returning plain strings instead
# of a DataFrame. Stop here with a clear message rather than letting a later
# cell fail with an unrelated AttributeError.
if isinstance(studies_with_icf, str):
  raise RuntimeError(f"Fetching the first page of studies failed: {studies_with_icf}")

studies_df = studies_with_icf

In [4]:
# Keep requesting pages for as long as the last response carried an
# 'x-next-page-token' header. Pages are gathered in a list and concatenated
# once at the end, instead of re-copying the growing frame on every iteration.
pages = [studies_df]
while 'x-next-page-token' in headers:
  studies_with_icf, headers = get_studies_with_nextPageToken(headers['x-next-page-token'])

  # A non-200 response comes back as plain strings, not a DataFrame.
  if isinstance(studies_with_icf, str):
    raise RuntimeError(f"Fetching a follow-up page of studies failed: {studies_with_icf}")

  # Give the follow-up page the same column names as the frame we started with.
  studies_with_icf.columns = pages[0].columns
  pages.append(studies_with_icf)

# Most recently fetched page first: this is the row order the previous
# per-iteration `pd.concat([new_page, studies_df])` produced.
studies_df = pd.concat(pages[::-1])



In [5]:
# Display the column labels of the combined studies frame.
studies_df.columns

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents'],
      dtype='object')

In [7]:
# Write the combined studies frame to CSV, leaving out the index column.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
studies_df.to_csv("C:/Users/Aman Varshney/Downloads/study_with_icf.csv", index=False)

In [8]:
# Display (rows, columns) of the combined studies frame.
studies_df.shape

(7863, 30)

In [4]:
# Load the studies table from the CSV file; this is the same path the
# `to_csv` cell of this notebook writes to.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
studies_df = pd.read_csv("C:/Users/Aman Varshney/Downloads/study_with_icf.csv")

In [7]:
# Display (rows, columns) of the frame loaded from CSV.
studies_df.shape

(7863, 30)

In [5]:
# Display the raw 'Study Documents' value at row position 910.
y = studies_df['Study Documents'].iloc[910]
y

'Study Protocol and Statistical Analysis Plan, https://storage.googleapis.com/ctgov2-large-docs/74/NCT02901574/Prot_SAP_001.pdf|Informed Consent Form, https://storage.googleapis.com/ctgov2-large-docs/74/NCT02901574/ICF_000.pdf'

In [8]:
def download_icf(url, nct_number, output_dir="C:/Users/Aman Varshney/Downloads/ICF/", timeout=60):
  """Download ``url`` and save the body as ``<nct_number>.pdf`` in ``output_dir``.

  Args:
    url: Address of the document to fetch; redirects are followed.
    nct_number: String used as the file name (".pdf" is appended).
    output_dir: Folder to write into; created when it does not exist yet.
      The default is the folder this function previously had hard-coded.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    True when the file was written. On any failure the caught exception
    object is returned (not raised), so callers must check the result.
  """
  import os

  try:
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    # Treat 4xx/5xx responses as failures so an error page is never stored
    # under a .pdf name and reported as a successful download.
    r.raise_for_status()

    os.makedirs(output_dir, exist_ok=True)
    # The context manager closes the file handle even when the write fails.
    with open(os.path.join(output_dir, nct_number + ".pdf"), 'wb') as pdf_file:
      pdf_file.write(r.content)

  except Exception as e:
    return e

  return True

In [9]:
def extract_url(text):
  """Return every URL found in ``text``, in order of appearance.

  Args:
    text: String to scan. Any non-string value (for example ``None`` or the
      float NaN pandas uses for an empty CSV cell) yields an empty list.

  Returns:
    A list of URL strings; empty when nothing matches.
  """
  import re

  if not isinstance(text, str):
    return []

  regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
  # The pattern has several capture groups, so findall yields one tuple per
  # match; element 0 is the outermost group, i.e. the complete URL.
  return [groups[0] for groups in re.findall(regex, text)]

In [10]:
def _pick_icf_url(study_docs):
    """Return the URL of the Informed Consent Form in ``study_docs``, or None.

    ``study_docs`` is expected to look like the sample value shown earlier in
    this notebook: entries joined by "|", each a label followed by a URL.
    The last entry whose text mentions "informed consent" wins; when no entry
    does, the last URL of the whole field is used instead.
    """
    # An empty CSV cell is not a string; there is nothing to extract from it.
    if not isinstance(study_docs, str):
        return None

    icf_urls, all_urls = [], []
    # Scan entry by entry: run over the whole field at once, the URL pattern
    # swallows the "|" and the first word of the following label.
    for entry in study_docs.split("|"):
        entry_urls = extract_url(entry)
        all_urls.extend(entry_urls)
        if "informed consent" in entry.lower():
            icf_urls.extend(entry_urls)

    candidates = icf_urls or all_urls
    return candidates[-1] if candidates else None


for index, row in studies_df.iterrows():
    nct_number = row['NCT Number']
    icf_url = _pick_icf_url(row['Study Documents'])
    if icf_url is None:
        print(f"No document URL found for {index} , {nct_number}")
        continue

    print(f"Downloading ICF for {index} , {nct_number}")
    outcome = download_icf(icf_url, nct_number)
    # download_icf signals failure by returning the exception instead of True.
    if outcome is not True:
        print(f"Download failed for {index} , {nct_number}: {outcome!r}")

Downloading ICF for 0 , NCT01806870
Downloading ICF for 1 , NCT04410211
Downloading ICF for 2 , NCT04362111
Downloading ICF for 3 , NCT04252170
Downloading ICF for 4 , NCT03252470
Downloading ICF for 5 , NCT05132699
Downloading ICF for 6 , NCT04646499
Downloading ICF for 7 , NCT00051311
Downloading ICF for 8 , NCT05031611
Downloading ICF for 9 , NCT03655470
Downloading ICF for 10 , NCT05010070
Downloading ICF for 11 , NCT05513170
Downloading ICF for 12 , NCT03014570
Downloading ICF for 13 , NCT03982511
Downloading ICF for 14 , NCT03818711
Downloading ICF for 15 , NCT06101199
Downloading ICF for 16 , NCT03988699
Downloading ICF for 17 , NCT03421899
Downloading ICF for 18 , NCT04041570
Downloading ICF for 19 , NCT03364270
Downloading ICF for 20 , NCT01764711
Downloading ICF for 21 , NCT03437811
Downloading ICF for 22 , NCT02269670
Downloading ICF for 23 , NCT05801770
Downloading ICF for 24 , NCT01903811
Downloading ICF for 25 , NCT03168711
Downloading ICF for 26 , NCT03637413
Downloading