In [None]:
import requests
import pandas as pd
from io import StringIO

In [2]:
def get_studies_with_nextPageToken(token, timeout=60):
  """Fetch one CSV page of ClinicalTrials.gov studies filtered by ``docs:icf``.

  Args:
    token: Page token taken from the previous response's
      ``x-next-page-token`` header, or ``""`` to request the first page.
    timeout: Seconds to wait for the server before ``requests`` gives up
      and raises. Without a timeout a stalled connection blocks forever.

  Returns:
    On HTTP 200, a ``(df, headers)`` pair: the page parsed into a DataFrame
    and the response headers. The first page is parsed with its header row;
    any later page is parsed with ``header=None``, so its columns carry
    integer labels until the caller renames them.
    On any other status, the placeholder pair
    ``("NO study fetched", "Empty")``.
  """
  url = "https://clinicaltrials.gov/api/v2/studies?format=csv&aggFilters=docs:icf"
  # Hand the token to requests as a query parameter so it is percent-encoded
  # and appended to the existing query string, rather than concatenated raw.
  params = {"pageToken": token} if token != "" else None
  result = requests.get(url, params=params, timeout=timeout)

  if result.status_code != 200:
    return "NO study fetched", "Empty"

  csv_text = StringIO(result.content.decode("utf-8"))
  if token == "":
    df = pd.read_csv(csv_text, sep=',')
  else:
    # Continuation pages are read as data only (no header row).
    df = pd.read_csv(csv_text, sep=',', header=None)
  return df, result.headers


In [3]:
# Request the first page (empty token); `headers` carries the token for the
# next page when there is one.
studies_with_icf, headers = get_studies_with_nextPageToken("")
# The helper reports a non-200 response by returning placeholder strings
# instead of a DataFrame. Stop here with a clear message rather than letting
# a later cell fail on a string.
if not isinstance(studies_with_icf, pd.DataFrame):
  raise RuntimeError(f"First page request failed: {studies_with_icf}")
studies_df = studies_with_icf

In [4]:
# Follow the x-next-page-token response header until no further token is sent.
# Pages are gathered in a list and concatenated once after the loop: calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
# Rows end up in the order the pages were fetched.
pages = [studies_df]
fetch_failed = False
while 'x-next-page-token' in headers:
  page, page_headers = get_studies_with_nextPageToken(headers['x-next-page-token'])
  if not isinstance(page, pd.DataFrame):
    # The helper returns placeholder strings on a non-200 response. `headers`
    # is left untouched so re-running this cell retries the same page.
    fetch_failed = True
    break
  # Continuation pages are parsed with integer column labels; give them the
  # first page's column names so the frames line up.
  page.columns = studies_df.columns
  pages.append(page)
  studies_with_icf, headers = page, page_headers

# Keep whatever was fetched, then surface the failure so the run does not
# continue silently with a partial table.
studies_df = pd.concat(pages, ignore_index=True)
if fetch_failed:
  raise RuntimeError(
      f"Paging stopped early after {len(studies_df)} rows: a page request did not return HTTP 200"
  )



In [5]:
# Display the column labels of the combined studies table.
studies_df.columns

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents'],
      dtype='object')

In [None]:
# Write the combined studies table to study_with_icf.csv (relative to the
# current working directory), omitting the DataFrame index column.
studies_df.to_csv("study_with_icf.csv", index=False)

In [None]:
# Reload the studies table from study_with_icf.csv, replacing the in-memory
# frame. NOTE(review): presumably a checkpoint so the download cells can run
# without re-fetching from the API — confirm.
studies_df = pd.read_csv("study_with_icf.csv")

In [None]:
def download_icf(url, nct_number, timeout=60):
  """Download ``url`` and save the response body as ``<nct_number>.pdf``.

  Args:
    url: Address of the document to fetch; redirects are followed.
    nct_number: File name stem; ``".pdf"`` is appended to build the output
      path.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    ``True`` when the file was written. On any failure (network error, HTTP
    error status, file-system error) the caught exception object is returned
    rather than raised, so callers must compare the result against ``True``.
  """
  try:
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    # Turn HTTP error statuses into exceptions so an error page is never
    # saved under a .pdf name.
    r.raise_for_status()

    # The context manager closes the handle even if the write fails.
    with open(nct_number+".pdf", 'wb') as pdf_file:
      pdf_file.write(r.content)

  except Exception as e:
    return e

  return True

In [9]:
def extract_url(text):
  """Return every URL-like substring found in ``text``, in order of appearance.

  Args:
    text: String to scan. Any non-string value (for example ``None`` or the
      float NaN that pandas produces for an empty CSV cell) yields an empty
      list instead of raising ``TypeError``.

  Returns:
    List of matched URL strings; empty when nothing matches.
  """
  import re
  if not isinstance(text, str):
    return []
  regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
  # The pattern contains capture groups, so findall yields one tuple per
  # match; element 0 is the outermost group, i.e. the whole URL.
  return [groups[0] for groups in re.findall(regex, text)]

In [None]:
# Download one document per study row and keep track of the rows that could
# not be downloaded, since download_icf returns its errors instead of raising.
failed_downloads = []
for index, row in studies_df.iterrows():
    nct_number = row['NCT Number']
    study_docs = row['Study Documents']
    # An empty CSV cell is read back as NaN (a float), which cannot be scanned.
    urls = extract_url(study_docs) if isinstance(study_docs, str) else []
    if not urls:
        print(f"No document URL found for {index} , {nct_number}; skipping")
        failed_downloads.append((nct_number, "no document URL"))
        continue
    print(f"Downloading ICF for {index} , {nct_number}")
    # NOTE(review): the last URL in the cell is taken to be the ICF document —
    # confirm against the layout of the 'Study Documents' column.
    outcome = download_icf(urls[-1], nct_number)
    if outcome is not True:
        print(f"Download failed for {index} , {nct_number}: {outcome!r}")
        failed_downloads.append((nct_number, outcome))