## Installs/ Imports

In [25]:
!pip install BeautifulSoup4



In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Data Scraping

The code below scrapes Google Scholar for results matching the query "patient trial matching". We retrieve the rank, title, hyperlink and partial abstract of the first 1000 articles. The requests are sent and processed in batches of 10 articles, with delays between requests and retries if the server is non-responsive. The results of each batch are parsed, processed and saved in `google_scholar_patient_trial_matching_data.csv`.

In [27]:
def write_csv_file(data_list, file_path="google_scholar_patient_trial_matching_data.csv"):
    """Save the scraped records in `data_list` to a CSV file.

    The file is rewritten from scratch on every call (it is NOT appended to), so
    callers must pass the complete list of records collected so far.

    Args:
        data_list: List of dictionaries, one per record; the dictionary keys
            become the CSV columns.
        file_path: Destination path of the CSV file. Defaults to
            `google_scholar_patient_trial_matching_data.csv` in the current
            working directory.
    """
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(data_list)
    # Overwrite the CSV file with the full DataFrame (without the index column)
    df.to_csv(file_path, index=False)
    print(f"Data has been saved to '{file_path}' in CSV format.")

The function below manages retries when the server is non-responsive or returns an error status code. In addition, when a 429 (or 403) status code is received, it waits before retrying, and the delay grows exponentially for as long as a 200 response is not obtained. This continues until the maximum number of retry attempts is reached.

In [28]:
def send_request_with_retries(req_url, max_retries, headers, payload, timeout=30):
    """Send a GET request for a batch of articles, retrying while the server throttles it.

    Server errors (500, 502, 503, 504) are retried by the HTTP adapter mounted on
    the session. Responses with status 429 or 403 are retried by this function
    with exponentially growing delays (1s, 2s, 4s, ...). Any other non-200
    status stops the retries immediately.

    Args:
        req_url: URL to request.
        max_retries: Maximum number of retries, used both for the adapter-level
            retries and for the 429/403 retry loop.
        headers: HTTP headers sent with every attempt.
        payload: Data passed as the `data` argument of the request.
        timeout: Seconds to wait for the server on each attempt before giving up,
            so that a stalled connection cannot block forever.

    Returns:
        The response if a request succeeded with status code 200; otherwise None
        (after printing an error message).
    """
    # Configure adapter-level retries for transient server errors
    retry_policy = Retry(total=max_retries, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    response = None
    # The context manager closes the session (and its pooled connections) on every exit path
    with requests.Session() as session:
        # Mount the adapter with retries to the session
        session.mount("https://", HTTPAdapter(max_retries=retry_policy))
        try:
            # One initial attempt plus up to `max_retries` retries
            for attempt in range(max_retries + 1):
                response = session.get(req_url, headers=headers, data=payload, timeout=timeout)
                if response.status_code == 200:
                    return response
                if response.status_code not in (429, 403):
                    # Not a throttling response: retrying will not help
                    break
                if attempt < max_retries:
                    # Exponential backoff before the next attempt; no sleep after the last one
                    time.sleep(2 ** attempt)
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            return None
    if response is not None:
        print(f"Request failed with status code {response.status_code}")
    return None

The `get_search_data()` function runs the search across multiple result pages (paginated search, one page at a time), scrapes each page, and compiles the required information. The resulting dataset is then saved to a CSV file for further data management and analysis.

In [29]:
def _clean_text(text):
    """Strip surrounding whitespace from `text` and drop all non-ASCII characters."""
    return text.strip().encode('ascii', 'ignore').decode('ascii')


def get_search_data():
    """Fetch, parse and save the top 1000 Google Scholar results for "patient trial matching".

    Result pages are requested one at a time (10 articles per page) with a fixed
    delay between requests. For every article the rank, title, link and partial
    abstract are collected; after each page the complete list collected so far is
    written to the CSV file via `write_csv_file`. Pages that cannot be fetched
    are skipped.
    """
    total_articles = 1000  # number of results to request in total
    page_size = 10         # Google Scholar results per page
    max_retries = 5
    meta_data = []
    rank_value = 1
    page_count = 1
    # Initial request, only used to obtain the cookies sent with the paginated requests
    response = requests.get(
        url="https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=patient+trial+matching&btnG=",
        timeout=30,
    )
    # Build the Cookie header value: "name=value" pairs separated by "; "
    concatenated_cookies = "; ".join(f"{cookie.name}={cookie.value}" for cookie in response.cookies)
    # Pagination: `start` takes the values 0, 10, ..., 990 (100 pages of 10 results)
    for start in range(0, total_articles, page_size):
        search_url = f"https://scholar.google.de/scholar?start={start}&q=patient+trial+matching&hl=de&as_sdt=0,5"
        if start == 0:
            # First page: the referer is the plain search page
            referer = 'https://scholar.google.de/scholar?hl=de&as_sdt=0%2C5&q=patient+trial+matching&btnG='
        else:
            # Later pages: the referer is the previous result page
            referer = f"https://scholar.google.de/scholar?start={start - page_size}&q=patient+trial+matching&hl=de&as_sdt=0,5"
        payload = {}
        headers = {
            'authority': 'scholar.google.de',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': concatenated_cookies,
            'referer': referer,
            'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'x-client-data': 'CJW2yQEIorbJAQipncoBCNr5ygEIkqHLAQiFoM0BCIinzQEI3L3NAQjgxM0BCLnKzQEIuM3NAQiTz80B'
        }
        # A delay is necessary when sending many requests to a server; increase it
        # (e.g. to 6 seconds or more) if 429 or 403 status codes are received
        time.sleep(4)
        response = send_request_with_retries(search_url, max_retries, headers, payload)

        if response is not None:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Tags containing the titles and the partial abstracts of the results
            titles = soup.find_all(class_='gs_rt')
            abstracts = soup.find_all(class_='gs_rs')
            for position, title_tag in enumerate(titles):
                link_tag = title_tag.findChild(name='a')
                # Titles and abstracts are paired by position; a page may contain fewer
                # abstracts than titles, in which case the abstract is left empty.
                # NOTE(review): a result without an abstract in the middle of a page
                # would shift the pairing of the following results - confirm against
                # the page structure.
                abstract_tag = abstracts[position] if position < len(abstracts) else None
                item = {
                    "Rank": rank_value,
                    # Title and link are None when the result has no hyperlink
                    "Title": _clean_text(link_tag.get_text()) if link_tag else None,
                    "Link": link_tag['href'] if link_tag else None,
                    "Abstract": (
                        _clean_text(abstract_tag.get_text()).strip().replace("\n", "")
                        if abstract_tag else None
                    ),
                }
                # Increment rank value for each record
                rank_value = rank_value + 1
                meta_data.append(item)
                print(item)
            print(f"********************* {page_count}-PAGE SCRAPED !! *********************")
        else:
            print(f"********************* {page_count}-PAGE SKIPPED (no response) *********************")
        if meta_data:
            # Rewrite the CSV with everything collected so far (acts as a checkpoint)
            write_csv_file(meta_data)
        page_count = page_count + 1

In [30]:
# Run the full scrape: every parsed record is printed and the CSV file is rewritten after each page
get_search_data()


{'Rank': 1, 'Title': 'Project MATCH: Rationale and methods for a multisite clinical trial matching patients to alcoholism treatment', 'Link': 'https://ohsu.elsevierpure.com/en/publications/project-match-rationale-and-methods-for-a-multisite-clinical-tria-2', 'Abstract': 'Interaction effects with selected patient characteristics will be studied. Project MATCH will provide a rigorous test of the utility of patient-treatment matching in general and, depending on'}
{'Rank': 2, 'Title': 'DeepEnroll: patient-trial matching with deep embedding and entailment prediction', 'Link': 'https://dl.acm.org/doi/abs/10.1145/3366423.3380181', 'Abstract': 'We evaluated DeepEnroll on both real world clinical trial dataset and a synthetic data. We evaluated the patient-trial matching via predicting patient enrollment for trials. DeepEnroll'}
{'Rank': 3, 'Title': 'COMPOSE: Cross-modal pseudo-siamese network for patient trial matching', 'Link': 'https://dl.acm.org/doi/abs/10.1145/3394486.3403123', 'Abstract'