In [29]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import re
import pandas as pd
import numpy as np

In [30]:
def get_papers_pages(keywords, num_pages=20):

    # base URL
    base_url = "https://www.oalib.com/search"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    pages = []  
    for page_no in range(1, num_pages + 1):
        # params
        query_params = {
            "type": "0",
            "oldType": "0",
            "kw": keywords,
            "searchField": "All",
            "__multiselect_searchField": "",
            "fromYear": "",
            "toYear": "",
            "pageNo": page_no,
        }
        
        # request
        response = requests.get(base_url, params=query_params, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_no}, status code: {response.status_code}")
            continue
        
        try:
            soup = BeautifulSoup(response.content, 'lxml')
            pages.append(soup)
            print(f"Page {page_no} fetched successfully.")
        except Exception as e:
            print(f"An error occurred on page {page_no}: {e}")
    
    return pages



In [31]:
pages = get_papers_pages("Children Mental Health Cause", num_pages=100)
print(f"Total pages fetched: {len(pages)}")

Page 1 fetched successfully.
Page 2 fetched successfully.
Page 3 fetched successfully.
Page 4 fetched successfully.
Page 5 fetched successfully.
Page 6 fetched successfully.
Page 7 fetched successfully.
Page 8 fetched successfully.
Page 9 fetched successfully.
Page 10 fetched successfully.
Page 11 fetched successfully.
Page 12 fetched successfully.
Page 13 fetched successfully.
Page 14 fetched successfully.
Page 15 fetched successfully.
Page 16 fetched successfully.
Page 17 fetched successfully.
Page 18 fetched successfully.
Page 19 fetched successfully.
Page 20 fetched successfully.
Page 21 fetched successfully.
Page 22 fetched successfully.
Page 23 fetched successfully.
Page 24 fetched successfully.
Page 25 fetched successfully.
Page 26 fetched successfully.
Page 27 fetched successfully.
Page 28 fetched successfully.
Page 29 fetched successfully.
Page 30 fetched successfully.
Page 31 fetched successfully.
Page 32 fetched successfully.
Page 33 fetched successfully.
Page 34 fetched suc

In [32]:
links = []
for page_index, page in enumerate(pages):
    print(f"Processing page {page_index + 1}...")
    
    # find all <a> 
    a_tags = page.find_all('a', href=True)  
    
    for a_tag in a_tags:
        href = a_tag['href']
        # Filter links containing '/paper/'
        if "/paper/" in href and href.startswith("//") and "journalID" not in href:
            full_link = "https:" + href
            links.append(full_link)

print(f"Total links extracted: {len(links)}")

# check code
if links:
    print(f"Example link: {links[0]}")
else:
    print("No links extracted. Please check the HTML structure or search results.")


Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Processing page 27...
Processing page 28...
Processing page 29...
Processing page 30...
Processing page 31...
Processing page 32...
Processing page 33...
Processing page 34...
Processing page 35...
Processing page 36...
Processing page 37...
Processing page 38...
Processing page 39...
Processing page 40...
Processing page 41...
Processing page 42...
Processing page 43...
Processing page 44...
Processing page 45...
Processing page 46.

In [33]:
links

['https://www.oalib.com/paper/2985608',
 'https://www.oalib.com/paper/3147510',
 'https://www.oalib.com/paper/3149804',
 'https://www.oalib.com/paper/2989785',
 'https://www.oalib.com/paper/6761171',
 'https://www.oalib.com/paper/2985517',
 'https://www.oalib.com/paper/5302203',
 'https://www.oalib.com/paper/5296887',
 'https://www.oalib.com/paper/2990618',
 'https://www.oalib.com/paper/2993913',
 'https://www.oalib.com/paper/283534',
 'https://www.oalib.com/paper/3110970',
 'https://www.oalib.com/paper/3153612',
 'https://www.oalib.com/paper/13800',
 'https://www.oalib.com/paper/2984491',
 'https://www.oalib.com/paper/2990251',
 'https://www.oalib.com/paper/5424231',
 'https://www.oalib.com/paper/3143915',
 'https://www.oalib.com/paper/2663751',
 'https://www.oalib.com/paper/3662033',
 'https://www.oalib.com/paper/2993914',
 'https://www.oalib.com/paper/5277060',
 'https://www.oalib.com/paper/5280208',
 'https://www.oalib.com/paper/2990672',
 'https://www.oalib.com/paper/6119489',
 'h

In [34]:
# # Traverse the list of links and analyze the keywords of each link.
def extract_keywords(html_content):

    soup = BeautifulSoup(html_content, 'lxml')

    # locate <p class="keyword"> 
    keywords_section = soup.find('p', class_='keyword')
    if not keywords_section:
        print("Keywords section not found.")
        return []

    # extract all content in <a> label
    keywords = []
    a_tags = keywords_section.find_all('a', href=True)
    for a_tag in a_tags:
        keyword = a_tag.get_text(strip=True)
        keywords.append(keyword)

    return keywords

In [35]:
# extract our abstract of each essay
def extract_abstract(html_content):
    soup = BeautifulSoup(html_content, 'lxml')

    # locate Abstract 
    abstract_section = soup.find('div', style=lambda value: value and 'line-height:21px;font-size:11pt' in value)
    if not abstract_section:
        print("Abstract section not found.")
        return "Abstract not found."

    # extract abstract text
    paragraphs = abstract_section.find_all('p')
    abstract = " ".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    processed_abstract = re.sub(r'\n', ' ', abstract)  
    processed_abstract = re.sub(r'\s+', ' ', processed_abstract)  
    processed_abstract = processed_abstract.strip()  

    return processed_abstract if processed_abstract else "Abstract not found."

# test our code
html_content = requests.get('https://www.oalib.com/research/2990251').content
abstract = extract_abstract(html_content)
print("Extracted Abstract:")
print(abstract)


Extracted Abstract:
Type I Diabetes Mellitus (DM I) is the third most common chronic childhood disease and can cause both short-term and long-term complications, as well as acute life-threatening events. The announcement of the DM I diagnosis in childhood or adolescence constitutes a major psychosocial stressor for the child and his family. Diabetes in general complicates the parent-child relationship and increases adolescents’negative thoughts about themselves and depressive mood. The majority of patients experience depressive and/or anxiety symptoms at thetime of diagnosis, which generally resolve withinsix to nine months. Poor adjustment in this initial phase places adolescents at risk for later psychosocial difficulties. On a long-term basis, individuals with DM I may exhibit significant psychiatric and behavioral problems including depression, anxiety and anger. We here reported the case of a 13-year-old boy, suffering from DM I for 3 years, who developed poor metabolic control, d

In [None]:
def filter_links_by_keywords(links):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # condition: keywords include "Mental Health" and "Children/Adolescent/Teenage"
    primary_keyword = "Mental Health"
    related_keywords = ["Children", "Adolescent", "Teenage"]

    filtered_links = []
    data = []

    for link in links:
        try:
            response = requests.get(link, headers=headers)
            if response.status_code != 200:
                print(f"Failed to fetch link: {link}, status code: {response.status_code}")
                continue

            # extract keywords via calling extract_keywords function
            html_content = response.text
            keywords = extract_keywords(html_content)
            my_abstract = extract_abstract(html_content)

            # check whether the keywords match the condition
            if (
                any(primary_keyword in keyword for keyword in keywords) and
                any(related in keyword for related in related_keywords for keyword in keywords)
            ):
                filtered_links.append(link)
                print(f"Link added: {link} (Keywords: {keywords})")
                data.append({
                    "Link": link,
                    "Keywords": ", ".join(keywords),
                    "Abstract": my_abstract
                })
            else:
                print(f"Link skipped: {link} (Keywords: {keywords})")

        except Exception as e:
            print(f"An error occurred while processing link: {link}, error: {e}")

    df = pd.DataFrame(data)

    # save to csv file
    output_path = os.path.join("..", "data", "processed", "filtered_essays.csv")
    df.to_csv(output_path, index=False, encoding='utf-8')
    print("CSV file has been saved as 'filtered_essays.csv'")

    return filtered_links

In [37]:
filtered_links = filter_links_by_keywords(links)

Link skipped: https://www.oalib.com/paper/2985608 (Keywords: ['Depression', 'Mental Health', 'HIV AIDS', 'South Africa'])
Link added: https://www.oalib.com/paper/3147510 (Keywords: ['Child Care', 'Children', 'Adolescent', 'Mental Health', 'Nursing'])
Link added: https://www.oalib.com/paper/3149804 (Keywords: ['Disabled Children', 'Mental Health Problems', 'Quality of Life', 'Gaza Strip'])
Link skipped: https://www.oalib.com/paper/2989785 (Keywords: ['Mental Health', 'Internalizing Symptoms', 'Adolescence', 'Middle Adulthood'])
Link skipped: https://www.oalib.com/paper/6761171 (Keywords: ['Paternity', 'Fatherhood', 'Paternity Support', 'Paternal Mental Health', 'Difficulties with Fatherhood'])
Link skipped: https://www.oalib.com/paper/2985517 (Keywords: ['BRFSS Surveillance Data', 'Rural Mental Health', 'Perceived Stigma of Mental Health Issues'])
Link skipped: https://www.oalib.com/paper/5302203 (Keywords: ['Low Back Pain', 'Mental Health', 'Pregnancy', 'Prospective Study'])
Link skipp