<a href="https://colab.research.google.com/github/asundar0128/ResearchPapersQuery/blob/main/PubMedBiopharmaFiltering_Abhinit_Sundar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!curl -sSL https://install.python-poetry.org | python3 -
import os
os.environ["PATH"] += ":/root/.local/bin"

[36mRetrieving Poetry metadata[0m

# Welcome to [36mPoetry[0m!

This will download and install the latest version of [36mPoetry[0m,
a dependency and package manager for Python.

It will add the `poetry` command to [36mPoetry[0m's bin directory, located at:

[33m/root/.local/bin[0m

You can uninstall at any time by executing this script with the --uninstall option,
and these changes will be reverted.

Installing [36mPoetry[0m ([36m2.1.3[0m)
[1A[2KInstalling [36mPoetry[0m ([1m2.1.3[0m): [33mCreating environment[0m
[1A[2KInstalling [36mPoetry[0m ([1m2.1.3[0m): [33mInstalling Poetry[0m
[1A[2KInstalling [36mPoetry[0m ([1m2.1.3[0m): [33mCreating script[0m
[1A[2KInstalling [36mPoetry[0m ([1m2.1.3[0m): [33mDone[0m

[36mPoetry[0m ([1m2.1.3[0m) is installed now. Great!

To get started you need [36mPoetry[0m's bin directory ([33m/root/.local/bin[0m) in your `PATH`
environment variable.

Add `export PATH="[33m/root/.local/bin[0m:$PATH"` to yo

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
!poetry new pubmed_fetcher_project --src
%cd pubmed_fetcher_project

The [36m--src[39m option is now the default and will be removed in a future version.
Created package [34mpubmed_fetcher_project[39m in [34mpubmed_fetcher_project[39m
/content/pubmed_fetcher_project


In [None]:
# Project metadata, dependencies and console script for the Poetry package.
# This overwrites the pyproject.toml in the current directory, so the
# [build-system] table has to be written here as well; without it, build
# front-ends have no declared backend for this project.
PyProjectPoetryDependency = """
[tool.poetry]
name = "pubmed-fetcher"
version = "0.1.0"
description = "Fetch PubMed papers with biotech/pharma affiliations using typed, robust, and modular Python"
authors = ["Abhinit Sundar <aks94@njit.edu>"]
packages = [{ include = "pubmed_fetcher" }]

[tool.poetry.dependencies]
python = "^3.9"
biopython = "^1.83"
pandas = "^2.2.2"
tqdm = "^4.66.4"

[tool.poetry.scripts]
get-papers-list = "pubmed_fetcher.cli:main"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
"""
# Explicit encoding so the written file does not depend on the platform default.
with open("pyproject.toml", "w", encoding="utf-8") as f:
    f.write(PyProjectPoetryDependency)

In [None]:
# Ensure the pubmed_fetcher package directory exists, then (re)create an
# empty __init__.py inside it so the directory is importable as a package.
import os
os.makedirs("pubmed_fetcher", exist_ok=True)
open("pubmed_fetcher/__init__.py", "w").close()

In [None]:
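# Core module: imports typing, re, pandas, tqdm, and Bio (Entrez, Medline).
# NOTE: the CLI cell below imports search_pubmed, fetch_details and process_records
# from pubmed_fetcher.core, so this cell's source must also be saved as
# pubmed_fetcher/core.py (for example by starting the cell with
# `%%writefile pubmed_fetcher/core.py`); no other cell creates that file.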
from typing import List, Dict
import re
import pandas as pd
from tqdm import tqdm
from Bio import Entrez, Medline

# Email address assigned to Biopython's Entrez module (the author's university
# address); set once here, before the esearch/efetch helpers defined below are called.
Entrez.email = "aks94@njit.edu"

# List of corporate keywords. Company-name stems ("pharma", "biotech", ...) are
# matched anywhere in the affiliation so compounds such as "biopharmaceuticals"
# still count.
corporateKeywords = [
    "pharma", "pharmaceutical", "biotech", "biotechnology",
    "inc", "ltd", "llc", "gmbh", "corp", "co.", "plc"
]

# Legal-entity abbreviations from corporateKeywords. They are too short to be
# matched as raw substrings ("inc" occurs in "Princeton", "co." in "Puerto Rico."),
# so they only count when they begin a word.
corporateAbbreviationKeywords = frozenset(
    ["inc", "ltd", "llc", "gmbh", "corp", "co.", "plc"]
)

# List of institutional keywords
institutionalKeywords = [
    "university", "université", "universität", "institute", "school",
    "college", "department", "faculty", "hospital", "centre", "center",
    "clinic", "foundation", "research center", "research institute"
]


def _hasCorporateKeyword(lowercaseAffiliation: str) -> bool:
    """Return True when a corporate keyword occurs in the lower-cased affiliation."""
    for keyword in corporateKeywords:
        if keyword in corporateAbbreviationKeywords:
            # Require a word start, but allow longer forms ("corporation", "incorporated").
            if re.search(r"(?<!\w)" + re.escape(keyword), lowercaseAffiliation):
                return True
        elif keyword in lowercaseAffiliation:
            return True
    return False


# Finding keywords for company affiliation for both corporate and institutional
def companyAffiliation(generatedAffiliation: str) -> bool:
    """Decide whether an affiliation string looks like a company.

    Args:
        generatedAffiliation: One affiliation string from a PubMed record.

    Returns:
        True when the text contains at least one corporate keyword and no
        institutional keyword; False otherwise, including for empty or
        non-string input.
    """
    if not isinstance(generatedAffiliation, str) or not generatedAffiliation:
        return False

    lowercaseAffiliation = generatedAffiliation.lower()

    # Any institutional keyword vetoes the match, exactly as before.
    if any(keyword in lowercaseAffiliation for keyword in institutionalKeywords):
        return False

    return _hasCorporateKeyword(lowercaseAffiliation)

# Retrieving author's corresponding email address from record with each generated field data
def retrievalEmailsFromRecord(generatedRecord: Dict, debug: bool = False) -> List[str]:
    """Collect the e-mail addresses found in a MEDLINE record.

    The fields "AD", "IR", "AID" and "FAU" are scanned in that order. Addresses
    containing "noreply" or ending in "@example.com" are ignored.

    Args:
        generatedRecord: Mapping of MEDLINE field codes to a string or a list
            of strings.
        debug: When True, print every accepted address with its source field.

    Returns:
        The unique addresses in first-seen order. The order is deterministic,
        so taking element 0 yields the same address on every run.
    """
    fieldsToCheck = ["AD", "IR", "AID", "FAU"]
    emailPattern = r"[\w\.-]+@[\w\.-]+\.\w+"

    # A dict keeps insertion order, giving de-duplication without losing order.
    orderedEmails: Dict[str, None] = {}

    for field in fieldsToCheck:
        generatedFieldData = generatedRecord.get(field) or []
        if isinstance(generatedFieldData, str):
            generatedFieldData = [generatedFieldData]

        for line in generatedFieldData:
            if not isinstance(line, str):
                # Skip malformed entries instead of failing the whole record.
                continue

            for email in re.findall(emailPattern, line):
                lowercaseEmail = email.lower()
                if "noreply" in lowercaseEmail or lowercaseEmail.endswith("@example.com"):
                    continue
                orderedEmails.setdefault(email, None)
                if debug:
                    print(f"[DEBUG] Found email in field '{field}': {email}")

    return list(orderedEmails)

# Using esearch and read functions from Entrez within the pubmed database with a query and specified maximum results
def search_pubmed(generatedQuery: str, generatedMaximumResults: int = 200) -> List[str]:
    """Search PubMed and return the matching PubMed IDs.

    Args:
        generatedQuery: Query string passed to Entrez.esearch as `term`.
        generatedMaximumResults: Value passed to Entrez.esearch as `retmax`.

    Returns:
        The "IdList" entry of the parsed esearch response, as a list. A blank
        query returns an empty list without issuing a request.
    """
    if not generatedQuery or not generatedQuery.strip():
        return []

    generatedHandle = Entrez.esearch(db="pubmed", term=generatedQuery, retmax=generatedMaximumResults)
    try:
        generatedRecord = Entrez.read(generatedHandle)
    finally:
        # Close the handle even when parsing the response raises.
        generatedHandle.close()
    return list(generatedRecord["IdList"])

# Fetching the details from PubMed IDs with efetch function from Entrez for PubMed database
def fetch_details(generatedPubMedIDS: List[str]) -> List[Dict]:
    """Fetch the MEDLINE records for the given PubMed IDs.

    Args:
        generatedPubMedIDS: PubMed IDs; they are joined with commas into one
            efetch request.

    Returns:
        The records produced by Medline.parse, as a list. An empty ID list
        returns an empty list without issuing a request.
    """
    if not generatedPubMedIDS:
        return []

    generatedHandle = Entrez.efetch(
        db="pubmed",
        id=",".join(generatedPubMedIDS),
        rettype="medline",
        retmode="text"
    )
    try:
        # Medline.parse is consumed fully here, while the handle is still open.
        generatedRecords = list(Medline.parse(generatedHandle))
    finally:
        # Close the handle even when parsing raises.
        generatedHandle.close()
    return generatedRecords

# Processing records with PMID, Title, Publication Date, Authors, and Affiliations

# Column order of the frame returned by process_records. Passing it explicitly
# keeps the schema stable even when no record matches.
processedRecordColumns = [
    "PubmedID",
    "Title",
    "Publication Date",
    "Non-academic Author(s)",
    "Company Affiliation(s)",
    "Corresponding Author Email",
    "Raw Publication Date",
]


def _asStringList(generatedValue) -> List[str]:
    """Return a record field as a list, wrapping a lone string and mapping None to []."""
    if generatedValue is None:
        return []
    if isinstance(generatedValue, str):
        return [generatedValue]
    return list(generatedValue)


def _selectNonAcademicAuthors(
    generatedAuthors: List[str],
    generatedCompanyFlags: List[bool],
) -> List[str]:
    """Pick the authors to report for the company-flagged affiliations (best effort)."""
    companyCount = sum(generatedCompanyFlags)
    if not companyCount or not generatedAuthors:
        return []
    if len(generatedAuthors) == len(generatedCompanyFlags):
        # One affiliation per author: pair them by position.
        return [
            author
            for author, isCompany in zip(generatedAuthors, generatedCompanyFlags)
            if isCompany
        ]
    # Counts differ, so positions cannot be paired; keep the earlier
    # approximation of reporting as many leading authors as there are
    # company affiliations.
    return generatedAuthors[:companyCount]


def process_records(generatedRecords: List[Dict], debug: bool = False) -> pd.DataFrame:
    """Build a table of the records that have at least one company affiliation.

    Args:
        generatedRecords: Records as mappings of MEDLINE field codes; the
            fields read here are "PMID", "TI", "DP", "FAU" and "AD".
        debug: When True, print each affiliation, each accepted e-mail and a
            warning for every record skipped because of an error.

    Returns:
        A DataFrame with the columns listed in processedRecordColumns, one row
        per matching record. The columns are present even when nothing matches.
    """
    generatedResults = []

    for generatedRecord in tqdm(generatedRecords, desc="Processing"):
        try:
            generatedPMID = generatedRecord.get("PMID", "")
            generatedTitle = generatedRecord.get("TI", "")
            generatedPublicationDateRaw = generatedRecord.get("DP", "") or ""
            generatedAuthors = _asStringList(generatedRecord.get("FAU", []))
            generatedAffiliations = _asStringList(generatedRecord.get("AD", []))

            # Flag every affiliation once; the flags drive both output columns.
            generatedCompanyFlags = []
            for generatedAffiliation in generatedAffiliations:
                if debug:
                    print(f"[DEBUG] Affiliation: {generatedAffiliation}")
                generatedCompanyFlags.append(companyAffiliation(generatedAffiliation))

            generatedCompanyAffiliations = [
                affiliation
                for affiliation, isCompany in zip(generatedAffiliations, generatedCompanyFlags)
                if isCompany
            ]
            if not generatedCompanyAffiliations:
                # Only records with a company affiliation are reported.
                continue

            generatedNonAcademicAuthors = _selectNonAcademicAuthors(
                generatedAuthors, generatedCompanyFlags
            )

            generatedEmailList = retrievalEmailsFromRecord(generatedRecord, debug)
            generatedCorrespondingEmail = generatedEmailList[0] if generatedEmailList else ""

            # The raw date is free text; keep only a four-digit 19xx/20xx year.
            yearMatch = re.search(r"\b(19|20)\d{2}\b", generatedPublicationDateRaw)
            generatedPublicationYear = yearMatch.group(0) if yearMatch else ""

            generatedResults.append({
                "PubmedID": generatedPMID,
                "Title": generatedTitle,
                "Publication Date": generatedPublicationYear,
                "Non-academic Author(s)": "; ".join(generatedNonAcademicAuthors),
                "Company Affiliation(s)": "; ".join(generatedCompanyAffiliations),
                "Corresponding Author Email": generatedCorrespondingEmail,
                "Raw Publication Date": generatedPublicationDateRaw
            })

        # Deliberate best effort: one malformed record must not abort the batch.
        except Exception as generatedException:
            if debug:
                print(f"[WARN] Skipping record due to error: {generatedException}")
            continue

    return pd.DataFrame(generatedResults, columns=processedRecordColumns)

In [None]:
# Source of pubmed_fetcher/cli.py: command line interface for a user-defined
# PubMed query string. Status messages that can occur when CSV is expected on
# standard output go to stderr, and the CSV is written straight to stdout so
# that no extra blank line is appended.
generatedCLICode = """
import argparse
import sys
from pubmed_fetcher.core import search_pubmed, fetch_details, process_records

def main():
    generatedCommandLineParser = argparse.ArgumentParser(
        description="Fetch PubMed papers with biotech/pharma affiliations"
    )

    generatedCommandLineParser.add_argument(
        "query",
        type=str,
        help="User-defined PubMed query string (supports full PubMed syntax)"
    )

    generatedCommandLineParser.add_argument(
        "-d", "--debug",
        action="store_true",
        help="Enable debug logging during execution"
    )

    generatedCommandLineParser.add_argument(
        "-f", "--file",
        type=str,
        help="Optional CSV output filename (if omitted, prints to standard output)"
    )

    generatedParsedArguments = generatedCommandLineParser.parse_args()

    try:
        generatedPubmedIDList = search_pubmed(generatedParsedArguments.query)

        if not generatedPubmedIDList:
            print("[INFO] No results found for the provided PubMed query.", file=sys.stderr)
            return

        generatedPublicationRecords = fetch_details(generatedPubmedIDList)
        generatedProcessedDataFrame = process_records(
            generatedPublicationRecords,
            debug=generatedParsedArguments.debug
        )

        if generatedProcessedDataFrame.empty:
            print("[INFO] No papers matched the biotech/pharma affiliation criteria.", file=sys.stderr)
            return

        if generatedParsedArguments.file:
            generatedProcessedDataFrame.to_csv(generatedParsedArguments.file, index=False)
            print(f"[INFO] Results saved to: {generatedParsedArguments.file}")
        else:
            generatedProcessedDataFrame.to_csv(sys.stdout, index=False)

    except Exception as generatedError:
        print(f"[ERROR] An unexpected error occurred: {generatedError}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
"""
# Explicit encoding so the written module does not depend on the platform default.
with open("pubmed_fetcher/cli.py", "w", encoding="utf-8") as f:
    f.write(generatedCLICode)

In [None]:
# Install the dependencies and the pubmed-fetcher project itself into the
# Poetry environment, so the `get-papers-list` script declared in
# pyproject.toml can be started with `poetry run` in the next cell.
!poetry install

[34mInstalling dependencies from lock file[39m

No dependencies to install or update

[39;1mInstalling[39;22m the current project: [36mpubmed-fetcher[39m ([39;1m0.1.0[39;22m)[1G[2K[39;1mInstalling[39;22m the current project: [36mpubmed-fetcher[39m ([32m0.1.0[39m)


In [None]:
# Run the CLI with a PubMed query and write the matching papers to
# filteredPubMedArticles.csv through the -f option.
!poetry run get-papers-list "(pharmaceutical OR biotech OR biotechnology OR pharma OR drug development OR biopharmaceutical OR life sciences OR medical company) AND (affiliation[Affiliation])" -f filteredPubMedArticles.csv

Processing:   0% 0/200 [00:00<?, ?it/s]Processing: 100% 200/200 [00:00<00:00, 22899.05it/s]
[INFO] Results saved to: filteredPubMedArticles.csv


In [27]:
# Run pytest inside the Poetry environment. The captured output below reports
# 0 collected items, i.e. this run found no tests.
!poetry run pytest

platform linux -- Python 3.11.13, pytest-8.3.5, pluggy-1.6.0
rootdir: /content/pubmed_fetcher_project
configfile: pyproject.toml
plugins: anyio-4.9.0, langsmith-0.4.4, typeguard-4.4.4
[1mcollecting ... [0m[1mcollected 0 items                                                              [0m

