In [None]:
# Install scispaCy and spaCy, then the en_core_sci_sm v0.5.4 model from its release tarball.
# NOTE(review): none of the cells visible here load en_core_sci_sm — confirm it is still needed.
!pip install scispacy spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz



In [None]:
# Install the en_ner_bionlp13cg_md v0.5.4 model; later cells load it by name via spacy.load.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz

In [None]:
import spacy
# AbbreviationDetector is never referenced by name in this cell.
# NOTE(review): presumably the import is kept for its side effect of registering
# the "abbreviation_detector" factory used by add_pipe below — confirm against
# the scispaCy docs before removing it as "unused".
from scispacy.abbreviation import AbbreviationDetector

# Load the en_ner_bionlp13cg_md pipeline installed by the pip cell above.
nlp = spacy.load("en_ner_bionlp13cg_md")

# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

In [None]:


# Rebind `nlp` to a newly loaded en_ner_bionlp13cg_md pipeline. The pipeline
# object that received the abbreviation_detector pipe earlier is no longer
# the one used below.
nlp = spacy.load("en_ner_bionlp13cg_md")

# Sample sentence naming a virus and two bacteria by their full names.
test_text = "COVID-19 is caused by SARS-CoV-2. Studies also show interactions with Escherichia coli and Staphylococcus aureus."

# Run the pipeline over the sample sentence.
doc = nlp(test_text)

# Pair every recognised span with its label and display the result.
entities = [(span.text, span.label_) for span in doc.ents]
print("Extracted Entities:", entities)


In [None]:
import spacy
import requests
from Bio import Entrez
import time
import re


def clean_text(text):
    """Strip XML/HTML-style tags from ``text`` and normalise its whitespace.

    Every ``<...>`` span is replaced by a space, so words separated only by a
    tag are not glued together. Each run of whitespace is then collapsed to a
    single space and leading/trailing whitespace is dropped.

    Caveat: any ``<`` ... ``>`` pair is treated as a tag, so prose such as
    ``"x < 5 and y > 3"`` loses the text between the two symbols.

    Args:
        text: Raw string, possibly containing markup. ``None`` or an empty
            string yields ``""``.

    Returns:
        The cleaned, single-spaced string.
    """
    if not text:
        return ""
    without_tags = re.sub(r"<[^>]+>", " ", text)
    # split()/join collapses the double spaces left behind by removed tags,
    # as well as newlines and tabs, and trims both ends in one pass.
    return " ".join(without_tags.split())


# Load the SciSpaCy en_ner_bionlp13cg_md pipeline for biomedical entity recognition.
# This rebinds the module-level `nlp` that extract_entities below reads at call time.
print("Loading SciSpaCy model...")
nlp = spacy.load("en_ner_bionlp13cg_md")

def extract_entities(text):
    """Return the text of each organism-type entity SciSpaCy finds in ``text``.

    The module-level ``nlp`` pipeline is run over ``text``; spans labelled
    SPECIES, TAXON, ORGANISM or VIRUS are kept, in the order the pipeline
    yields them. Repeated mentions appear as repeated list items.

    NOTE(review): confirm which of these labels the loaded model actually
    emits — labels it never produces simply never match.
    """
    wanted_labels = {"SPECIES", "TAXON", "ORGANISM", "VIRUS"}
    matches = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            matches.append(span.text)
    return matches

def get_papers_from_orcid(orcid: str, timeout: float = 30):
    """Fetch the works list for an ORCID iD and summarise each work.

    Calls the public ORCID v3.0 ``/works`` endpoint and, for every entry in the
    response's ``group`` list, reads the first ``work-summary``.

    Args:
        orcid: The ORCID iD, e.g. ``"0000-0002-7115-407X"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dicts with keys ``title`` (``'No title'`` when absent),
        ``year`` (``''`` when absent) and ``doi`` (``None`` when the work has
        no DOI). Returns ``[]`` after printing the status code if the HTTP
        response is not OK.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout;
        those errors are not caught here.
    """
    url = f"https://pub.orcid.org/v3.0/{orcid}/works"
    headers = {"Accept": "application/json"}

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        print(f"Error fetching ORCID data: {response.status_code}")
        return []

    data = response.json() or {}
    papers = []

    # `x.get(key) or {}` is used throughout instead of `x.get(key, {})`:
    # a key that is present with an explicit JSON null comes back as None,
    # which would make the next .get() in the chain raise AttributeError.
    for work in data.get('group') or []:
        summaries = work.get('work-summary') or []
        if not summaries:
            continue
        work_summary = summaries[0]

        title_info = (work_summary.get('title') or {}).get('title') or {}
        year_info = (work_summary.get('publication-date') or {}).get('year') or {}

        paper = {
            'title': title_info.get('value') or 'No title',
            'year': year_info.get('value') or '',
            'doi': None
        }

        external_ids = (work_summary.get('external-ids') or {}).get('external-id') or []
        for ext_id in external_ids:
            # No break: if several DOIs are listed, the last one wins.
            if ext_id.get('external-id-type') == 'doi':
                paper['doi'] = ext_id.get('external-id-value')

        papers.append(paper)

    return papers

def get_abstract_from_doi(doi, timeout: float = 30):
    """Best-effort lookup of a paper's abstract via the Crossref works API.

    Args:
        doi: The DOI to look up. A falsy value returns ``''`` without making
            a request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``abstract`` field of the Crossref record as returned by the API
        (no markup is stripped here), or ``''`` when the request fails, the
        response is not OK, or the record has no abstract.
    """
    if not doi:
        return ''
    try:
        response = requests.get(f"https://api.crossref.org/works/{doi}", timeout=timeout)
        if response.ok:
            data = response.json()
            # `or ''` also covers an explicit null abstract in the JSON.
            return data['message'].get('abstract') or ''
    except Exception as e:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still stop a long run. The lookup stays best-effort.
        print(f"Error getting abstract for {doi}: {e}")
    return ''

def get_taxid(organism_name):
    """Look up an organism name in the NCBI Taxonomy database via Entrez.

    Runs an ``esearch`` for the name, takes the first ID in the result, then
    ``efetch``es that record to read its details.

    Args:
        organism_name: Name to search for. Empty or whitespace-only names
            return ``None`` without contacting NCBI.

    Returns:
        A dict with keys ``taxid``, ``scientific_name``, ``rank`` and
        ``division`` for the first match, or ``None`` when nothing matches or
        any error occurs (the error is printed, not raised).
    """
    if not organism_name or not str(organism_name).strip():
        return None

    try:
        handle = Entrez.esearch(db="taxonomy", term=organism_name)
        # try/finally so the handle is closed even if parsing raises.
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        if record["Count"] == "0" or not record["IdList"]:
            return None
        taxid = record["IdList"][0]

        handle = Entrez.efetch(db="taxonomy", id=taxid)
        try:
            details = Entrez.read(handle)
        finally:
            handle.close()

        if details:
            return {
                'taxid': taxid,
                'scientific_name': details[0].get('ScientificName', ''),
                'rank': details[0].get('Rank', ''),
                'division': details[0].get('Division', '')
            }
    except Exception as e:
        print(f"Error getting TAXID for {organism_name}: {str(e)}")
    return None





In [None]:
# Smoke-test the current `nlp` pipeline on a sentence that uses the
# abbreviated form "E. coli" instead of the full species name.
test_text = "COVID-19 is caused by SARS-CoV-2. Studies also show interactions with E. coli and Staphylococcus aureus."
doc = nlp(test_text)

# Pair every recognised span with its label and display the result.
entities = [(span.text, span.label_) for span in doc.ents]
print("Extracted Entities:", entities)

In [None]:
# Run analysis: fetch a researcher's works from ORCID, pull organism mentions
# out of each title (plus Crossref abstract when available), and resolve each
# distinct mention to an NCBI Taxonomy ID.
orcid = "0000-0002-7115-407X"  # Replace with your ORCID
Entrez.email = "hanna.palya@warwick.ac.uk"  # Replace with your email

# Get papers
print(f"Fetching papers for ORCID: {orcid}")
papers = get_papers_from_orcid(orcid)
print(f"Found {len(papers)} papers")

# Maps the entity text as it appeared in a paper -> taxonomy info dict from get_taxid.
found_organisms = {}

# Process each paper
for paper in papers:
    print(f"\nAnalyzing paper: {paper['title']}")

    # The title is always used; the abstract is appended when the DOI yields one.
    # `or ""` guards against a missing (None) title breaking the concatenation.
    text = paper['title'] or ""
    if paper['doi']:
        abstract = get_abstract_from_doi(paper['doi'])
        if abstract:
            text += " " + abstract

    print(f"Text for extraction: {text}")  # Debugging print

    text = clean_text(text)  # Clean before passing to extraction
    # Extract entities using SciSpaCy
    entities = extract_entities(text)

    print(f"Entities found: {entities}")  # Debugging print

    # Look up each not-yet-resolved entity in the taxonomy database.
    for entity in entities:
        if entity in found_organisms:
            # Already resolved for an earlier mention: no request, so no delay needed.
            continue

        tax_info = get_taxid(entity)
        time.sleep(0.5)  # Be nice to NCBI servers (only after a real lookup)

        if tax_info:
            found_organisms[entity] = tax_info
            print(f"Found organism: {entity}")
            print(f"TAXID: {tax_info['taxid']}")
            print(f"Scientific name: {tax_info['scientific_name']}")
            print("-" * 40)

print("\nSummary of all organisms found:")
for org_name, info in found_organisms.items():
    print(f"\nOrganism: {org_name}")
    print(f"TAXID: {info['taxid']}")
    print(f"Scientific name: {info['scientific_name']}")