In [1]:
pip install requests beautifulsoup4 serpapi biopython

Collecting serpapi
  Downloading serpapi-0.1.5-py2.py3-none-any.whl.metadata (10 kB)
Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading serpapi-0.1.5-py2.py3-none-any.whl (10 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython, serpapi
Successfully installed biopython-1.86 serpapi-0.1.5


In [3]:
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook live under it.
# NOTE(review): the recorded output of this cell is a MessageError (credential propagation
# failed), so the mount may need to be re-authorised before the later cells can run.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd
import re

# Raw input spreadsheet and the destination of the cleaned copy.
INPUT_PATH = "/content/drive/MyDrive/neurologistsFileSlightlyBigger.csv"
OUTPUT_PATH = "/content/drive/MyDrive/cleaned_neurologists2.csv"

df = pd.read_csv(INPUT_PATH)

# Free-text columns to normalise; any column the CSV lacks is skipped.
TEXT_COLS = ["First name", "Job position", "Company name", "Industry", "LinkedIn"]

for col in TEXT_COLS:
    if col not in df.columns:
        continue
    # Cast to str first, so missing cells become the literal text "nan".
    as_text = df[col].astype(str)
    df[col] = as_text.str.lower().str.strip()

# Maps each specialty label to the lowercase substrings that signal it in a job title.
SPECIALTY_KEYWORDS = {
    "cognitive_neurology": ["cognitive", "memory", "dementia"],
    "neuro_ophthalmology": ["ophthalmology", "vision", "eye"],
    "alzheimers": ["alzheimer"],
    "movement_disorders": ["movement", "parkinson"],
    "research": ["research", "scientist", "professor"]
}


def extract_specialties(title):
    """Return the specialty labels whose keywords occur in a job title.

    Args:
        title: Job title text. Matching is case-insensitive substring matching.
            Non-string values (None, NaN, numbers) are treated as "no title".

    Returns:
        The matching labels joined by commas, in ``SPECIALTY_KEYWORDS`` order,
        or "" when nothing matches.
    """
    # A missing title arrives as None/NaN; `kw in title` would raise TypeError on it.
    if not isinstance(title, str):
        return ""

    lowered = title.lower()
    return ",".join(
        specialty
        for specialty, keywords in SPECIALTY_KEYWORDS.items()
        if any(kw in lowered for kw in keywords)
    )

# Tag every row with the specialties detected in its job title.
df["specialties"] = df["Job position"].apply(extract_specialties)

# Job titles too generic to reveal a sub-specialty (compared against the exact title).
VAGUE_TITLES = ["neurologist", "md", "physician"]


def needs_scraping(row):
    """Decide whether a record lacks detail and should be enriched by scraping.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) that may carry the
            keys "LinkedIn", "Job position" and "Industry". Absent keys count
            as blank.

    Returns:
        True when the LinkedIn field is blank, the industry is blank, or the
        job title is exactly one of ``VAGUE_TITLES``; otherwise False.
    """
    def is_blank(value):
        # Real missing values (None / float NaN) are blank, as are the empty
        # string and the "nan" text produced by casting NaN to str.
        if value is None or (isinstance(value, float) and value != value):
            return True
        return value in ("", "nan")

    linkedin_missing = is_blank(row.get("LinkedIn", ""))
    industry_blank = is_blank(row.get("Industry", ""))
    vague_title = row.get("Job position", "") in VAGUE_TITLES
    return linkedin_missing or vague_title or industry_blank

# Flag the rows that still need enrichment (row-wise, hence axis=1).
df["needs_scraping"] = df.apply(needs_scraping, axis=1)

# Persist the cleaned table without the index column.
df.to_csv(OUTPUT_PATH, index=False)

print("Cleaned data saved to cleaned_neurologists2.csv")


Cleaned data saved to cleaned_neurologists2.csv


In [None]:
from serpapi import search
import os

# Read the SerpAPI key from the environment instead of committing it to the notebook
# (set it first, e.g. os.environ["SERPAPI_KEY"] = getpass.getpass()).
# NOTE(review): a literal key was previously hardcoded on this line; treat it as
# exposed and revoke/rotate it.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "")


def google_search(query):
    """Run a Google search through SerpAPI and return the result snippets as one string.

    Args:
        query: Search string, sent as the ``q`` parameter.

    Returns:
        The lowercased snippets of the organic results joined by single spaces
        ("" when there are no organic results).

    Raises:
        RuntimeError: If no API key is configured in ``SERPAPI_KEY``.
    """
    if not SERPAPI_KEY:
        raise RuntimeError(
            "SERPAPI_KEY is not set; define the SERPAPI_KEY environment variable "
            "before calling google_search()."
        )

    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": 10
    }

    results = search(params)

    # A result may have no snippet (or a null one); treat both as empty text
    # rather than crashing on None.lower().
    return " ".join(
        (r.get("snippet") or "").lower()
        for r in results.get("organic_results", [])
    )


def keyword_signal(text, keywords):
    """Return 1 if at least one of ``keywords`` is a substring of ``text``, else 0."""
    for candidate in keywords:
        if candidate in text:
            return 1
    return 0


from Bio import Entrez
import re

# Contact address configured on Biopython's Entrez module for the PubMed requests below.
# NOTE(review): this looks like a placeholder — replace it with a real contact email
# before running against NCBI.
Entrez.email = "your_email@example.com"

def pubmed_alzheimers_count(author_name):
    """Count PubMed records by an author that mention Alzheimer or dementia.

    Args:
        author_name: Name placed in the ``[Author]`` field of the query.
            A blank or non-string value (e.g. NaN from an empty CSV cell)
            yields 0 without contacting PubMed.

    Returns:
        The total number of matching records, as an int.
    """
    if not isinstance(author_name, str) or not author_name.strip():
        return 0

    query = f'{author_name}[Author] AND (Alzheimer OR dementia)'
    # Only the total is used, so no record IDs are requested (retmax=0).
    handle = Entrez.esearch(db="pubmed", term=query, retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the HTTP response, even if parsing fails.
        handle.close()
    return int(record["Count"])

def pubmed_recent_keywords(author_name):
    """List the topic keywords found in an author's recent PubMed abstracts.

    Searches PubMed for up to 10 records by ``author_name`` published from 2019
    onwards, fetches them as plain-text abstracts in a single request, and
    reports which topic labels occur in that text.

    Args:
        author_name: Name placed in the ``[Author]`` field of the query.
            A blank or non-string value (e.g. NaN from an empty CSV cell)
            yields [] without contacting PubMed.

    Returns:
        A list containing any of the labels "alzheimers", "dementia", "vr",
        "eye tracking" and "hci" that were found (each at most once).
    """
    if not isinstance(author_name, str) or not author_name.strip():
        return []

    # Label -> pattern. "alzheimer" also covers "alzheimer's"; the short acronyms
    # need word boundaries so they do not match inside other words; "eye tracking"
    # tolerates a hyphen or a line wrap between the two words.
    patterns = {
        "alzheimers": r"alzheimer",
        "dementia": r"dementia",
        "vr": r"\bvr\b",
        "eye tracking": r"eye[\s-]+tracking",
        "hci": r"\bhci\b",
    }

    query = f'{author_name}[Author] AND ("2019"[PDAT] : "3000"[PDAT])'
    handle = Entrez.esearch(db="pubmed", term=query, retmax=10)
    try:
        ids = Entrez.read(handle)["IdList"]
    finally:
        handle.close()

    if not ids:
        return []

    # One batched request for all IDs instead of one request per record.
    fetch = Entrez.efetch(db="pubmed", id=",".join(ids), rettype="abstract", retmode="text")
    try:
        text = fetch.read().lower()
    finally:
        fetch.close()

    return [label for label, pattern in patterns.items() if re.search(pattern, text)]

In [None]:
import pandas as pd
import json
import random

# Input: the cleaned CSV; output: the same rows plus the scraped signal columns.
# Note that INPUT_PATH, OUTPUT_PATH and df are rebound here for this stage.
INPUT_PATH = "/content/drive/MyDrive/cleaned_neurologists2.csv"
OUTPUT_PATH = "/content/drive/MyDrive/scraped_info.csv"

df = pd.read_csv(INPUT_PATH)

def scrape_signals(name, institution):
    """Collect web-search and PubMed relevance signals for one person.

    Args:
        name: The person's name; used in the web query and as the PubMed author.
        institution: The person's organisation; used in the web query.
            Missing values (None, NaN, "" or the text "nan") are left out of
            the queries instead of being searched for literally.

    Returns:
        A dict with 0/1 flags derived from the search snippets
        ("mentions_alzheimers", "mentions_dementia", "cognitive_decline",
        "eye_tracking", "vr_ar", "neuro_ophthalmology"), the PubMed paper count
        under "alzheimers_papers", and the keyword list under "recent_keywords".
    """
    def _as_text(value):
        # pandas reads empty CSV cells back as NaN (a float); formatting that
        # into a query would search for the literal word "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    name = _as_text(name)
    institution = _as_text(institution)

    if name or institution:
        query = " ".join(
            part for part in (name, institution, "neurologist research") if part
        )
        google_text = google_search(query)
    else:
        # Nothing identifies the person, so a web search would be meaningless.
        google_text = ""

    if name:
        alz_papers = pubmed_alzheimers_count(name)
        recent_keywords = pubmed_recent_keywords(name)
    else:
        # No author to look up.
        alz_papers = 0
        recent_keywords = []

    return {
        "mentions_alzheimers": keyword_signal(google_text, ["alzheimer"]),
        "mentions_dementia": keyword_signal(google_text, ["dementia"]),
        "cognitive_decline": keyword_signal(google_text, ["cognitive decline"]),
        "eye_tracking": keyword_signal(google_text, ["eye tracking"]),
        "vr_ar": keyword_signal(google_text, ["virtual reality", "vr", "augmented reality"]),
        "neuro_ophthalmology": keyword_signal(google_text, ["neuro-ophthalmology"]),
        "alzheimers_papers": alz_papers,
        "recent_keywords": recent_keywords
    }


# Enrich every row with its scraped signals; on a key clash the signal value wins.
scraped_rows = []

for _, row in df.iterrows():
    signals = scrape_signals(row["First name"], row["Company name"])
    merged = row.to_dict()
    merged.update(signals)
    scraped_rows.append(merged)

# Build the frame once from the collected records, then persist it without the index.
scraped_df = pd.DataFrame(scraped_rows)
scraped_df.to_csv(OUTPUT_PATH, index=False)

print("Scraped enrichment data saved to scraped_info.csv")


Scraped enrichment data saved to scraped_info.csv


In [None]:
import pandas as pd

# Input: the enriched CSV; output: the highest-scoring rows.
# Note that INPUT_PATH and df are rebound here for this stage.
INPUT_PATH = "/content/drive/MyDrive/scraped_info.csv"
TOP_100_PATH = "/content/drive/MyDrive/top_100_neurologists.csv"

df = pd.read_csv(INPUT_PATH)

def _count_recent_keywords(value):
    """Count the distinct, non-empty keywords in a ``recent_keywords`` cell.

    Accepts a real list/tuple/set, or the text form a list takes after a CSV
    round trip (e.g. "['dementia', 'vr']"). Anything else (NaN, numbers) counts as 0.
    """
    import ast  # local import keeps this block self-contained

    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            # Not a Python literal; fall back to a plain comma split.
            value = value.strip("[]").split(",")
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple, set)):
        return 0

    cleaned = {str(item).strip().strip("'\"") for item in value}
    cleaned.discard("")  # an empty list must not count as one keyword
    return len(cleaned)


def calculate_score(row):
    """Compute the relevance score of one enriched record.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) with the flag
            columns "mentions_alzheimers", "mentions_dementia", "eye_tracking",
            "vr_ar" and "neuro_ophthalmology", plus "specialties",
            "alzheimers_papers" and "recent_keywords".

    Returns:
        The integer score: the sum of the weights of all truthy flags, +40 when
        "cognitive_neurology" is among the specialties, +50 for 10 or more
        Alzheimer papers (+30 for 5 to 9), and +40 when at least three distinct
        recent keywords were found.
    """
    flag_weights = (
        ("mentions_alzheimers", 50),
        ("mentions_dementia", 30),
        ("eye_tracking", 25),
        ("vr_ar", 20),
        ("neuro_ophthalmology", 20),
    )
    score = sum(weight for column, weight in flag_weights if row[column])

    # str() guards against a missing specialties cell, which is read back as NaN.
    if "cognitive_neurology" in str(row["specialties"]):
        score += 40

    papers = row["alzheimers_papers"]
    if papers >= 10:
        score += 50
    elif papers >= 5:
        score += 30

    if _count_recent_keywords(row["recent_keywords"]) >= 3:
        score += 40

    return score

# Score every row (row-wise apply), then rank from highest to lowest score.
df["relevance_score"] = df.apply(calculate_score, axis=1)

ranked = df.sort_values("relevance_score", ascending=False)

# Keep the first 100 rows of the ranking and persist them without the index.
top_100 = ranked.iloc[:100]
top_100.to_csv(TOP_100_PATH, index=False)

print("Top 100 neurologists saved to top_100_neurologists.csv")


Top 100 neurologists saved to top_100_neurologists.csv
