In [1]:
import requests
from bs4 import BeautifulSoup

In [8]:
# Step 1: Define the URL of the faculty directory
url = "https://www.cc.gatech.edu/people/faculty"

# Step 2: Send an HTTP GET request to fetch the page content
response = requests.get(url)

# Step 3: Load the response into BeautifulSoup for parsing
soup = BeautifulSoup(response.text, 'html.parser')

# Step 4: Print the HTML title to make sure we loaded the page correctly
print(soup.title.text)


Faculty | College of Computing


In [30]:
faculty_data = []

faculty_rows = soup.find_all("div", class_="views-row")

for row in faculty_rows:
    # Step 1: Inside the row, find both column divs
    columns = row.find_all("div", class_="views-col")

    for col in columns:
        # Step 2: In each column, find the name/link <a> tag
        name_tag = col.find("h4", class_="card-block__title")
        if name_tag and name_tag.a:
            name = name_tag.a.text.strip()
            relative_url = name_tag.a['href']
            full_url = "https://www.cc.gatech.edu" + relative_url
            faculty_data.append({"name": name, "url": full_url})

  # Convert to pandas DataFrame
df = pd.DataFrame(faculty_data)
df

Unnamed: 0,name,url
0,Abrahim Ladha,https://www.cc.gatech.edu/people/abrahim-ladha
1,Jacob Abernethy,https://www.cc.gatech.edu/people/jacob-abernethy
2,Gregory Abowd,https://www.cc.gatech.edu/people/gregory-abowd-0
3,Alexander T Adams,https://www.cc.gatech.edu/people/alexander-t-a...
4,Keith Adkins,https://www.cc.gatech.edu/people/keith-adkins
5,Mustaque Ahamad,https://www.cc.gatech.edu/people/mustaque-ahamad
6,Srinivas Aluru,https://www.cc.gatech.edu/people/srinivas-aluru
7,Mostafa Ammar,https://www.cc.gatech.edu/people/mostafa-ammar
8,Clio Andris,https://www.cc.gatech.edu/people/clio-andris
9,Annie Antón,https://www.cc.gatech.edu/people/annie-anton


In [28]:
import pandas as pd  # Only needs to be run once

# Initialize an empty list to store the faculty info
faculty_data = []

# Loop through all faculty entries
for div in faculty_rows:
    link_tag = div.find("a")

    if link_tag:
        name = link_tag.text.strip()
        relative_url = link_tag['href']
        full_url = "https://www.cc.gatech.edu" + relative_url

        faculty_data.append({
            "name": name,
            "page_url": full_url
        })

# Convert to pandas DataFrame
df = pd.DataFrame(faculty_data)

# Show the first few rows to verify
df

Unnamed: 0,name,page_url
0,Abrahim Ladha,https://www.cc.gatech.edu/people/abrahim-ladha
1,Gregory Abowd,https://www.cc.gatech.edu/people/gregory-abowd-0
2,Keith Adkins,https://www.cc.gatech.edu/people/keith-adkins
3,Srinivas Aluru,https://www.cc.gatech.edu/people/srinivas-aluru
4,Clio Andris,https://www.cc.gatech.edu/people/clio-andris
5,Alberto Apostolico (1948-2015),https://www.cc.gatech.edu/people/alberto-apost...


In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL and page range
base_url = "https://www.cc.gatech.edu/people/faculty?page="
all_faculty = []

# Loop through all 24 pages (0 through 23)
for page_num in range(24):
    print(f"Fetching page {page_num}...")
    res = requests.get(base_url + str(page_num))
    soup = BeautifulSoup(res.text, 'html.parser')

    # Each faculty row has two columns
    rows = soup.find_all("div", class_="views-row")
    for row in rows:
        columns = row.find_all("div", class_="views-col")
        for col in columns:
            name_tag = col.find("h4", class_="card-block__title")
            if name_tag and name_tag.a:
                name = name_tag.a.text.strip()
                relative_url = name_tag.a['href']
                full_url = "https://www.cc.gatech.edu" + relative_url
                all_faculty.append({"name": name, "url": full_url})

# Store in a DataFrame
faculty_df = pd.DataFrame(all_faculty)

# Show number of people and first few rows
print(f"✅ Total faculty collected: {len(faculty_df)}")
faculty_df.head()


Fetching page 0...
Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
✅ Total faculty collected: 280


Unnamed: 0,name,url
0,Abrahim Ladha,https://www.cc.gatech.edu/people/abrahim-ladha
1,Jacob Abernethy,https://www.cc.gatech.edu/people/jacob-abernethy
2,Gregory Abowd,https://www.cc.gatech.edu/people/gregory-abowd-0
3,Alexander T Adams,https://www.cc.gatech.edu/people/alexander-t-a...
4,Keith Adkins,https://www.cc.gatech.edu/people/keith-adkins


In [35]:
import re
import time

def extract_email_from_url(url):
    try:
        res = requests.get(url, timeout=5)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Look for mailto: links
        for a in soup.find_all('a', href=True):
            if a['href'].startswith('mailto:'):
                email = a['href'].replace('mailto:', '').strip()
                if not any(bad in email for bad in ["web@", "info@", "contact@", "communications@"]):
                    return email

        return None

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Only try the first 5 as a preview (increase later)
faculty_df["email"] = None

for idx, row in faculty_df.iterrows():
    email = extract_email_from_url(row["url"])
    faculty_df.at[idx, "email"] = email

    if idx % 20 == 0:
        print(f"[{idx}/{len(faculty_df)}] Processed: {row['name']} → {email}")
        time.sleep(1)  # optional: be polite to the server

print("✅ Finished email extraction.")
faculty_df.head(10)


[0/280] Processed: Abrahim Ladha → None
[20/280] Processed: Dhruv Batra → dbatra@gatech.edu
[40/280] Processed: Zongchen Chen → chenzongchen@gatech.edu
[60/280] Processed: Martin Davis, Jr. → mdavis@cc.gatech.edu
[80/280] Processed: Robin Fievet → None
[100/280] Processed: Matthew Gombolay → matthew.gombolay@cc.gatech.edu
[120/280] Processed: Ayanna Howard → None
[140/280] Processed: Vladimir Kolesnikov → kolesnikov@gatech.edu
[160/280] Processed: Weiching Ma → None
[180/280] Processed: Elizabeth Mynatt → mynatt@cc.gatech.edu
[200/280] Processed: Will Perkins → wperkins3@gatech.edu
[220/280] Processed: Jessica Roberts → jessica.roberts@cc.gatech.edu
[240/280] Processed: Mani Subramanian → None
[260/280] Processed: Marilyn Wolf → None
✅ Finished email extraction.


Unnamed: 0,name,url,email
0,Abrahim Ladha,https://www.cc.gatech.edu/people/abrahim-ladha,
1,Jacob Abernethy,https://www.cc.gatech.edu/people/jacob-abernethy,prof@gatech.edu
2,Gregory Abowd,https://www.cc.gatech.edu/people/gregory-abowd-0,
3,Alexander T Adams,https://www.cc.gatech.edu/people/alexander-t-a...,aadams322@gatech.edu
4,Keith Adkins,https://www.cc.gatech.edu/people/keith-adkins,keith.adkins@gatech.edu
5,Mustaque Ahamad,https://www.cc.gatech.edu/people/mustaque-ahamad,mustaque.ahamad@cc.gatech.edu
6,Srinivas Aluru,https://www.cc.gatech.edu/people/srinivas-aluru,aluru@cc.gatech.edu
7,Mostafa Ammar,https://www.cc.gatech.edu/people/mostafa-ammar,ammar@cc.gatech.edu
8,Clio Andris,https://www.cc.gatech.edu/people/clio-andris,clio@gatech.edu
9,Annie Antón,https://www.cc.gatech.edu/people/annie-anton,aa16@gatech.edu


In [36]:
num_with_emails = faculty_df["email"].notna().sum()
print(f"✅ Found {num_with_emails} faculty with emails out of {len(faculty_df)} total.")

✅ Found 199 faculty with emails out of 280 total.


In [37]:
faculty_df.to_csv("gt_faculty_emails.csv", index=False)
print("✅ Exported to gt_faculty_emails.csv")

✅ Exported to gt_faculty_emails.csv


In [55]:
# Define research keywords
keywords = [
    # General ML / DS
    "machine learning", "data science", "unsupervised learning",
    "supervised learning", "classification", "regression",
    "clustering", "dimensionality reduction", "feature extraction",
    "similarity analysis", "embedding space",

    # NLP
    "natural language processing", "nlp", "sentence embeddings",
    "text mining", "topic modeling", "language models",
    "transformer model", " bert ", "huggingface",

    # Tools / Libraries
    "hdbscan", "umap", "scikit-learn", "pytorch", "tensorflow",
    "sentence-transformers", "streamlit", "python", "sklearn"
]

def matched_keywords_in_profile(url, keywords):
    try:
        res = requests.get(url, timeout=5)
        text = res.text.lower()

        matched = [kw for kw in keywords if kw.lower() in text]
        return matched  # could be an empty list

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

faculty_df["matched_keywords"] = None  # Initialize column

for idx, row in faculty_df.iterrows():
    if pd.notna(row["email"]):
        matches = matched_keywords_in_profile(row["url"], keywords)
        faculty_df.at[idx, "matched_keywords"] = matches

        if idx % 10 == 0:
            print(f"[{idx}] {row['name']} → {len(matches)} keyword(s) matched: {matches}")



[20] Dhruv Batra → 3 keyword(s) matched: ['machine learning', 'natural language processing', 'nlp']
[40] Zongchen Chen → 0 keyword(s) matched: []
[60] Martin Davis, Jr. → 0 keyword(s) matched: []
[70] Constantine Dovrolis → 1 keyword(s) matched: ['machine learning']
[90] Merrick Furst → 0 keyword(s) matched: []
[100] Matthew Gombolay → 0 keyword(s) matched: []
[110] Aaron Hansen → 0 keyword(s) matched: []
[130] Joseph Jaeger → 0 keyword(s) matched: []
[140] Vladimir Kolesnikov → 0 keyword(s) matched: []
[170] Kuldeep S. Meel → 0 keyword(s) matched: []
[180] Elizabeth Mynatt → 0 keyword(s) matched: []
[190] Devi Parikh → 1 keyword(s) matched: ['natural language processing']
[200] Will Perkins → 0 keyword(s) matched: []
[210] Milos Prvulovic → 0 keyword(s) matched: []
[220] Jessica Roberts → 0 keyword(s) matched: []
[230] Ryan Shandler → 0 keyword(s) matched: []
[250] Jan van den Brand → 0 keyword(s) matched: []


In [60]:
faculty_df["relevant"] = faculty_df["matched_keywords"].apply(
    lambda x: len(x) > 0 if isinstance(x, list) else False
)


In [64]:

# Filter to show only relevant researchers
relevant_df = faculty_df[faculty_df["relevant"] == True]
print(f"✅ {len(relevant_df)} faculty matched your research interests.")

# Export to CSV
relevant_df.to_csv("gt_relevant_faculty.csv", index=False)
print("✅ Exported to gt_relevant_faculty.csv")


✅ 54 faculty matched your research interests.
✅ Exported to gt_relevant_faculty.csv


In [None]:
import smtplib
from email.message import EmailMessage
import ssl
import time

# Load your filtered DataFrame
df = relevant_df  # Assumes DataFrame is already filtered

# Your email credentials (use environment variables or config file in practice)
sender_email = "your_email@example.com"
app_password = "your_app_password_here"  # Use an app password or OAuth2 token
subject = "Research Opportunity Inquiry (Fall 2025)"

# Load resume file
with open("YourResume.pdf", "rb") as f:
    resume_bytes = f.read()

# Loop over recipients
for idx, row in df.iterrows():
    recipient = row["email"]
    first_name = row["name"].split()[0]
    last_name = row["name"].split()[-1]
    keywords = ", ".join(row["matched_keywords"])

    # Customize message
    body = f"""Dear Professor {last_name},

I'm a CS student at Georgia Tech currently exploring research opportunities for Fall 2025. I came across your work involving {keywords} and found it highly aligned with my interests and experience.

I've attached my resume for your consideration, and I’d love the opportunity to contribute to your lab. Thank you for your time!

Best regards,
[Your Name]
"""

    # Create email
    msg = EmailMessage()
    msg["From"] = sender_email
    msg["To"] = recipient
    msg["Subject"] = subject
    msg.set_content(body)

    # Attach resume
    msg.add_attachment(resume_bytes, maintype='application', subtype='pdf', filename="YourResume.pdf")

    # Send email via Gmail
    context = ssl.create_default_context()
    with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as server:
        server.login(sender_email, app_password)
        server.send_message(msg)

    print(f"✅ Sent to {recipient}")
    time.sleep(2)  # Delay between sends
