<a href="https://colab.research.google.com/gist/adzuci/70b705482026a2f52be7b9de3e1a63bd/opt_internship_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Internship & Early-Career Job Scraper (jobspy + Python)

This notebook scrapes internship / early-career job postings using **[jobspy](https://github.com/jobspy/jobspy)**
and lets you filter them using:

- A **visa/OPT-friendly heuristic filter** (optional toggle)
- Generic **include / exclude keyword filters**
- Simple scoring so you can sort by how well each posting matches your criteria

### Credits

- **Idea & inspiration:** Vinay Varshigan  
- **Debugging & troubleshooting support:** Anja Lee  

This notebook is intended to be useful for software engineers and students searching for their dream job.

### What this notebook does

1. Scrapes recent job postings from multiple sites (via `jobspy`).  
2. Normalizes them into a single `pandas` DataFrame.  
3. Filters for **internship-like** roles ("intern", "co-op").  
4. Optionally applies **visa/OPT-friendly filters** using keyword heuristics.  
5. Optionally applies generic **include / exclude keyword filters**.  
6. Computes a simple **match score**, removes duplicate postings, and exports the results to CSV.

### How to use it

1. Run the `pip install` cell to install dependencies.  
2. Edit the **Search configuration** cell:
   - `SEARCH_TERMS`, `SITES`, `LOCATION`, `HOURS_OLD`, etc.
   - Set `USE_VISA_FILTERS = True` if you care about OPT/visa heuristics, or `False` if you don't.  
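   - Optionally set `REQUIRED_KEYWORDS` (a posting is kept if it contains **at least one** of them) and `BLOCKED_KEYWORDS` (a posting is dropped if it contains **any** of them) for your own filters.  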
3. Run all cells top-to-bottom.  
4. Open the exported CSV (see the path printed near the end) in your spreadsheet tool of choice.

> ⚠️ **Disclaimer**  
> - This notebook does **not** guarantee OPT eligibility or visa sponsorship; the filters are keyword heuristics only.  
> - Always verify details on the company's careers page and with recruiters.  
> - Scraping may be subject to each site's Terms of Service; use responsibly.


In [None]:
!pip install -q python-jobspy pandas

In [None]:
import pandas as pd
from jobspy import scrape_jobs

def contains_any(text: str, keywords: list[str]) -> bool:
    """Return True if any keyword appears in the given text (case-insensitive).

    Matching is plain substring matching, so a short keyword such as "opt"
    also matches inside longer words such as "option".

    Args:
        text: Text to search. ``None`` and other non-string values (for
            example a ``NaN`` coming from a missing DataFrame cell) are
            treated as "no match" instead of raising.
        keywords: Keywords or phrases to look for, in any letter case.
            Empty strings are ignored because they would match every text.

    Returns:
        True if at least one non-empty keyword occurs in ``text``.
    """
    if not isinstance(text, str):
        return False
    haystack = text.lower()
    # Lower-case the keywords as well; otherwise a keyword such as "Python"
    # could never match the already lower-cased text.
    return any(k.lower() in haystack for k in keywords if k)

def score_text(text: str, good_keywords: list[str], bad_keywords: list[str]) -> int:
    """Compute a simple matching score: (# good hits) - (# bad hits).

    Every non-overlapping occurrence of a keyword counts as one hit, and
    matching is case-insensitive substring matching.

    Args:
        text: Text to score. ``None`` and other non-string values (for
            example a ``NaN`` from a missing DataFrame cell) score 0.
        good_keywords: Keywords whose occurrences each add 1 to the score.
        bad_keywords: Keywords whose occurrences each subtract 1.
            In both lists empty strings are ignored, because
            ``str.count("")`` would otherwise count ``len(text) + 1`` hits.

    Returns:
        Number of good-keyword occurrences minus bad-keyword occurrences.
    """
    if not isinstance(text, str):
        return 0
    haystack = text.lower()
    # Keywords are lower-cased too so that mixed-case entries still match.
    good_hits = sum(haystack.count(k.lower()) for k in good_keywords if k)
    bad_hits = sum(haystack.count(k.lower()) for k in bad_keywords if k)
    return good_hits - bad_hits

In [None]:
# === Search configuration ===
# All user-tunable settings live in this cell; the cells below only read them.

# Each term is scraped once for every site listed in SITES.
SEARCH_TERMS = [
    "computer science intern",
    "software engineer intern",
    "software developer intern",
    "data science intern",
]

# jobspy-supported sites; you can add/remove depending on what works for you
SITES = [
    "indeed",
    "linkedin",
    # "zip_recruiter",
    # "glassdoor",
]

# These three values are passed to scrape_jobs() as location, results_wanted
# and hours_old respectively.
LOCATION = "United States"   # e.g. "United States", "Remote", "Boston, MA"
RESULTS_PER_SITE = 150       # number of results per site per search term
HOURS_OLD = 168              # limit to last 7 days (168 hours)

# === Visa / OPT-friendly keyword heuristics ===
# Set USE_VISA_FILTERS = False if you don't care about OPT/visa heuristics

USE_VISA_FILTERS = True

# With USE_VISA_FILTERS enabled, a posting is kept only if its text contains
# at least one GOOD_KEYWORDS entry and no BAD_KEYWORDS entry. Each occurrence
# of a good keyword also adds 1 to the posting's match_score.
# Keep entries lower-case: they are compared against lower-cased posting text.
# Matching is by substring, so short entries such as "opt" also match inside
# other words (e.g. "option"), and "stem opt" is counted again by "opt".
GOOD_KEYWORDS = [
    "opt",
    "cpt",
    "stem opt",
    "f1",
    "f-1",
    "visa sponsorship",
    "sponsorship available",
    "sponsor visas",
    "h-1b",
    "h1b",
    "international students",
]

# Any occurrence of one of these phrases removes the posting while
# USE_VISA_FILTERS is enabled.
BAD_KEYWORDS = [
    "us citizens only",
    "u.s. citizens only",
    "must be a us citizen",
    "citizen only",
    "no sponsorship",
    "cannot sponsor",
    "unable to sponsor",
    "not provide sponsorship",
    "gc or citizen only",
    "green card or citizen only",
]

# === Generic filters (always available) ===
# Use these for non-visa-specific use cases. Leave empty ([]) to ignore.
# REQUIRED_KEYWORDS keeps a posting if it contains at least one entry;
# BLOCKED_KEYWORDS drops a posting if it contains any entry.
# When USE_VISA_FILTERS is False these two lists also drive match_score.

REQUIRED_KEYWORDS: list[str] = []  # e.g. ["machine learning", "python"]
BLOCKED_KEYWORDS: list[str] = []   # e.g. ["unpaid", "commission only"]

print("Configuration loaded.")

In [None]:
# Scrape every (site, search term) combination and collect the non-empty
# result frames in all_jobs.
all_jobs = []

for site in SITES:
    for term in SEARCH_TERMS:
        print(f"Scraping {site} for '{term}' in {LOCATION} (last {HOURS_OLD} hours)...")
        try:
            jobs_df = scrape_jobs(
                site_name=site,
                search_term=term,
                location=LOCATION,
                results_wanted=RESULTS_PER_SITE,
                hours_old=HOURS_OLD,
                country_indeed="USA",  # relevant for Indeed
            )
        except Exception as e:
            # Deliberately best-effort: one failing site/term combination
            # should not abort the remaining scrapes.
            print(f"  !! Error scraping {site} for '{term}': {e}")
            continue

        result_count = 0 if jobs_df is None else len(jobs_df)
        print(f"  -> Retrieved {result_count} results.")
        if result_count == 0:
            # Skip empty results so the "no jobs" guard below really fires
            # when nothing was found, and pd.concat only sees frames with rows.
            continue

        # Record where each row came from for later filtering and deduplication.
        jobs_df["site"] = site
        jobs_df["search_term"] = term
        all_jobs.append(jobs_df)

if not all_jobs:
    raise RuntimeError("No jobs retrieved. Try changing sites, search terms, or HOURS_OLD.")

raw_df = pd.concat(all_jobs, ignore_index=True)
print(f"\nTotal raw jobs collected: {len(raw_df)}")

raw_df.head()

In [None]:
df = raw_df.copy()

# Create a combined, lower-cased text field to search for keywords:
# the title plus the description and snippet columns when they exist.
description_col = "description" if "description" in df.columns else None
snippet_col = "snippet" if "snippet" in df.columns else None

search_text = df["title"].fillna("").astype(str)
for extra_col in (description_col, snippet_col):
    if extra_col:
        search_text = search_text + " " + df[extra_col].fillna("").astype(str)

df["search_text"] = search_text.str.lower()

# Filter for internships explicitly: the title must contain "intern",
# "interns", "internship(s)", "co-op" or "co op" as a word of its own.
# The letter look-arounds stop "intern" from matching inside unrelated words
# such as "Internal" or "International".
INTERN_TITLE_PATTERN = r"(?<![a-z])(?:intern(?:ship)?s?|co[- ]op)(?![a-z])"
intern_mask = (
    df["title"]
    .fillna("")
    .astype(str)
    .str.contains(INTERN_TITLE_PATTERN, case=False, regex=True)
)

df_interns = df[intern_mask].copy()
print(f"Internship-like roles: {len(df_interns)}")

# Only preview columns that are present, matching the guards in later cells.
intern_preview_cols = [
    c for c in ["title", "company", "location", "site"] if c in df_interns.columns
]
df_interns[intern_preview_cols].head(10)

In [None]:
# Apply visa/OPT heuristics (optional) and generic include/exclude filters


def _keyword_mask(keywords):
    """Return a boolean Series: does each row's search_text contain any keyword?"""
    hits = df_interns["search_text"].apply(lambda t: contains_any(t, keywords))
    # Force a bool dtype so that `~` and `&` below always operate on a real
    # boolean mask, even when df_interns has no rows.
    return hits.astype(bool)


mask = pd.Series(True, index=df_interns.index)

# Visa / OPT-friendly filter (toggle): at least one good keyword, no bad one
if USE_VISA_FILTERS:
    visa_good_mask = _keyword_mask(GOOD_KEYWORDS)
    visa_bad_mask = _keyword_mask(BAD_KEYWORDS)
    mask &= visa_good_mask & ~visa_bad_mask

# Generic required keywords: keep rows containing at least one of them
if REQUIRED_KEYWORDS:
    required_mask = _keyword_mask(REQUIRED_KEYWORDS)
    mask &= required_mask

# Generic blocked keywords: drop rows containing any of them
if BLOCKED_KEYWORDS:
    blocked_mask = _keyword_mask(BLOCKED_KEYWORDS)
    mask &= ~blocked_mask

df_opt = df_interns[mask].copy()
print(f"Roles after filtering: {len(df_opt)}")

# Add a 'match_score' column for simple ranking. The visa keyword lists drive
# the score when the visa filter is on; otherwise the generic lists do.
if USE_VISA_FILTERS:
    good_for_score = GOOD_KEYWORDS
    bad_for_score = BAD_KEYWORDS
else:
    good_for_score = REQUIRED_KEYWORDS
    bad_for_score = BLOCKED_KEYWORDS

if good_for_score or bad_for_score:
    df_opt["match_score"] = df_opt["search_text"].apply(
        lambda t: score_text(t, good_for_score, bad_for_score)
    )
else:
    # If no keyword lists are provided, default match_score to 0
    df_opt["match_score"] = 0

# Select useful columns if they exist. "job_url" and "job_url_direct" are kept
# so the preview cell and the CSV export still carry a link to each posting.
wanted_columns = [
    "title",
    "company",
    "location",
    "site",
    "search_term",
    "url",
    "job_url",
    "job_url_direct",
    "description",
    "snippet",
    "match_score",
]
columns_to_keep = [col for col in wanted_columns if col in df_opt.columns]

df_opt = df_opt[columns_to_keep]

# Sort by match_score descending (higher = more matches)
df_opt = df_opt.sort_values("match_score", ascending=False)

df_opt.head(20)

In [None]:
# Collapse repeated postings: rows that agree on title, company, location and
# site (whichever of those columns are present) count as the same role, and
# only the first such row is kept.
candidate_keys = ("title", "company", "location", "site")
dedupe_keys = [key for key in candidate_keys if key in df_opt.columns]

deduplicated = df_opt.drop_duplicates(subset=dedupe_keys, keep="first")
df_opt_unique = deduplicated.reset_index(drop=True)

print(f"After deduplication: {len(df_opt_unique)} roles\n")

# Preview the top rows, limited to the columns that actually exist
preview_cols = []
for wanted in ["title", "company", "location", "site", "job_url_direct", "match_score"]:
    if wanted in df_opt_unique.columns:
        preview_cols.append(wanted)
df_opt_unique[preview_cols].head(20)


In [None]:
# Export the deduplicated roles to a CSV file (written without the index
# column) and report where it was saved.
output_path = "filtered_internships.csv"
df_opt_unique.to_csv(path_or_buf=output_path, index=False)

print(f"Saved {len(df_opt_unique)} roles to: {output_path}")
print(f"Visa/OPT filters {'ENABLED' if USE_VISA_FILTERS else 'DISABLED'}.")

In [None]:
# Final report: row counts at each pipeline stage, the filter mode that was
# used, and a short sample of the results.
print("=== Summary ===")
print(f"Total raw jobs scraped: {len(raw_df)}")
print(f"Internship-like roles: {len(df_interns)}")
print(f"Roles after filtering (before dedupe): {len(df_opt)}")
print(f"Unique roles after dedupe: {len(df_opt_unique)}")

if not USE_VISA_FILTERS:
    print("\nVisa/OPT filters were DISABLED (generic filtering only).")
else:
    print("\nVisa/OPT filters were ENABLED.")

print("\nSample of results:")
# Only show the sample columns that are present in the final frame
sample_cols = []
for wanted in ["title", "company", "location", "site", "match_score"]:
    if wanted in df_opt_unique.columns:
        sample_cols.append(wanted)
df_opt_unique[sample_cols].head(10)