In [1]:
pip install pandas requests beautifulsoup4 openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    """Clean text by removing extra whitespace and newlines"""
    return ' '.join(text.strip().split())

def extract_application_link(cell):
    """Extract application link from a cell that might contain multiple links"""
    links = cell.find_all('a')
    for link in links:
        href = link.get('href', '')
        if not href.startswith('#') and not href.startswith('https://github.com'):
            return href
    return None

def scrape_simplify_jobs_table(url="https://github.com/SimplifyJobs/Summer2025-Internships"):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:

        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('h2', string=re.compile(r'the.list', re.IGNORECASE))
        if table:
            table = table.find_next('table')
        if not table:
            raise ValueError("Could not find the internship table")
            
        companies = []
        roles = []
        locations = []
        app_links = []
        date_posted = []

        for row in table.find_all('tr')[1:]:  
            cols = row.find_all('td')
            if len(cols) >= 5:  
                companies.append(clean_text(cols[0].text))
                roles.append(clean_text(cols[1].text))
                locations.append(clean_text(cols[2].text))
                app_links.append(extract_application_link(cols[3]))
                date_posted.append(clean_text(cols[4].text))


        df = pd.DataFrame({
            'Company': companies,
            'Role': roles,
            'Location': locations,
            'Application Link': app_links,
            'Date Posted': date_posted
        })

        df = df.dropna(subset=['Application Link'])
        excel_filename = 'summer2025_internships.xlsx'
        df.to_excel(excel_filename, index=False)
        print(f"\nSuccessfully saved {len(df)} internship listings to {excel_filename}")
        
        return df
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the GitHub page: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

def main():
    print("Scraping Summer 2025 Internships list...")
    df = scrape_simplify_jobs_table()
    
    if df is not None:
        print("\nFirst few internship listings:")
        print(df.head())
        print(f"\nTotal number of internships with valid application links: {len(df)}")

if __name__ == "__main__":
    main()

Scraping Summer 2025 Internships list...

Successfully saved 466 internship listings to summer2025_internships.xlsx

First few internship listings:
        Company                                               Role  \
0      Workleap  Software Developer Intern - Ruby on Rails and ...   
1         Waymo  2025 Summer Intern - BS/MS - ML Infra - Full-S...   
2     Snowflake   Software Engineer Intern - Toronto - Summer 2025   
3     Snap Inc.             Software Engineer Intern - Summer 2025   
4  Lucid Motors                  Intern, IT Strategy , Summer 2025   

                                   Location  \
0                          Remote in Canada   
1                         Mountain View, CA   
2                       Toronto, ON, Canada   
3  4 locationsPalo Alto, CASeattle, WALANYC   
4                                Newark, CA   

                                    Application Link Date Posted  
0  https://job-boards.greenhouse.io/workleap/jobs...      Oct 17  
1  https://car