
Extract and parse all internship role categories and their corresponding data from the README.md file at https://github.com/SimplifyJobs/Summer2024-Internships/blob/dev/README.md, save each category as a separate CSV file, then display all extracted data in a structured format.

In [2]:
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests # Import the requests library

# Raw (unrendered) URL of the README that holds the internship tables
github_raw_url = "https://raw.githubusercontent.com/SimplifyJobs/Summer2026-Internships/dev/README.md"

# Fetch the content of the README.md file.
# On any request failure the error is reported and readme_content falls back
# to an empty string so the rest of the script still runs (finding nothing).
try:
    # requests applies no timeout by default; without one a stalled
    # connection would block this cell indefinitely.
    response = requests.get(github_raw_url, timeout=30)
    response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
except requests.exceptions.RequestException as e:
    print(f"Error fetching README content: {e}")
    readme_content = ""  # Initialize with empty string on error
else:
    readme_content = response.text
    print(f"Successfully fetched README content from {github_raw_url}")

# Level 2 markdown headings: a line that starts with '##', one whitespace
# character, then the heading text (captured) up to the end of that line.
heading_pattern = re.compile(r'^##\s(.+)$', re.MULTILINE)

# Collect the captured heading text of every match, in document order
section_headings = [found.group(1) for found in heading_pattern.finditer(readme_content)]

# Report what was found, one heading per line
print("Identified Section Headings:")
for title in section_headings:
    print("- " + title.strip())

# Maps each level 2 heading to the DataFrame built from the first HTML table
# in its section, or to None when the section has no usable table.
all_sections_data = {}


def _table_to_dataframe(table):
    """Convert one parsed HTML ``<table>`` element into a DataFrame.

    Column names come from the ``<th>`` cells inside ``<thead>``; rows come
    from the ``<tr>`` elements inside ``<tbody>``. For the column whose header
    is 'Application' the cell value is the ``href`` of the first ``<a>`` in
    the cell (empty string when there is none); every other cell is its
    stripped text. Rows are truncated or padded with '' to the header width.

    Returns None when the table has no headers or no body rows.
    """
    thead = table.find('thead')
    headers = [th.get_text(strip=True) for th in thead.find_all('th')] if thead is not None else []
    tbody = table.find('tbody')
    if not headers or tbody is None:
        return None

    width = len(headers)
    data_rows = []
    for tr in tbody.find_all('tr'):
        row_values = []
        for idx, td in enumerate(tr.find_all('td')):
            if idx < width and headers[idx] == 'Application':
                # Keep the link target rather than the link's text/image
                link = td.find('a')
                row_values.append(link['href'] if link and 'href' in link.attrs else '')
            else:
                row_values.append(td.get_text(strip=True))

        # Normalize the row to exactly one value per header
        row_values = row_values[:width]
        row_values.extend([''] * (width - len(row_values)))
        data_rows.append(row_values)

    if not data_rows:
        return None
    return pd.DataFrame(data_rows, columns=headers)


# Use the regex match offsets as section boundaries. Searching the text again
# for a rebuilt "## <heading>" string can hit the wrong place: it also matches
# inside "### ..." headings, always returns the first occurrence of a repeated
# heading, and misses headings whose separator is not a single space.
heading_matches = list(heading_pattern.finditer(readme_content))

for i, heading_match in enumerate(heading_matches):
    current_heading = heading_match.group(1)

    # A section runs from the end of its heading line to the start of the
    # next level 2 heading, or to the end of the document for the last one.
    if i + 1 < len(heading_matches):
        section_end = heading_matches[i + 1].start()
    else:
        section_end = len(readme_content)
    section_content = readme_content[heading_match.end():section_end].strip()

    # Use BeautifulSoup to parse the HTML tables embedded in the markdown
    soup = BeautifulSoup(section_content, 'lxml')
    section_dfs = []  # A section may contain more than one table
    for table in soup.find_all('table'):
        df = _table_to_dataframe(table)
        if df is not None:
            section_dfs.append(df)

    # Store the first DataFrame found in the section, if any
    if section_dfs:
        all_sections_data[current_heading] = section_dfs[0]
    else:
        print(f"No HTML table found for section: {current_heading}")
        all_sections_data[current_heading] = None

# Count only the sections that actually produced a DataFrame; sections stored
# as None are listed below as "No data extracted" and must not be counted.
extracted_count = sum(1 for df in all_sections_data.values() if df is not None)
print(f"Extracted data for {extracted_count} sections.")

# Show a short preview (first rows) of every section, in document order
for heading, df in all_sections_data.items():
    if df is None:
        print(f"\n--- {heading} (No data extracted) ---")
        continue
    print(f"\n--- {heading} ---")
    print(df.head())

# Define the directory to save the CSV files
output_directory = "extracted_internship_data"
os.makedirs(output_directory, exist_ok=True)

# Characters that must not appear in a file name. Spaces and both path
# separators become underscores; ':' and the other characters reserved in
# Windows file names are removed. Without this a heading containing e.g. '\\'
# or '?' would yield an invalid path or write outside output_directory.
filename_char_map = str.maketrans({
    ' ': '_',
    '/': '_',
    '\\': '_',
    ':': '',
    '*': '',
    '?': '',
    '"': '',
    '<': '',
    '>': '',
    '|': '',
})

# Save each DataFrame to a separate CSV file named after its heading
for heading, df in all_sections_data.items():
    if df is None:
        print(f"No data to save for '{heading}'")
        continue

    # Strip first so stray leading/trailing whitespace (such as a '\r' from
    # CRLF line endings) never ends up in the file name.
    safe_name = heading.strip().translate(filename_char_map)
    filename = os.path.join(output_directory, f"{safe_name}.csv")
    df.to_csv(filename, index=False)
    print(f"Saved data for '{heading}' to {filename}")

# Dump every extracted table in full, one section after another.
section_separator = "\n" + "-" * 80 + "\n"  # Printed after each section for readability

print("\n--- Displaying All Extracted Data ---")
for section_name, section_table in all_sections_data.items():
    if section_table is None:
        print(f"\n### {section_name} (No data to display)")
    else:
        print(f"\n### {section_name}")
        # Lift pandas' display limits while rendering the complete table
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(section_table.to_string())
    print(section_separator)

Successfully fetched README content from https://raw.githubusercontent.com/SimplifyJobs/Summer2026-Internships/dev/README.md
Identified Section Headings:
- 💻 Software Engineering Internship Roles
- 📱 Product Management Internship Roles
- 🤖 Data Science, AI & Machine Learning Internship Roles
- 📈 Quantitative Finance Internship Roles
- 🔧 Hardware Engineering Internship Roles
Extracted data for 5 sections.

--- 💻 Software Engineering Internship Roles ---
                                   Company  \
0                            Commerce Bank   
1  Dallas Fort Worth International Airport   
2                             CoStar Group   
3                                   Circle   
4                      Danaher Corporation   

                                                Role  \
0                                          Intern IT   
1  Undergraduate Intern - IT Applications Develop...   
2                                  Technology Intern   
3                       