In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [0]:
# St. Louis area universities to scrape, written as (display name, Wikipedia
# article URL) pairs and expanded into one {'name', 'url'} dict per school.
universities = [
    {'name': school_name, 'url': article_url}
    for school_name, article_url in [
        ('Washington University in St. Louis',
         'https://en.wikipedia.org/wiki/Washington_University_in_St._Louis'),
        ('Saint Louis University',
         'https://en.wikipedia.org/wiki/Saint_Louis_University'),
        ('University of Missouri–St. Louis',
         'https://en.wikipedia.org/wiki/University_of_Missouri%E2%80%93St._Louis'),
        ('Webster University',
         'https://en.wikipedia.org/wiki/Webster_University'),
        ('Maryville University',
         'https://en.wikipedia.org/wiki/Maryville_University'),
        ('Missouri Baptist University',
         'https://en.wikipedia.org/wiki/Missouri_Baptist_University'),
        ('Harris–Stowe State University',
         'https://en.wikipedia.org/wiki/Harris%E2%80%93Stowe_State_University'),
        ('Lindenwood University',
         'https://en.wikipedia.org/wiki/Lindenwood_University'),
    ]
]

# Headers sent with every page request. The browser-style User-Agent makes the
# request look like ordinary browser traffic rather than a script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

# Bucket that collects one result row (a dict) per university.
all_data = list()


In [0]:

def _empty_record(university):
    """Build the output row for `university` with every scraped field unset."""
    return {
        'University': university['name'],
        'City': None,
        'State': None,
        'Coordinates': None,
        'Student Enrollment': None,
        'Source URL': university['url']
    }


def _parse_enrollment(text):
    """Return the first number in `text` (digits with optional comma groups), or None.

    Bracketed citation markers such as "[3]" are removed first so a footnote
    number is never picked up as the enrollment figure. The number must start
    with a digit, so a stray comma can never be returned on its own.
    """
    without_citations = re.sub(r'\[.*?\]', '', text)
    match = re.search(r'\d+(?:,\d+)*', without_citations)
    return match.group(0) if match else None


def _parse_location(location_text):
    """Split infobox location text into a `(city, state, coordinates)` tuple.

    Each element is None when it cannot be found in `location_text`.
    """
    # Grab the decimal coordinates first; the cleaning below discards them.
    coord_match = re.search(r'\d+\.?\d*°\s*[NS]\s+\d+\.?\d*°\s*[EW]', location_text)
    coordinates = coord_match.group(0) if coord_match else None

    # Remove citations [1], [2], etc.
    cleaned = re.sub(r'\[.*?\]', '', location_text)
    # Remove everything from the first degree value onward.
    cleaned = re.sub(r'\d+\.?\d*°.*', '', cleaned)
    # Remove the "postal address" phrase.
    cleaned = re.sub(r'postal address', '', cleaned, flags=re.IGNORECASE)
    # Remove "US" / "U.S." / "USA" / "U.S.A." only as a standalone upper-case
    # token, so the letters "us" inside a word (e.g. "Columbus") are left alone.
    cleaned = re.sub(r',?\s*\bU\.?S\.?(?:A\.?)?(?![A-Za-z])', '', cleaned)
    cleaned = re.sub(r',?\s*United States', '', cleaned, flags=re.IGNORECASE)
    # Remove street addresses (a number followed by words ending in a street
    # suffix). The suffix must be a whole word so "St" does not match inside
    # "Eastern", "Dr" inside "Drake", and so on.
    cleaned = re.sub(
        r'\d+\s+[\w\s]+\b(?:Drive|Street|Road|Avenue|Lane|Boulevard|Way|Court|Dr|St|Rd|Ave|Ln|Blvd)\b\.?\s*,?\s*',
        '',
        cleaned,
        flags=re.IGNORECASE,
    )
    # Remove zip codes (5 digits or 5+4 format).
    cleaned = re.sub(r'\b\d{5}(?:-\d{4})?\b', '', cleaned)
    # Collapse whitespace and runs of commas, then trim the edges.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r',(?:\s*,)+', ',', cleaned)
    cleaned = cleaned.strip(' ,')

    # Expected format: "City, State" or just "City". Empty pieces become None.
    parts = [part.strip() for part in cleaned.split(',')]
    city = parts[0] or None
    state = (parts[1] or None) if len(parts) > 1 else None

    # Normalize the abbreviation so the State column holds a single spelling.
    if state == 'MO':
        state = 'Missouri'

    return city, state, coordinates


def _parse_infobox(infobox):
    """Extract city, state, coordinates and enrollment from an infobox table.

    `infobox` is the BeautifulSoup tag for the table, or None when the page has
    no infobox. Returns a dict keyed by the output column names; a field stays
    None when no matching row is found. When several rows match, the last one
    that yields a value wins.
    """
    fields = {
        'City': None,
        'State': None,
        'Coordinates': None,
        'Student Enrollment': None,
    }
    if infobox is None:
        return fields

    for row in infobox.find_all('tr'):
        header = row.find('th')
        value = row.find('td')
        if header is None or value is None:
            continue

        header_text = header.text.strip().lower()

        if 'student' in header_text:
            enrollment = _parse_enrollment(value.text.strip())
            if enrollment is not None:
                fields['Student Enrollment'] = enrollment

        if 'location' in header_text:
            city, state, coordinates = _parse_location(
                value.get_text(separator=' ', strip=True)
            )
            fields['City'] = city
            fields['State'] = state
            if coordinates is not None:
                fields['Coordinates'] = coordinates

    return fields


# Start from an empty list so re-running this cell does not append duplicate rows.
all_data = []

for university in universities:
    record = _empty_record(university)
    try:
        print(f"Scraping: {university['name']}...")

        response = requests.get(university['url'], headers=headers, timeout=30)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            record.update(_parse_infobox(soup.find('table', class_='infobox')))

            print(f"  ✓ City: {record['City']}")
            print(f"  ✓ State: {record['State']}")
            print(f"  ✓ Coordinates: {record['Coordinates']}")
            print(f"  ✓ Enrollment: {record['Student Enrollment']}\n")
        else:
            print(f"  ✗ Failed with status {response.status_code}\n")

    except Exception as e:
        # Best effort: report the problem and keep an all-empty row so every
        # university still appears exactly once in the results.
        print(f"  ✗ Error: {e}\n")
        record = _empty_record(university)

    all_data.append(record)

In [0]:
# Create DataFrame with all of the scraped data.
# A pandas DataFrame has no .display() method, so the cell ends with the frame
# itself and the notebook renders it (as the other cells here already do).
df_scraped = pd.DataFrame(all_data)
df_scraped

In [0]:
# Query settings for the weather archive API call made with the requests module:
# the point to look up, the date window, and the daily series and units to return.
base_url = "https://archive-api.open-meteo.com/v1/archive"
lat, log = 38.63, -90.20
start_date, end_date = "2026-01-01", "2026-01-31"

params = dict(
    latitude=lat,
    longitude=log,
    start_date=start_date,
    end_date=end_date,
    # The API takes the requested daily series as one comma-separated string.
    daily=",".join((
        "temperature_2m_mean",
        "temperature_2m_max",
        "temperature_2m_min",
        "snowfall_sum",
        "precipitation_sum",
    )),
    temperature_unit="fahrenheit",
    timezone="auto",
    precipitation_unit="inch",
)

In [0]:
# Request the data from the API and keep the decoded JSON payload (a dict).
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting a later
# cell fail with a confusing KeyError on the missing 'daily' section.
weather_response = requests.get(base_url, params=params, timeout=30)
weather_response.raise_for_status()
weather_data = weather_response.json()
weather_data

In [0]:
# Turn the "daily" section of the weather payload into a data frame, giving
# each API series a shorter column name (column order follows the list below).
df_weather = pd.DataFrame({
    column_name: weather_data['daily'][api_key]
    for column_name, api_key in [
        ('date', 'time'),
        ('temp_max', 'temperature_2m_max'),
        ('temp_min', 'temperature_2m_min'),
        ('temp_mean', 'temperature_2m_mean'),
        ('precipitation', 'precipitation_sum'),
        ('snowfall', 'snowfall_sum'),
    ]
})
df_weather

### Weather Classifications
A day is classified as severe when it meets at least one of the following conditions:
1. Average daily temperature at or below 20 °F.
2. Minimum temperature at or below 15 °F.
3. Snowfall of 2 inches or more in a single day.
4. Precipitation of 0.15 inch or more on a day whose average temperature is below freezing (32 °F).

In [0]:
# defining a function to classify severe and non severe weather conditions. 

def classify_weather(row, *, mean_temp_limit=20, min_temp_limit=15,
                     snowfall_limit=2, precip_limit=.15, freezing_point=32):
    """Label one day of weather as 'severe' or 'not_severe'.

    A day is severe when at least one of these holds:
      * mean temperature is at or below `mean_temp_limit`
      * minimum temperature is at or below `min_temp_limit`
      * snowfall is at or above `snowfall_limit`
      * precipitation is at or above `precip_limit` while the mean
        temperature is below `freezing_point`

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'temp_mean', 'temp_min', 'precipitation' and 'snowfall'.
        mean_temp_limit, min_temp_limit, snowfall_limit, precip_limit,
        freezing_point: Keyword-only thresholds, expressed in the same units
            as the corresponding values in `row`. The defaults reproduce the
            notebook's original fixed cut-offs.

    Returns:
        'severe' or 'not_severe'.

    Note:
        NaN values compare as False, so a day whose readings are all missing
        is labelled 'not_severe'.
    """
    temp_mean = row['temp_mean']
    temp_min = row['temp_min']
    precipitation = row['precipitation']
    snowfall = row['snowfall']

    very_cold = (temp_mean <= mean_temp_limit) or (temp_min <= min_temp_limit)
    heavy_snow = snowfall >= snowfall_limit
    freezing_precip = (precipitation >= precip_limit) and (temp_mean < freezing_point)

    if very_cold or heavy_snow or freezing_precip:
        return 'severe'
    return 'not_severe'


In [0]:
# Label every day in the weather data frame by running classify_weather on
# each row, and store the labels in a new 'weather_severity' column.
df_weather['weather_severity'] = df_weather.apply(
    classify_weather,
    axis='columns',
)
df_weather

In [0]:
# Count how many days in the weather data frame were labelled 'severe'.
severe_days = df_weather['weather_severity'].eq('severe').sum()
print(f"Number of severe days: {severe_days}")

In [0]:
# The scraped student enrollment contains a comma (e.g. "16,399") and arrives as
# a string, so strip the separators and convert it to a number.
# pd.to_numeric with errors='coerce' turns rows with no scraped value into
# missing values instead of raising, the nullable 'Int64' dtype keeps the rest
# as integers, and going through astype(str) lets this cell be re-run safely
# after the column has already been converted.
df_scraped['Student Enrollment'] = pd.to_numeric(
    df_scraped['Student Enrollment'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
).astype('Int64')

# Defining a function to complete row by row student / severe day calculation. 
def students_per_severe_day(row, n_severe_days=None):
    """Return a row's enrollment multiplied by the number of severe days.

    Despite the name, the result is a product (students x severe days), not a
    per-day ratio.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            'Student Enrollment' entry.
        n_severe_days: Number of severe days to multiply by. When omitted, the
            notebook-level `severe_days` value is used, which is what
            `df.apply(students_per_severe_day, axis=1)` relies on.

    Returns:
        `row['Student Enrollment'] * n_severe_days`.
    """
    if n_severe_days is None:
        n_severe_days = severe_days
    return row['Student Enrollment'] * n_severe_days

In [0]:
# Run students_per_severe_day on each row of the scraped data frame and keep
# the result in a new 'students_per_severe_day' column.
df_scraped['students_per_severe_day'] = df_scraped.apply(
    students_per_severe_day,
    axis='columns',
)
df_scraped

In [0]:
# Build the final summary table in one step. Columns taken from df_scraped keep
# its row index; the two scalar values (the severe-day count and the grand total
# of the per-university products) are repeated on every row.
df_final = pd.DataFrame({
    'University': df_scraped['University'],
    'State': df_scraped['State'],
    'Enrolled_Students': df_scraped['Student Enrollment'],
    'Severe_Days': severe_days,
    'Severe_Days_Per_Student': df_scraped['students_per_severe_day'],
    'Total_STL_Severe_Student_Days': df_scraped['students_per_severe_day'].sum(),
})
df_final