# Collect Class Information
Class data is collected from the OSU course catalog and the barrett.3 course information website.
The OSU course catalog has clean, useful data on where classes/labs/recitations meet, their start and end dates, and their meeting times;
however, its enrollment data is not useful because it aggregates the enrollments of all sections of a course into one number.
barrett.3's data has enrollment info on each individual section for a course.

In [None]:
import requests
import json
import os
import shutil
import pandas as pd
from datetime import datetime, time
from time import sleep

In [None]:
# Subject codes requested from the barrett.3 schedule site: a later cell in this
# notebook builds one URL per code and downloads the matching text file.
# NOTE(review): the list contains near-duplicate spellings (e.g. 'AVIATION' /
# 'AVIATN', 'JAPANESE' / 'JAPANSE'); presumably alternate or historical codes.
# Codes whose download does not return HTTP 200 are only reported and skipped.
barret_subjects = [
    'ACADAFF', 'ACCAD', 'ACCTMIS', 'ACEL', 'AEDECON', 'AEE', 'AEROENG', 'AFAMAST', 'AGRCOMM', 'AGSYSMGT', 'AGSYSMT', 'AIRSCI', 'ALLIMED', 'ANATOMY', 'ANESTHES', 'ANIMSCI', 'ANMLTEC', 'ANTHROP', 'ARABIC', 'ARCH', 'ART', 'ARTEDUC', 'ARTSCOL', 'ARTSSCI', 'ASE', 'ASL', 'ASTRON', 'ATHTRNG', 'ATMOSSC', 'AVIATION', 'AVIATN', 'BCS', 'BIOCHEM', 'BIOCHEMP', 'BIOETHC', 'BIOLOGY', 'BIOMEDE',
    'BIOMINF', 'BIOMSCI', 'BIOPHRM', 'BIOPHYS', 'BIOSCI', 'BIOSTAT', 'BIOTECH', 'BIOWMGT', 'BMEA', 'BMI', 'BSGP', 'BUSADM', 'BUSFIN', 'BUSMGT', 'BUSMHR', 'BUSML', 'BUSOBA', 'BUSTEC', 'CATALAN', 'CBE', 'CBG', 'CHBE', 'CHEM', 'CHEMPHY', 'CHINESE', 'CIRTECH', 'CIVILEN', 'CLAS', 'CLASSICS', 'CLLC', 'COMLDR', 'COMM', 'COMPSTD', 'CONSCI', 'CONSYSM', 'CONSYSMT', 'CRPLAN', 'CRPSOIL', 'CSCFFS',
    'CSCFMFNS', 'CSE', 'CSFMRSM', 'CSFRST', 'CSFSNRTS', 'CSHSPMG', 'CSTW', 'CSTXTCL', 'CZECH', 'DANCE', 'DENT', 'DENTHYG', 'DESIGN', 'DNE', 'DSABLST', 'EALL', 'EARTHSC', 'EARTHSCI', 'ECE', 'ECON', 'EDUCST', 'EDUPAES', 'EDUPL', 'EDUTL', 'EEOB', 'EEURLL', 'EHE', 'EMERGMED', 'ENGINEER', 'ENGLISH', 'ENGR', 'ENGRAPH', 'ENGREDU', 'ENGRTEC', 'ENGTECH', 'ENR', 'ENTMLGY', 'ENTOMOL', 'ENVENG', 'ENVSCI',
    'ENVSCT', 'ESCE', 'ESCFE', 'ESEADM', 'ESEPHL', 'ESEPOL', 'ESEPSY', 'ESETEC', 'ESHESA', 'ESLTECH', 'ESPHE', 'ESQREM', 'ESQUAL', 'ESSPED', 'ESSPSY', 'ESTEPL', 'ESWDE', 'EXP', 'EXPLORNG', 'FABENG', 'FAES', 'FCSED', 'FDSCTE', 'FILMSTD', 'FMRESM', 'FRENCH', 'FRIT', 'GENBIOL', 'GENCHEM', 'GENCOMM', 'GENED', 'GENHUM', 'GENMATH', 'GENSSC', 'GENSTDS', 'GEODSCIE', 'GEODSCIM', 'GEOG', 'GEORGIAN', 'GEOSCIM',
    'GERMAN', 'GRADSCH', 'GRADTDA', 'GREEK', 'HCINNOV', 'HCS', 'HDFS', 'HEBREW', 'HECCREG', 'HIMS', 'HINDI', 'HISTART', 'HISTORY', 'HONORS', 'HORTTEC', 'HOSPMGT', 'HSMP', 'HTHRHSC', 'HUMANEC', 'HUMCOL', 'HUMNNTR', 'HUNGARIN', 'HUNGRN', 'HW', 'HWIH', 'IBGP', 'INDENG', 'INTMED', 'INTSTDS', 'ISE', 'ISLAM', 'ITALIAN', 'JAPANESE', 'JAPANSE', 'JEWSHST', 'KINESIO', 'KNHES', 'KNOW', 'KNPE', 'KNSFHP',
    'KNSISM', 'KOREAN', 'LABBIOSC', 'LARCH', 'LATIN', 'LAW', 'LING', 'LINGUIST', 'MATH', 'MATSCEN', 'MBA', 'MCDBIO', 'MCR', 'MDN', 'MDRNGRK', 'MEATSCI', 'MECHENG', 'MEDCOLL', 'MEDDIET', 'MEDIEVAL', 'MEDLBS', 'MEDMCIM', 'MEDREN', 'MEDTECH', 'MICRBIO', 'MICRBIOL', 'MILSCI', 'MOLBIOC', 'MOLBIOCH', 'MOLGEN', 'MPSCOL', 'MUSIC', 'MVIMG', 'MVNGIMG', 'NAVALSC', 'NELC', 'NEURO',
    'NEUROGS', 'NEUROGSP', 'NEUROSC', 'NEURSGY', 'NRSADVN', 'NRSPRCT', 'NUCLREN', 'NURSING', 'NURSPRCT', 'OCCTHER', 'OPTHLMOL', 'OPTOM', 'OPTOMTRY', 'ORIENTAT', 'OSBP', 'OTOLARN', 'OTOLARYN', 'PATHOL', 'PDATRICS', 'PEDS', 'PERSIAN', 'PHARMACY', 'PHARMCL', 'PHARMCOL', 'PHILOS', 'PHR', 'PHYSICS', 'PHYSIO', 'PHYSIOCB', 'PHYSMED', 'PHYSTHER', 'PHYSTHR', 'PLNTBIO', 'PLNTPTH', 'POLISH', 'POLITSC', 'PORTGESE',
    'PORTGSE', 'PSYBHLH', 'PSYCH', 'PSYCHTRY', 'PUBAFRS', 'PUBHBIO', 'PUBHEHS', 'PUBHEPI', 'PUBHHBP', 'PUBHHMP', 'PUBHLTH', 'PUBPOLM', 'QUECHUA', 'RADIOLG', 'RADIOLGY', 'RADSCI', 'RELSTDS', 'RESPTHER', 'RESPTHR', 'RNEWNRG', 'ROMANIA', 'ROMANIAN', 'ROMLING', 'ROOM', 'RURLSOC', 'RUSSIAN', 'SANSKRIT', 'SANSKRT', 'SASIA', 'SBSCOL', 'SCANDNAV', 'SCANDVN', 'SCHOLAR', 'SLAVIC', 'SOCIOL', 'SOCWORK', 'SOMALI',
    'SPANISH', 'SPHHRNG', 'SRBCROA', 'STAT', 'STEP', 'SUMMARY', 'SURGERY', 'SWAHILI', 'SWEDISH', 'SXLTYST', 'TECPHYS', 'THEATRE', 'TIBETAN', 'TURKISH', 'TXTLCLO', 'URDU', 'USAS', 'UZBEK', 'VETBIOS', 'VETCLIN', 'VETPREV', 'VISSCI', 'VMCOLL', 'VOCEDUC', 'WELDENG', 'WGSST', 'WOMSTDS', 'YIDDISH', 'YORUBA', 'ZULU'
]

# Search endpoint queried once per (subject, page) by the OSU catalog cell.
url = 'https://content.osu.edu/v2/classes/search'
# Query-string template for the search endpoint. 'subject' and 'p' are
# overwritten before every request by the catalog cell; 'term' is also reused
# later to build the barrett.3 download URLs.
# NOTE(review): 'campus': 'col' presumably selects the Columbus campus - confirm.
params = {
    'q': '', # Query
    'client': 'class-search-ui',
    'campus': 'col',
    'term': 1252, # SP25=1252, SU25=1254, AU25=1258
    'p': 1, # Page
    'subject': ''
}
# Headers sent with every OSU catalog request.
# NOTE(review): these look like a copy of what Firefox sends from
# classes.osu.edu; presumably the API expects a browser-like request - confirm
# which of them are actually required.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:136.0) Gecko/20100101 Firefox/136.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Origin': 'https://classes.osu.edu',
    'DNT': '1',
    'Sec-GPC': '1',
    'Connection': 'keep-alive',
    'Referer': 'https://classes.osu.edu/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Priority': 'u=0'
}

# Load the list of subject codes to query from the OSU catalog.
# Each entry of osu_subjects.json is a dict whose 'term' value is the subject
# code passed to the search endpoint.
# The file is opened as UTF-8 explicitly so decoding does not depend on the
# platform's locale encoding.
# NOTE: `f` and `s` are deliberately left bound at module level; a later
# cleanup cell deletes them by name.
osu_subjects = []
with open('osu_subjects.json', 'r', encoding='utf-8') as f:
    osu_subjects_info = json.load(f)
for s in osu_subjects_info:
    osu_subjects.append(s['term'])

### Collect Data from OSU Course Catalog

In [None]:
# Seconds to wait for the catalog API before treating a request as timed out.
_REQUEST_TIMEOUT = 30
# How many times a single page is attempted before giving up on it.
_MAX_ATTEMPTS = 3


def _get_catalog_page(subject, page):
    """Request the page currently described by ``params`` from the OSU catalog.

    The caller is expected to have stored ``subject`` and ``page`` in the
    module-level ``params`` dict already; the two arguments are only used for
    log messages.

    Timeouts and connection errors are retried up to ``_MAX_ATTEMPTS`` times
    with a 5 second pause in between.

    Returns the ``requests.Response`` on HTTP 200, otherwise ``None`` (after
    printing why the page could not be fetched).
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, params, headers=headers, timeout=_REQUEST_TIMEOUT)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as er:
            # requests raises its own exception types, not the builtin TimeoutError.
            print(
                f'Request failed on subject {subject} on page {page} '
                f'(attempt {attempt}/{_MAX_ATTEMPTS}): {er}'
            )
            sleep(5)
            continue
        if response.status_code != 200:
            print(f'ERROR {response.status_code} getting subject: "{subject}" at page {page}')
            return None
        return response
    return None


# Walk every subject, page through its search results and collect the raw
# course entries (all_courses) and the flattened, annotated sections
# (all_sections). total_items accumulates the API-reported item counts.
total_items = 0
all_courses = []
all_sections = []
for subject in osu_subjects:
    current_page = 1
    subject_courses = []
    subject_sections = []
    params['subject'] = subject
    params['p'] = current_page

    # `r` stays a module-level name on purpose: a later cleanup cell deletes it.
    r = _get_catalog_page(subject, current_page)
    if r is None:
        continue
    data = r.json()['data']
    subject_total_items = data['totalItems']
    total_items += subject_total_items
    total_pages = data['totalPages']

    while current_page <= total_pages:
        subject_courses.extend(data['courses'])
        current_page += 1
        if current_page > total_pages:
            # Last page already consumed; do not request a page that does not exist.
            break
        params['p'] = current_page
        r = _get_catalog_page(subject, current_page)
        if r is None:
            # Stop instead of re-appending the previous page's courses; the
            # item-count check below then reports and skips this subject.
            break
        data = r.json()['data']

    for course_info in subject_courses:
        # The cell that rebuilds all_sections reads the course metadata from
        # the nested 'course' dict; fall back to the entry itself in case the
        # metadata lives at the top level.
        course = course_info.get('course', course_info)
        for section in course_info['sections']:
            section['catalogNumber'] = f"{course['subject']} {course['catalogNumber']}"
            section['title'] = course['title']
            subject_sections.append(section)

    if len(subject_sections) != subject_total_items:
        print(f'{subject} item count mismatch. Expected {subject_total_items}, found {len(subject_sections)} items')
        continue
    all_courses.extend(subject_courses)
    all_sections.extend(subject_sections)
    # Small pause between subjects to avoid hammering the API.
    sleep(0.1)

In [None]:
# Rebuild the flat section list from the collected course entries, tagging each
# section with its parent course's catalog id ("SUBJECT NUMBER") and title.
# The loop variables are left bound on purpose: a later cleanup cell deletes
# `course_info`, `course` and `section` by name.
all_sections = []
for course_info in all_courses:
    course = course_info['course']
    for section in course_info['sections']:
        section.update({
            'catalogNumber': f"{course['subject']} {course['catalogNumber']}",
            'title': course['title'],
        })
    # Sections were annotated in place above, so the list can be taken as-is.
    all_sections.extend(course_info['sections'])

In [None]:
# Flatten sections into one record per meeting. Each meeting dict is modified
# in place: the instructor list is dropped and the identifying fields of the
# owning section are copied onto it.
# `section` and `meeting` stay bound afterwards; a later cleanup cell deletes them.
all_meetings = []
for section in all_sections:
    for meeting in section['meetings']:
        meeting.pop('instructors', None)
        meeting.update({
            'term': section['term'],
            'catalogId': section['catalogNumber'],
            'classTitle': section['title'],
            'classNumber': section['classNumber'],
            'section': section['section'],
            'sectionEnrollment': section['enrollmentTotal'],
        })
    all_meetings.extend(section['meetings'])

In [None]:
# One row per meeting; columns come from the keys of the meeting dicts.
meetings_df = pd.DataFrame(data=all_meetings)

In [None]:
# Release the intermediate objects left over from the catalog scrape.
# A plain `del name` raises NameError when the cell is re-run, or when one of
# the loops above never executed and so never bound its variable. Popping from
# the module namespace makes this cell safe to run in either case.
for _name in (
    'all_sections',
    'section',
    'meeting',
    'course',
    'course_info',
    'r',
    'f',
    's',
    'subject_courses',
    'subject_sections',
):
    globals().pop(_name, None)
del _name

### Collect Data from barrett.3's Catalog

In [None]:
# Download and save text files
# One file per subject code is fetched from the barrett.3 schedule site for
# the term configured in params['term'] and stored as Courses/<SUBJECT>.txt.

# The final cell of this section removes the directory, so recreate it here.
os.makedirs('Courses', exist_ok=True)
for s in barret_subjects:
    # Use a dedicated name so the OSU catalog endpoint stored in `url` is not
    # overwritten (re-running the catalog cell would otherwise hit this URL).
    barrett_url = f"https://www.asc.ohio-state.edu/barrett.3/schedule/{s}/{params['term']}.txt"
    try:
        r = requests.get(barrett_url, timeout=30)
    except requests.exceptions.RequestException as err:
        # A single unreachable subject should not abort the whole download.
        print(f'FAILED\t{s}\t{err}')
        continue

    if r.status_code != 200:
        # Subjects without a schedule for this term are only reported.
        print(f'{r.status_code}\t{s}')
        continue

    with open(os.path.join('Courses', f'{s}.txt'), 'wb') as course_file:
        course_file.write(r.content)
    # Small pause between downloads to be polite to the server.
    sleep(0.1)

In [None]:
# Read each text file and convert it into a dataframe
# Only the downloaded schedule files are parsed; sorting makes the row order of
# the combined frame independent of the directory listing order.
course_files = sorted(name for name in os.listdir('Courses') if name.endswith('.txt'))
# NOTE(review): column_widths is not referenced anywhere in the visible
# notebook; read_fwf below is driven by colspec only.
column_widths = [20, 10, 2, 17, 9, 12, 11, 7, 100]
# (start, end) character offsets of each fixed-width column, in file order.
colspec = [
    (0, 20),
    (20, 25),
    (25, 31),
    (31, 34),
    (34, 49),
    (49, 58),
    (59, 70),
    (70, 81),
    (81, 89),
    (89, 94),
    (94, 200)
]
# Positional column index -> column name, matching colspec above.
column_names = {
    0: 'course',
    1: 'campus',
    2: 'class_number',
    3: 'component',
    4: 'auto_enrolls',
    5: 'days',
    6: 'times',
    7: 'location',
    8: 'enrolled_status',
    9: 'waitlist',
    10: 'instructor'
}


def _count_footer_lines(path):
    """Return how many trailing lines of ``path`` belong to the footer.

    The footer starts at the last line containing
    "INDependent study classes" and runs to the end of the file. Returns 0
    when the file has no such line.
    """
    footer_start = 0
    total_lines = 0
    with open(path, 'r') as course_file:
        for total_lines, line in enumerate(course_file, start=1):
            if "INDependent study classes" in line:
                footer_start = total_lines
    if not footer_start:
        return 0
    return total_lines - footer_start + 1


# Parse every file into its own frame and concatenate once at the end;
# concatenating inside the loop copies the accumulated data on every pass.
course_frames = []
for course_file_name in course_files:
    path = os.path.join('Courses', course_file_name)
    skipfooter = _count_footer_lines(path)

    # The first three lines of each file are skipped as header lines.
    df = pd.read_fwf(path, skipfooter=skipfooter, skiprows=3, header=None, colspecs=colspec)
    # Rows without a value in the first (course) column are not class rows.
    df = df.dropna(subset=[0]).reset_index(drop=True)
    if df.empty:
        continue
    df = df.rename(columns=column_names)

    # 'enrolled_status' looks like "<enrolled>/<limit>". Split on the first
    # '/' only and force exactly two result columns, so a file with no '/' at
    # all (or a value with several) cannot break the two-column assignment.
    status = df['enrolled_status'].str.split('/', n=1, expand=True).reindex(columns=[0, 1])
    df['enrolled'] = status[0]
    df['limit'] = status[1]
    df = df.drop(columns='enrolled_status')
    course_frames.append(df)

if course_frames:
    all_courses_df = pd.concat(course_frames, ignore_index=True)
else:
    all_courses_df = pd.DataFrame()

In [None]:
# Filter out all online courses and regional campus courses
# Duplicates are removed from all_courses_df itself before filtering.
all_courses_df.drop_duplicates(inplace=True)

# Keep rows that have a location other than 'ONLINE', whose enrollment is not
# the string '0', and whose campus column is empty.
# NOTE(review): an empty campus value presumably marks the main campus - confirm.
has_physical_location = (
    all_courses_df['location'].ne('ONLINE') & all_courses_df['location'].notna()
)
has_enrollment = all_courses_df['enrolled'].ne('0')
campus_is_blank = all_courses_df['campus'].isna()
rows_to_keep = has_physical_location & has_enrollment & campus_is_blank

# Only the columns needed for the merge are kept, with nullable integer dtypes.
filtered_courses = all_courses_df.loc[
    rows_to_keep, ['course', 'class_number', 'enrolled', 'limit']
].astype({
    'class_number': 'Int32',
    'enrolled': 'Int16',
    'limit': 'Int16',
})

In [67]:
# Cast the join key and the enrollment count to nullable 32-bit integers so
# that classNumber has the same dtype as class_number on the barrett.3 side.
nullable_int_columns = ['classNumber', 'sectionEnrollment']
meetings_df = meetings_df.astype(dict.fromkeys(nullable_int_columns, 'Int32'))

In [None]:
# Merge the two data sets, some courses are lost in this process but it's negligible
# Inner join: only class numbers present in both sources survive.
merged = filtered_courses.merge(
    meetings_df,
    how='inner',
    left_on='class_number',
    right_on='classNumber',
)
# Drop meetings that have no facility or whose facility is listed as 'ONLINE'.
facility = merged['facilityDescription']
merged = merged.loc[facility.notna() & facility.ne('ONLINE')]

In [None]:
# Persist the merged class/meeting table without the dataframe index column.
merged.to_csv(path_or_buf='class_data.csv', index=False)

In [None]:
# Delete all the downloaded files
# Guarded so that re-running the cell after the directory is already gone is a
# no-op instead of a FileNotFoundError; real removal errors still surface.
if os.path.isdir('Courses'):
    shutil.rmtree('Courses')

# Parse Traffic Data

### Split sidewalk centerline data into segments so that it can be graded


In [None]:
import geopandas as gpd
import json
from shapely.geometry import LineString
from shapely.ops import split
from shapely.geometry import Point