In [2]:
# imports
import logging
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import tabula
import urllib3
from bs4 import BeautifulSoup
from flask import Flask, request, render_template, session, redirect
from urllib3.util.ssl_ import create_urllib3_context

In [3]:
# Landing page of the grade-reports site; it is fetched and its <select>
# filter elements are scraped in the cells below.
url = "https://web-as.tamu.edu/gradereports/"

In [4]:
# Build a custom SSL context (passed to the PoolManager created below).
ctx = create_urllib3_context()
ctx.load_default_certs()
# 0x4 is OpenSSL's SSL_OP_LEGACY_SERVER_CONNECT bit (exposed as
# ssl.OP_LEGACY_SERVER_CONNECT on Python 3.12+). Presumably required because
# the server does not support secure renegotiation -- TODO confirm.
ctx.options |= 0x4

In [5]:
# Connection pool used for the requests in this notebook; it is configured
# with the custom SSL context built above.
http = urllib3.PoolManager(ssl_context=ctx)

In [6]:
# Fetch the landing page; `read` holds the response object returned by
# PoolManager.request (its status code is not checked here).
read = http.request("GET", url)

In [7]:
# Parse the response body with BeautifulSoup using the "html.parser" backend.
html = read.data
soup = BeautifulSoup(html, "html.parser")

In [8]:
# Locate the three <select> filter widgets on the landing page by form name.
year = soup.find("select", attrs={"name": "ctl00$plcMain$lstGradYear"})
sem = soup.find("select", attrs={"name": "ctl00$plcMain$lstGradTerm"})
college = soup.find("select", attrs={"name": "ctl00$plcMain$lstGradCollege"})

# Years: read every <option> value, then keep only the first four in page order.
year_options = year.find_all("option")
year_list = [opt["value"] for opt in year_options][:4]

# Semesters are hard-coded (spring, summer, fall) rather than read from `sem`.
sem_list = ["1", "2", "3"]

# Colleges: every <option> value except the codes listed in `college_remove`.
college_options = college.find_all("option")
college_remove = ["DN_PROF", "DT_PROF", "SL_PROF", "MD_PROF", "MN_PROF", "UT"]
college_list = [
    code
    for code in (opt["value"] for opt in college_options)
    if code not in college_remove
]

In [9]:
# Probe every (year, semester, college) combination and keep the report URLs
# that actually exist on the server.
base_url = "https://web-as.tamu.edu/GradeReports/PDFReports/"
pdf_urls = []

# Loop names differ from `year` / `sem` so the parsed <select> elements bound
# to those names are not overwritten by plain strings.
for yr in year_list:
    for term in sem_list:
        for col in college_list:
            # Report path layout: <year><term>/grd<year><term><college>.pdf
            pdf_url = f"{yr}{term}/grd{yr}{term}{col}.pdf"
            full_url = urljoin(base_url, pdf_url)

            # HEAD checks existence without downloading the PDF body.
            response = http.request("HEAD", full_url)

            # .get() tolerates a missing Content-Type header, and the prefix
            # match accepts values carrying parameters ("application/pdf; ...").
            content_type = response.headers.get("Content-Type", "")
            if response.status == 200 and content_type.lower().startswith("application/pdf"):
                pdf_urls.append(full_url)

In [10]:
# pdf metadata: page regions passed to tabula, built as
# [top, left, top + height, left + width].

# Region containing the grade table.
top, left = 100, 30
width, height = 720, 500
table_area = [top, left, top + height, left + width]

# x positions used to split the grade table into columns.
table_x_coords = [
    130, 177, 222, 267, 314, 359, 404, 440,
    473, 505, 537, 568, 600, 642, 750,
]

# Region read to obtain the department for each page.
top_d, left_d = 73, 33
w_d, h_d = 270, 29
c_area = [top_d, left_d, top_d + h_d, left_d + w_d]

In [11]:
def get_tables(pdf):
    """Extract the grade tables of one college report PDF, grouped by department.

    The PDF is read twice with tabula over all pages: once over ``table_area``
    (split into columns at ``table_x_coords``) for the grade rows, and once
    over ``c_area`` (no header row) for the department text. Table ``i`` is
    paired with department read ``i``.

    Each grade table is cleaned as follows:
      * rows containing any NaN are dropped;
      * columns 'I', 'S', 'U', 'X' and 'A - F' are dropped;
      * 'A'..'F' are cast to int and '<grade>_PER' columns are added, holding
        grade / TOTAL * 100 rounded to 2 decimals;
      * 'SECTION' is split on '-' and its first two pieces become 'COURSE'
        ("<first> <second>"); 'SECTION' itself is then dropped.

    Parameters
    ----------
    pdf : value accepted by ``tabula.read_pdf`` as its input (this notebook
        passes a URL string).

    Returns
    -------
    dict
        Department -> DataFrame with that department's cleaned tables
        concatenated row-wise, in order of first appearance.

    Raises
    ------
    ValueError
        If the number of grade tables differs from the number of department
        reads, since they could not be paired reliably.
    """
    # read all grade tables and departments in one college pdf
    tables = tabula.read_pdf(pdf, pages='all', area=table_area, columns=table_x_coords)
    deps = tabula.read_pdf(pdf, pages='all', area=c_area, pandas_options={'header': None})

    # Fail fast, before any processing, when tables and departments cannot be paired.
    if len(tables) != len(deps):
        raise ValueError(
            f"table lengths not matching: {len(tables)} tables vs {len(deps)} departments"
        )

    grade_cols = ['A', 'B', 'C', 'D', 'F']
    frames_by_department = {}

    for raw_table, dep in zip(tables, deps):
        # drop incomplete rows and unnecessary columns
        table = raw_table.dropna().drop(['I', 'S', 'U', 'X', 'A - F'], axis=1)

        # create A, B, C, D, and F percentages
        table[grade_cols] = table[grade_cols].astype(int)
        for col in grade_cols:
            table[col + '_PER'] = round(table[col] / table['TOTAL'] * 100, 2)

        # split section into course
        split_sec = table['SECTION'].str.split('-')
        table['COURSE'] = split_sec.str[0] + " " + split_sec.str[1]
        table = table.drop('SECTION', axis=1)

        # Department is taken from column 1, row 1 of the header-less read.
        department = dep[1][1]

        # Collect frames per department and concatenate once at the end instead
        # of re-concatenating the growing frame on every iteration.
        frames_by_department.setdefault(department, []).append(table)

    return {
        department: pd.concat(frames, axis=0)
        for department, frames in frames_by_department.items()
    }

In [None]:
# Parse every discovered report. URLs whose parsing fails are logged and
# removed from pdf_urls so that pdf_urls and all_grd stay aligned by index.
all_grd = []
done = False  # not read anywhere in the visible cells; kept in case later cells use it

count = 0
# Iterate over a snapshot: removing items from the list being iterated would
# make the loop silently skip the element that follows each failure.
# The loop name is not `url`, so the landing-page `url` is left intact.
for report_url in list(pdf_urls):
    try:
        all_grd.append(get_tables(report_url))
        print(f"On URL: {count}")
        count += 1
    except Exception as e:
        pdf_urls.remove(report_url)
        logging.warning(f"Exception: {type(e).__name__} : {e} --- URL: {report_url}")

In [12]:
# Spot check: parse a single report (the third URL in pdf_urls) for inspection.
t = get_tables(pdf_urls[2])

In [13]:
# List the departments found in the sample report.
print(list(t.keys()))

['ARCHITECTURE', 'CONSTRUCTION SCIENCE', 'LAND ARCH & URBAN PLANNING', 'SCHOOL OF ARCHITECTURE']


In [14]:
# Take the fourth department's table (index 3 in insertion order) and preview it.
# NOTE(review): the index is hard-coded, so which department this selects
# depends on the report parsed above.
df = list(t.values())[3]
df.head()

Unnamed: 0,A,B,C,D,F,GPA,Q,TOTAL,INSTRUCTOR,A_PER,B_PER,C_PER,D_PER,F_PER,COURSE
0,16,7,1,0,2,3.346,0.0,26.0,REED K,61.54,26.92,3.85,0.0,7.69,CARC 101
2,22,3,1,0,0,3.807,0.0,26.0,DAVIS D,84.62,11.54,3.85,0.0,0.0,CARC 101
6,19,0,0,0,0,4.0,0.0,19.0,ROLDAN M,100.0,0.0,0.0,0.0,0.0,CARC 301
10,19,0,0,0,0,4.0,0.0,19.0,ROLDAN M,100.0,0.0,0.0,0.0,0.0,CARC 311
14,19,0,0,0,0,4.0,0.0,19.0,ROLDAN M,100.0,0.0,0.0,0.0,0.0,CARC 331


In [15]:
# Export the previewed table. The output folder is created first because
# to_csv does not create missing parent directories.
csv_path = Path('./grds/carc.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)