In [69]:
# imports
import urllib3
from urllib3.util.ssl_ import create_urllib3_context
from urllib.parse import urljoin
from bs4 import BeautifulSoup

import pandas as pd
import tabula

import logging
from pymongo import MongoClient

In [18]:
# url
url = "https://web-as.tamu.edu/gradereports/"

In [19]:
# create custom context
ctx = create_urllib3_context()
ctx.load_default_certs()
ctx.options |= 0x4

In [20]:
# create PoolManager instance to make requests
http = urllib3.PoolManager(ssl_context=ctx)

In [21]:
# get HTTPReponse object
read = http.request("GET", url)

In [22]:
# parse HTML content using beautifulsoup
html = read.data
soup = BeautifulSoup(html, "html.parser")

In [23]:
# find filter elements
year = soup.find("select", {"name": "ctl00$plcMain$lstGradYear"})
sem = soup.find("select", {"name": "ctl00$plcMain$lstGradTerm"})
college = soup.find("select", {"name": "ctl00$plcMain$lstGradCollege"})

# all years
year_options = year.find_all("option")
year_list = [option["value"] for option in year_options]
year_list = year_list[0:4]

# all sems (spring, summer, fall)
sem_list = ["1", "2", "3"]

# all colleges
college_options = college.find_all("option")
college_remove = ["DN_PROF", "DT_PROF", "SL_PROF", "MD_PROF", "MN_PROF", "UT"]
college_list = [option["value"] for option in college_options if option["value"] not in college_remove]

In [24]:
# get all pdf urls
base_url = "https://web-as.tamu.edu/GradeReports/PDFReports/"
pdf_urls = []

for year in year_list:
    for sem in sem_list:
        for col in college_list:
            pdf_url = f"{year}{sem}/grd{year}{sem}{col}.pdf"
            full_url = urljoin(base_url, pdf_url)
            
            response = http.request("HEAD", full_url)

            if response.status == 200 and response.headers['Content-Type'] == 'application/pdf':
                pdf_urls.append(full_url)

In [25]:
# pdf metadata

top = 100
left = 30
width = 720
height = 500

table_area = [top, left, top + height, left + width]
table_x_coords = [130, 177, 222, 267, 314, 359, 404, 440, 473, 505, 537, 568, 600, 642, 750]

top_d = 73
left_d = 33
w_d = 270
h_d = 29

c_area = [top_d, left_d, top_d + h_d, left_d + w_d]

In [58]:
def get_tables(pdf):

    # read all grade tables and departments in one college pdf
    tables = pd.concat(tabula.read_pdf(pdf, pages = 'all', area=table_area, columns=table_x_coords))
    # deps = tabula.read_pdf(pdf, pages = 'all', area=c_area, pandas_options={'header': None})

    # drop unnecessary columns
    tables = tables.dropna().drop(['I', 'S', 'U', 'X', 'A - F'], axis = 1)

    # create A, B, C, D, and F percentages
    convert = ['A', 'B', 'C', 'D', 'F']
    tables[convert] = tables[convert].astype(int)
    for col in convert:
        tables[col + ' (%)'] = round(tables[col]/tables['TOTAL'] * 100, 2)
        
    # split section into course
    split_sec = tables['SECTION'].str.split('-')
    tables['COURSE'] = split_sec.str[0] + " " + split_sec.str[1]
    
    tables = tables.drop(['SECTION', 'TOTAL'], axis = 1)
    tables = tables.rename(columns={'INSTRUCTOR' : 'PROF'})
    
    # if(len(tables) != len(deps)):
    #     raise Exception("table lengths not matching")
    
    # relates = {}

    # # {d1 : t1, d2 : t2, d3, t3}
    # # relate each department to its respective table
    # for i in range(0, len(tables)):
    #     department = deps[i][1][1]

    #     relates[department] = pd.concat([relates.get(department), tables[i]], axis=0)
    
    return tables[['COURSE', 'PROF', 'GPA', 'A', 'B', 'C', 'D', 'F', 'A (%)', 'B (%)', 'C (%)', 'D (%)', 'F (%)', 'Q']]

In [62]:
all_grd = []
done = False

count = 0
for url in pdf_urls:
    try:
        all_grd.append(get_tables(url))
        print(f"On URL: {count}")
        count += 1
    except Exception as e:
        pdf_urls.remove(url)
        logging.warning(f"Exception: {type(e).__name__} : {e} --- URL: {url}")

On URL: 0
On URL: 1
On URL: 2
On URL: 3
On URL: 4
On URL: 5
On URL: 6
On URL: 7
On URL: 8
On URL: 9
On URL: 10
On URL: 11
On URL: 12




On URL: 13
On URL: 14
On URL: 15
On URL: 16
On URL: 17
On URL: 18
On URL: 19
On URL: 20
On URL: 21
On URL: 22
On URL: 23
On URL: 24
On URL: 25
On URL: 26
On URL: 27
On URL: 28
On URL: 29
On URL: 30
On URL: 31
On URL: 32
On URL: 33
On URL: 34
On URL: 35
On URL: 36
On URL: 37
On URL: 38
On URL: 39
On URL: 40
On URL: 41
On URL: 42
On URL: 43
On URL: 44
On URL: 45
On URL: 46
On URL: 47
On URL: 48
On URL: 49
On URL: 50
On URL: 51
On URL: 52
On URL: 53
On URL: 54
On URL: 55
On URL: 56
On URL: 57
On URL: 58
On URL: 59
On URL: 60
On URL: 61
On URL: 62
On URL: 63
On URL: 64
On URL: 65
On URL: 66




On URL: 67
On URL: 68
On URL: 69
On URL: 70
On URL: 71
On URL: 72
On URL: 73
On URL: 74
On URL: 75
On URL: 76
On URL: 77
On URL: 78
On URL: 79
On URL: 80
On URL: 81
On URL: 82
On URL: 83
On URL: 84




On URL: 85
On URL: 86
On URL: 87
On URL: 88
On URL: 89
On URL: 90
On URL: 91
On URL: 92
On URL: 93
On URL: 94
On URL: 95
On URL: 96
On URL: 97
On URL: 98
On URL: 99
On URL: 100
On URL: 101
On URL: 102
On URL: 103




On URL: 104
On URL: 105
On URL: 106
On URL: 107
On URL: 108
On URL: 109
On URL: 110
On URL: 111
On URL: 112
On URL: 113
On URL: 114
On URL: 115
On URL: 116
On URL: 117
On URL: 118
On URL: 119
On URL: 120
On URL: 121
On URL: 122




On URL: 123
On URL: 124
On URL: 125
On URL: 126
On URL: 127
On URL: 128
On URL: 129
On URL: 130
On URL: 131
On URL: 132
On URL: 133
On URL: 134
On URL: 135
On URL: 136
On URL: 137
On URL: 138
On URL: 139
On URL: 140
On URL: 141




On URL: 142
On URL: 143
On URL: 144
On URL: 145
On URL: 146
On URL: 147
On URL: 148
On URL: 149
On URL: 150
On URL: 151
On URL: 152
On URL: 153
On URL: 154
On URL: 155
On URL: 156
On URL: 157
On URL: 158




On URL: 159
On URL: 160
On URL: 161
On URL: 162
On URL: 163
On URL: 164
On URL: 165
On URL: 166
On URL: 167
On URL: 168
On URL: 169
On URL: 170
On URL: 171
On URL: 172
On URL: 173
On URL: 174
On URL: 175
On URL: 176




On URL: 177
On URL: 178
On URL: 179
On URL: 180
On URL: 181
On URL: 182
On URL: 183
On URL: 184
On URL: 185
On URL: 186
On URL: 187
On URL: 188
On URL: 189
On URL: 190
On URL: 191
On URL: 192
On URL: 193
On URL: 194
On URL: 195




On URL: 196
On URL: 197
On URL: 198
On URL: 199


In [70]:
df = pd.concat(all_grd)
df.head()

Unnamed: 0,COURSE,PROF,GPA,A,B,C,D,F,A (%),B (%),C (%),D (%),F (%),Q
0,ASCC 101,MCCLURE M,3.291,15,6,0,1,2,62.5,25.0,0.0,4.17,8.33,0.0
2,ASCC 101,JONES M,3.391,12,10,0,0,1,50.0,41.67,0.0,0.0,4.17,1.0
0,AGCJ 105,WALD D,3.217,9,11,2,1,0,39.13,47.83,8.7,4.35,0.0,0.0
2,AGCJ 105,WALD D,3.09,14,23,4,3,0,31.11,51.11,8.89,6.67,0.0,1.0
6,AGCJ 281,REDWINE T,3.923,36,3,0,0,0,90.0,7.5,0.0,0.0,0.0,0.0


In [72]:
# Mongo DB

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import "../.env"

uri = "mongodb+srv://ashlxyzhang:Pokes-1023@tamugrds.nnsegmo.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)


Pinged your deployment. You successfully connected to MongoDB!


In [68]:
df.to_csv('./grds/all_grds.csv', index=False)