# Unofficial Transcript PDF Scraper
This notebook reads an unofficial Virginia Tech transcript (PDF), converts it to text, and extracts each course's semester, subject, course number, title, grade, and credits into a pandas DataFrame.

In [1191]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [1192]:
# Pdfminer imports
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

## Convert PDF to Text

In [1193]:
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file with pdfminer.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text written by pdfminer's ``TextConverter`` for all processed
        pages, using default ``LAParams`` layout settings.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # The context managers and the try/finally guarantee that the file handle,
    # the text buffer and the converter device are released even when a page
    # fails to parse (previously they were only closed on success).
    with open(path, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page-number set and maxpages=0: no page filter, no page limit.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the `with` block.
            return retstr.getvalue()
        finally:
            device.close()

In [1194]:
# Extract the raw text of the transcript PDF
transcript_path = 'UnofficialAcademicTranscript.pdf'
text = convert_pdf_to_txt(transcript_path)

## Scrape Relevant Information

In [1195]:
# Split by semester: rewrite the one-line spelling of the marker to the
# line-wrapped spelling first, so both are cut at the same place
semester_marker = 'Primary\nCollege:'
normalized_text = text.replace('Primary College:', semester_marker)
split_by_semester = normalized_text.split(semester_marker)

In [1196]:
# Grade list: letter grades first, followed by the remaining grade codes
grade_list = [
    'A', 'A-',
    'B+', 'B', 'B-',
    'C+', 'C', 'C-',
    'D+', 'D', 'D-',
    'F',
    'P', 'W', 'IP', 'AP', 'NT', 'NP', 'T',
]

In [1197]:
# Initializing lists: one empty, independent list per DataFrame column
semesters, subjects, numbers = [], [], []
names, grades, credits = [], [], []

In [1198]:
# Loop through the semester chunks and collect one entry per course.
# split_by_semester[0] is the text before the first 'Primary College:' marker
# and the final chunk is deliberately left out, so only the chunks in between
# are parsed.


def _is_subject_code(line):
    """Return True when `line` looks like a course subject code such as 'CHEM'."""
    return (line == line.upper() and len(line) <= 4 and line[:1].isalpha()
            and line not in ('GPA', 'UG') and line not in grade_list)


# Start from empty columns so re-running this cell does not duplicate rows.
for column in (semesters, subjects, numbers, names, grades, credits):
    column.clear()

for index1, chunk in enumerate(split_by_semester[1:-1]):

    # Data cleaning: drop blank lines and the campus/level labels, then strip
    # soft hyphens from what is left.
    lines = [line.replace('\xad', '') for line in chunk.split('\n')
             if line and line not in ('Blacksburg UG', 'UG', 'Virtual')]
    # A line made only of soft hyphens is empty after stripping; drop it so it
    # can neither break the subject test nor shift the positional look-ups.
    lines = [line for line in lines if line]

    # Course subjects
    subject_positions = [pos for pos, line in enumerate(lines) if _is_subject_code(line)]
    num_courses = len(subject_positions)
    if num_courses == 0:
        # Nothing to anchor the positional look-ups below on; skip the chunk
        # instead of reusing the index left over from a previous semester.
        continue
    semesters.extend([index1] * num_courses)
    subjects.extend(lines[pos] for pos in subject_positions)
    cursor = subject_positions[-1]

    # Course numbers: digit-only lines among the num_courses lines that follow
    # the last subject code
    window = lines[cursor + 1: cursor + 1 + num_courses]
    course_numbers = [line for line in window if line.isdigit()]
    numbers.extend(course_numbers)
    cursor += len(course_numbers)

    # Course names: the next num_courses lines, ignoring single characters
    window = lines[cursor + 1: cursor + 1 + num_courses]
    names.extend(line for line in window if len(line) > 1)

    # Course grades: every line of the chunk that is a known grade code; the
    # credits are read relative to the last one found
    grade_positions = [pos for pos, line in enumerate(lines) if line in grade_list]
    grades.extend(lines[pos] for pos in grade_positions)
    if grade_positions:
        cursor = grade_positions[-1]

    # Course credits: the num_courses lines after the last grade
    credits.extend(lines[cursor + 1: cursor + 1 + num_courses])

## Create DataFrame

In [1199]:
# Create dataframe: one column per parsed list, in display order
course_columns = {
    'Semester': semesters,
    'Subject': subjects,
    'Course': numbers,
    'Title': names,
    'Grade': grades,
    'Credits': credits,
}
df = pd.DataFrame(course_columns, columns=list(course_columns))

In [1200]:
# Display the assembled course table (one row per course)
df

Unnamed: 0,Semester,Subject,Course,Title,Grade,Credits
0,0,CHEM,1035,General Chemistry,C+,3.0
1,0,CHEM,1045,General Chemistry Lab,B,1.0
2,0,CS,2114,Softw Des & Data Structures,C,3.0
3,0,ENGE,1215,Foundations of Engineering,C+,2.0
4,0,ENGE,1354,Intro to Spatial Visualization,C+,1.0
5,0,MATH,1226,Calculus of a Single Variable,C,4.0
6,0,PHIL,1204,Knowledge and Reality,B,3.0
7,1,COMM,2004,Public Speaking,A,3.0
8,1,ENGE,1216,Foundations of Engineering,B,2.0
9,1,GEOG,1014,World Regions,A,3.0
