# Unofficial Transcript PDF Scraper
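The goal is to take an unofficial Virginia Tech transcript in PDF form, extract its text, and return the coursework and performance information (semester, subject, course number, title, grade, and credits) as a pandas DataFrame.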

In [114]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [183]:
# Pdfminer imports
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

## Convert PDF to Text

In [884]:
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file using pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file to read. The file is opened in binary mode.

    Returns
    -------
    str
        Everything the pdfminer ``TextConverter`` wrote while the pages were
        processed, as a single string.

    Notes
    -----
    The file handle, the converter device and the in-memory buffer are
    released even if opening or parsing the PDF raises, so a failed run does
    not leak an open file.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # maxpages=0 and an empty pagenos set are passed so that no page
        # limit / page filter is requested (per pdfminer's get_pages).
        maxpages = 0
        caching = True
        pagenos = set()

        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)

        # Read the buffer before the `finally` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [885]:
# Extract the raw text of the transcript PDF
TRANSCRIPT_PATH = 'UnofficialAcademicTranscript.pdf'
text = convert_pdf_to_txt(TRANSCRIPT_PATH)

In [886]:
# Preview the first 1000 characters of the extracted text
text[:1000]

'7/22/2018\n\nUnofﬁcial Academic Transcipt\n\nDisplay\xa0Unofficial\xa0Transcript\n\n\xa0\n\n07/22/18\xa004:01\xa0PM\n\xa0\n\nThis\xa0is\xa0NOT\xa0an\xa0official\xa0transcript.\xa0Courses\xa0which\xa0are\xa0in\xa0progress\xa0may\xa0also\xa0be\xa0included\xa0on\xa0this\xa0transcript.\n\nTransfer\xa0Credit\xa0\xa0\xa0\xa0Institution\xa0Credit\xa0\xa0\xa0\xa0Transcript\xa0Totals\xa0\xa0\xa0\xa0Courses\xa0in\xa0Progress\n\nTranscript\xa0Data\nSTUDENT\xa0INFORMATION\n\nName\xa0: Conor\xa0F.\xa0Dewey\n\nStudent\xa0Type: Continuing\n\n\xa0\n***Transcript\xa0type:WEB\xa0is\xa0NOT\xa0Official\xa0***\n\xa0\n\xa0\n\xa0\nTRANSFER\xa0CREDIT\xa0ACCEPTED\xa0BY\xa0INSTITUTION\xa0\xa0\xa0\xa0\xa0\xa0\xadTop\xad\n\nNorthern\xa0Va\xa0Cmty\xa0Coll\xadAnnandal\n\nFS13\xad\nSS14:\n\nSubject Course\n\nTitle\n\nXXXX\nXXXX\n\nVT\nVT\n\xa0\n\nNo\xa0VT\xa0Equivalent\nNo\xa0VT\xa0Equivalent\nAttempt\nHours\n\nPassed\nHours\n\nGrade\n\nT\nT\nEarned\nHours\n\nQuality\xa0Points\n\nCredit\nHours\n\n0.000\xa0\xa0\n0.0

## Scrape Relevant Information

In [899]:
# Cut the transcript text at every occurrence of the per-semester marker;
# element 0 is whatever precedes the first marker.
SEMESTER_MARKER = 'Primary\nCollege:'
split_by_semester = text.split(SEMESTER_MARKER)

In [900]:
# Strings treated as grades when scanning transcript lines
grade_list = [
    'A', 'A-',
    'B+', 'B', 'B-',
    'C+', 'C', 'C-',
    'D+', 'D', 'D-',
    'F',
    'P', 'W', 'IP', 'AP', 'NT', 'NP',
]

In [901]:
# One empty accumulator per output column; each name gets its own list object
semesters, subjects, numbers, names, grades, credits = ([] for _ in range(6))

In [902]:
# Walk over the semester chunks and append one entry per course to the
# accumulator lists (semesters, subjects, numbers, names, grades, credits).
#
# The slice skips the first chunk (text before the first marker) and the last
# chunk. The lists are only appended to, so they must be freshly initialised
# before this cell is re-run, otherwise rows are duplicated.
for index1, semester_text in enumerate(split_by_semester[1:-1]):

    # Split by lines within semester
    temp = semester_text.split('\n')

    # Data cleaning: drop the location/level lines, strip soft hyphens, and
    # only then drop blank lines. Filtering blanks *after* the soft-hyphen
    # removal matters: a line made only of '\xad' would otherwise become ''
    # and make the `item[0]` check below raise IndexError.
    temp = [line.replace('\xad', '') for line in temp
            if line not in ('Blacksburg UG', 'UG', 'Virtual')]
    temp = [line for line in temp if line]

    # Course subjects: all-uppercase lines starting with a letter that are
    # neither a grade nor one of the excluded labels.
    count = 0
    for index, item in enumerate(temp):
        if (item == item.upper() and item[0].isalpha()
                and item not in ('GPA', 'UG') and item not in grade_list):
            last_index = index
            semesters.append(index1)
            subjects.append(item)
            count += 1

    # No subject found in this chunk: skip it. Without this guard
    # `last_index` is undefined (first chunk) or stale (later chunks), and
    # grades would be appended with no matching subject, leaving the
    # accumulator lists with unequal lengths.
    if count == 0:
        continue

    # Course numbers: digit-only lines in the window of `num_courses` lines
    # that follows the last subject. `last_index` advances by one per number
    # found so the next window starts after them.
    num_courses = count
    stop = last_index + num_courses + 1
    found_numbers = [item for item in temp[last_index + 1: stop] if item.isdigit()]
    numbers.extend(found_numbers)
    last_index += len(found_numbers)

    # Course names: lines longer than one character in the next window
    stop = last_index + num_courses + 1
    names.extend(item for item in temp[last_index + 1: stop] if len(item) > 1)

    # Course grades: every line that is a known grade; remember where the
    # last one sits so the credits window can start right after it.
    for index, item in enumerate(temp):
        if item in grade_list:
            grades.append(item)
            last_index = index

    # Course credits: the `num_courses` lines following the last grade
    stop = last_index + num_courses + 1
    credits.extend(temp[last_index + 1: stop])

## Create DataFrame

In [903]:
# Assemble the scraped columns into a DataFrame with a fixed column order
column_order = ['Semester', 'Subject', 'Course', 'Title', 'Grade', 'Credits']
column_values = [semesters, subjects, numbers, names, grades, credits]
df = pd.DataFrame(dict(zip(column_order, column_values)), columns=column_order)

In [904]:
# Show the first five scraped courses
df.head(n=5)

Unnamed: 0,Semester,Subject,Course,Title,Grade,Credits
0,0,CHEM,1035,General Chemistry,C+,3.0
1,0,CHEM,1045,General Chemistry Lab,B,1.0
2,0,CS,2114,Softw Des & Data Structures,C,3.0
3,0,ENGE,1215,Foundations of Engineering,C+,2.0
4,0,ENGE,1354,Intro to Spatial Visualization,C+,1.0
