In [97]:
import PyPDF2 as pypdf
import regex as re
import numpy as np
import csv
import pandas as pd

In [98]:
# Open the AAMU 2019-2020 bulletin PDF in binary mode and wrap it in a PyPDF2 reader.
# The handle is not closed in this cell; later cells keep using `pdfread`.
PDFfile = open("aamu2019-2020.pdf", "rb")
pdfread = pypdf.PdfFileReader(PDFfile)
# Display the reader object as the cell output to confirm it was created.
pdfread

<PyPDF2.pdf.PdfFileReader at 0x7f6c0716c9b0>

In [99]:
# Preview a single page of course descriptions: page index 218 is zero-based
# (the extracted text shows it is printed page 219 of the bulletin).
y = pdfread.getPage(218)
# Replace newlines with spaces so the page text is shown as one line;
# the regex patterns below are run on text flattened the same way.
y.extractText().replace('\n', ' ')

'COURSE DESCRIPTIONS, AAMU Undergraduate Bulletin ,  2019 - 2020   ~  219   ~     ACC 40 3   Advanced Cost Accounting     3  credit  hours . A study of the application of cost accounting data to managerial planning  and control, emphasizing special purpose cost accounting statement and recent developments in the use of quantitative  tools in management   decision - making. Recommended for accounting  majors. Prerequisites: ACC 303.     ACC 421   Advanced Accounting     3  credit  hours . A detailed study of the accounting principles and problems related to  partnerships and business combinations. A substantial part of  the course is devoted to student\'s reports on other advanced  accounting   topics. Prerequisites: ACC 30 1 .   Corequisites: ACC 302.     ACC 441   Auditing I     3  credit  hours . The study of generally accepted auditing standards and procedures underlying the  certification of financial statements by certified public accountants. Through problems and cases, the stude

In [100]:
# Total number of pages in the PDF; gives the upper bound for page ranges passed to get_data.
pdfread.getNumPages()

357

In [101]:
def get_data(csv_name, pdf_name, page_range, pattern, dep_group, abbrev_group, c_group, t_group, desc_group,
            year, grad_status):
    """Extract keyword-matching courses from a catalog PDF and write them to a CSV file.

    Every page in PAGE_RANGE is flattened to a single line (newlines replaced by
    spaces) and searched with PATTERN. A course is written to the CSV only when its
    title or description contains one of the keywords "agri", "food" or "animal"
    (case-insensitive). Each matching course is written exactly once.

    Parameters
    ----------
    csv_name : path of the CSV file to create (overwritten if it exists).
    pdf_name : path of the catalog PDF to read.
    page_range : two-item sequence ``[start, stop]`` of zero-based page indices;
        ``stop`` is exclusive.
    pattern : regular expression with capture groups describing one course entry.
    dep_group, abbrev_group, c_group, t_group, desc_group : 1-based numbers of the
        capture groups holding the full department name, the department
        abbreviation, the course number, the course title and the description.
        Only the first three characters of the course-number group are converted
        to an integer.
    year : catalog year, written verbatim into the "Academic Catalogue Year" column.
    grad_status : None (if unknown), 'G', 'UG', or an int threshold. With an int,
        courses numbered at or above the threshold are labelled 'G', others 'UG'.

    Returns
    -------
    None. Side effects: writes CSV_NAME and prints every regex match.

    Raises
    ------
    ValueError if the first three characters of a matched course number are not
    an integer; OSError if either file cannot be opened.
    """
    header = ["Department Name", "Course Catalogue Number", "Course Name",
              "Course Description", "Graduate/Undergraduate", "Format", "Lab",
              "Academic Catalogue Year"]
    FS_keys = ["agri", "food", "animal"]
    lab_pattern = r'Laboratory|Lab'

    # Full department names appear only on a department's first entry, so remember
    # them per abbreviation and reuse them for the entries that follow.
    dep_names = {}

    # Context managers guarantee both files are closed (and the CSV flushed) even
    # if extraction fails part-way; the PDF must stay open while pages are read.
    with open(pdf_name, "rb") as PDFfile:
        pdfread = pypdf.PdfFileReader(PDFfile)
        # newline='' is what the csv module requires to avoid blank rows on
        # platforms that translate line endings.
        with open(csv_name, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(header)

            for i in range(page_range[0], page_range[1]):
                page_text = pdfread.getPage(i).extractText().replace('\n', ' ')
                for match in re.findall(pattern, page_text):
                    print(match)

                    # Getting full department name
                    abbrev = match[abbrev_group - 1]
                    if match[dep_group - 1] != '':
                        dep_names[abbrev] = match[dep_group - 1]
                    dep_name = dep_names.get(abbrev)

                    # Other groups
                    course_num = int(match[c_group - 1][:3])
                    title = str(match[t_group - 1])
                    desc = str(match[desc_group - 1])

                    # Filtering: one row per course, however many keywords it hits
                    # (a per-keyword loop would write duplicate rows).
                    is_relevant = any(
                        re.search(key, title, flags=re.IGNORECASE)
                        or re.search(key, desc, flags=re.IGNORECASE)
                        for key in FS_keys
                    )
                    if not is_relevant:
                        continue

                    G_UG = grad_status
                    if grad_status and type(grad_status) == int:
                        # Integer threshold: compare against this course's number.
                        G_UG = "G" if course_num >= grad_status else "UG"

                    # NOTE(review): the pattern is unanchored, so words such as
                    # "available" or "labor" also count as a lab hit - confirm
                    # whether word boundaries are wanted.
                    lab = bool(re.search(lab_pattern, title, flags=re.IGNORECASE) or
                               re.search(lab_pattern, desc, flags=re.IGNORECASE))

                    # Leaving format blank for now
                    csv_writer.writerow([dep_name, course_num, title, desc,
                                         G_UG, None, lab, year])
            
    
# Regex for one course entry in the flattened AAMU catalog text. Capture groups:
#   1: optional department name followed by three whitespace characters
#   2: course prefix letters
#   3: course number (digits plus an optional trailing word character)
#   4: course title (any run of non-digit characters)
#   5: text after "credit hour(s)", ended by a run of five whitespace characters
aamu_pattern = r'(\w+\s{3})?([a-zA-Z]+)\s([\d]+\w?)\s{3}([^\d]+)\d\s+credit\s+hours?(.+?)\s{5}'
# Draft variant without the trailing five-whitespace requirement, kept for reference:
#aamu_last_pattern = r'(\w+\s{3})?([a-zA-Z]+)\s(\d+)\s{3}(.+?)\d\s\scredit\s\shours\s\.(.+)'
# A full run would use page_range=[217, 357]; the call below covers a partial test set.
get_data(
    csv_name='Alabama_A&M_University_Sample.csv',
    pdf_name="aamu2019-2020.pdf",
    page_range=[217, 228],
    pattern=aamu_pattern,
    dep_group=1,
    abbrev_group=2,
    c_group=3,
    t_group=4,
    desc_group=5,
    year='2019-2020',
    grad_status='UG',
)

('Accounting   ', 'ACC', '203', 'Introduction to Accounting I     ', ' . Basic conc epts with a focus on how accounting events affect financial  statements. Emphasizes both preparation and use of external financial reports. Topics include accrual versus cash,  receivables, payables, inventory, long - term operational assets, long - term liabili  procedures, and financial statement   analysis. Prerequisites: None.')
('', 'ACC', '204', 'Introduction to Accounting II     ', ' . Accounting II is a continuation of ACC 203. Basic concepts associated  with managerial accounting. Emphasizes the use of relevant information for planning, control and decision - making.  Topics include cost behavior, cost allocation, product costing, budget ing, responsibility accounting, and capital  budgeting. Prerequisites: ACC 203 .')
('', 'ACC', '219', 'Managerial Accounting     ', ' . A study of the uses of accounting information for planning and control in an  enterprise area of study including budgeting, f

('', 'AGB', '405', 'Extension Methods     ', ' . Principles and procedures in developing extension programs in agriculture, with  emphasis on program determination, teaching  methods and relationship with teaching adults in the life - learning process.  Prerequisites: None .')
('', 'AGB', '418', 'Agricultural Leadership     ', ' . Development of skills, qualities, and behaviors which enable effective  leadership, study of group and organi zation function, interpersonal relationships, teaming and leadership in various  organizational settings. Prerequisites: None .')
('', 'AGB', '420', 'Agricultural Cooperatives     ', ' . This course explores the functioning, management, and role of cooperatives  in   agriculture. It is designed to provide students with greater appreciation of the economic and legal underpinning of  institutional arrangements in agriculture and of the potential role such arrangements may play in solving many of the  pressing problems in  production and marketing of agr

('', 'ART', '299', 'Advanced Digital  Photography      ', '   ( 6 clock hours per week) .  Highly technical course consists mainly of  technic al applications, readings, quizzes and discussion. Focuses on learning advanced image - editing software such as  Adobe Photoshop, and photo management and manipulation software .   Covers montage and color management systems  including profiling for color printing and monitors. Emphasis is on composition and print quality as well as creative  expression.   Prerequisites : ART 298 or equivalent .')
('', 'ART', '305', 'Ceramics I      ', '   (6 clo ck hours per week) .  An investigation of the basic processes, methods, and materials of  working in clay. Our primary emphasis is hands - on studio work, with frequent demonstrations of primarily hand - building methods and decorating technique. In lectures we w ill explore historical and contemporary hand - built forms and  decoration as they relate to specific assignments. Assigned projects are desi

('', 'ART', '403', 'Classical Art      ', ' .   A study of the art and archi tecture of ancient Greece and its influence on the development  of the visual arts of the Roman Empire. Prerequisites:   Junior or senior standing .')
('', 'ART', '404', 'Medieval Art      ', ' .   A study of the influence of Christianity on the art of the  western world as expressed in  early Christian, Romanesque, and Gothic architecture, sculpture, and painting. Prerequisites:   Junior or senior standing .')
('', 'ART', '405', 'Renaissance Art      ', ' .   A study of the visual arts of Italy from 1250 to 1550, taking  into consideration the rise  of the artist as a creative individual and his expanding role in society. Prerequisites:   Junior or senior standing .')
('', 'ART', '406', 'Fashion Illustration      ', '   (6 clock hours per week) .   A course focusing on  developing originality and creative   which the clothing is to be worn.   Prerequisites:  ( ART 110,  ART  209,  ART  309)   or  instructor  

In [102]:
# Load the CSV written by the get_data call above and display it to check the extracted rows.
pd.read_csv('Alabama_A&M_University_Sample.csv')

Unnamed: 0,Department Name,Course Catalogue Number,Course Name,Course Description,Graduate/Undergraduate,Format,Lab,Academic Catalogue Year
0,Agribusiness,102,Introduction to Careers in Agriculture,. This course provides the agribusiness stud...,UG,,False,2019-2020
1,Agribusiness,199,Computers in Agriculture,. This course is designed for the freshman s...,UG,,True,2019-2020
2,Agribusiness,221,Introduction to Agricultural Economics,. An introduction to the field of agricultura...,UG,,False,2019-2020
3,Agribusiness,299,Quantitative Applications in Agribusiness,. This course is an introduction to quantitat...,UG,,False,2019-2020
4,Agribusiness,300,Agribusiness Statistics,. An introduction to sources and methods of c...,UG,,False,2019-2020
5,Agribusiness,302,Organization and Administration of Career Tech...,". A course dealing with identification, expl...",UG,,False,2019-2020
6,Agribusiness,323,Agricultural Marketing,. Provides a critical analysis of method s em...,UG,,False,2019-2020
7,Agribusiness,333,Commodity Marketing,. Focus on using futures markets in managing ...,UG,,False,2019-2020
8,Agribusiness,405,Extension Methods,. Principles and procedures in developing ext...,UG,,False,2019-2020
9,Agribusiness,418,Agricultural Leadership,". Development of skills, qualities, and behav...",UG,,False,2019-2020


In [103]:
# Hand-copied excerpt of flattened catalog text whose final entry is NOT followed by
# five whitespace characters, so aamu_pattern cannot match that final entry.
sample = 'Accounting   ACC 203   Introduction to Accounting I     3  credit  hours . Basic conc epts with a focus on how accounting events affect financial  statements. Emphasizes both preparation and use of external financial reports. Topics include accrual versus cash,  receivables, payables, inventory, long - term operational assets, long - term liabili  procedures, and financial statement   analysis. Prerequisites: None.     ACC 204   Introduction to Accounting II     3  credit  hours . Accounting II is a continuation of ACC 203. Basic concepts associated  with managerial accounting. Emphasizes the use of relevant information for planning, control and decision - making.  Topics include cost behavior, cost allocation, product costing, budget ing, responsibility accounting, and capital  budgeting. Prerequisites: ACC 203 ,   ACC  204 .     ACC 219   Managerial Accounting     3  credit  hours . A study of the uses of accounting information for planning and control in an  enterprise area of study including budgeting, finan cial analysis, basic cost accounting reports, and capital budgeting. Not  for accounting majors. Prerequisites: ACC 203 ,   ACC  204 .     ACC 301   Intermediate Accounting I     3  credit  hours . A study of financial reporting theory and process. Each major asset catego ry  is analyzed in balance sheet order. Prerequisites: ACC 203 ,   ACC  204 . '

# Remove every entry the main pattern matches; what is left is the unmatched tail.
rest = re.sub(aamu_pattern, '', sample)

# Raw string: \w, \s, \d and \. are regex escapes, not Python string escapes, and
# writing them in a plain literal triggers invalid-escape-sequence warnings.
test = re.search(r'(\w+\s{3})?([^\s\d]+)\s(\d+)\s{3}(.+?)\d\s\scredit\s\shours\s\.(.+)', rest)
assert test is not None, "last-entry pattern did not match the leftover text"

# Show prefix, number, title and description, separated by a line holding one space.
print(test.group(2))
print(" ")
print(test.group(3))
print(" ")
print(test.group(4))
print(" ")
print(test.group(5))

ACC
 
301
 
Intermediate Accounting I     
 
 A study of financial reporting theory and process. Each major asset catego ry  is analyzed in balance sheet order. Prerequisites: ACC 203 ,   ACC  204 . 


In [104]:
# Sanity check: second capture group (course prefix) of the first entry matched in `sample`.
re.findall(aamu_pattern, sample)[0][1]

'ACC'

In [105]:
# Draft variant of aamu_pattern, only displayed here (not assigned to any name).
# Differences from aamu_pattern: the course-number group also accepts whitespace
# ([\d\s]+), and the " . " after "credit hours" is consumed before the description group.
# NOTE(review): this is not a raw string; prefix it with r if it is ever put to use.
'(\w+\s{3})?([a-zA-Z]+)\s([\d\s]+\w?)\s{3}([^\d]+)\d\s+credit\s+hours?\s?\.\s+(.+?)\s{5}'

'(\\w+\\s{3})?([a-zA-Z]+)\\s([\\d\\s]+\\w?)\\s{3}([^\\d]+)\\d\\s+credit\\s+hours?\\s?\\.\\s+(.+?)\\s{5}'