In [1]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import spacy 
from spacy.matcher import Matcher
import re
import pandas as pd
from nltk.corpus import stopwords
import os
import io
import nltk
import docx2txt

In [2]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()

def text_from_doc(doc_path):
    '''
    Helper function to extract plain text from .doc or .docx files
    :param doc_path: path to .doc or .docx file to be extracted
    :return: string of extracted text
    '''
    temp = docx2txt.process(doc_path)
    text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    return ' '.join(text)

In [3]:
def extract_text(file_path, extension):
    '''
    Wrapper function to detect the file extension and call text extraction function accordingly
    :param file_path: path of file of which text is to be extracted
    :param extension: extension of file `file_name`
    '''
    text = ''
    if extension == '.pdf':
        for page in extract_text_from_pdf(file_path):
            text += ' ' + page
    elif extension == '.docx' or extension == '.doc':
        text = text_from_doc(file_path)
    return text

In [29]:
def extract_education(nlp_text):
    '''
    Helper function to extract education from spacy nlp text
    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :return: tuple of education degree and year if year if found else only returns education degree
    '''
    # load pre-trained model
    nlp = spacy.load('en_core_web_sm')

    # Grad all general stop words
    STOPWORDS = set(stopwords.words('english'))

    # Education Degrees
    EDUCATION = [
                'BE','B.E.', 'B.E', 'BS', 'B.S', 'B.S.', 
                'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
                'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
                'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII', 'BACHELOR OF SCIENCE'
            ]
    edu = {}
    # Extract education degree
    for index, tex in enumerate(nlp_text.splitlines()):
        # print(tex)
        tex = re.sub(r'[?|$|.|!|,]', r'', tex)
        #print(tex)
        if tex.upper() in EDUCATION:
            edu[tex] = text + nlp_text[index + 1]
        #if tex.upper() in EDUCATION and tex not in STOPWORDS:
        #    edu[tex] = text + nlp_text[index + 1]
    # print(edu)
    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year.group(0))))
        else:
            education.append(key)
    return education

In [76]:
def extract_university(text):
    df = pd.read_csv('world-universities.csv', header=None)
    universities = [i.lower() for i in df[1]]
    college_name = []
    listex = universities
    listsearch = text.lower().splitlines()

    # print(listsearch)
    print(len(listsearch))
    print(len(universities))
    for i in range(len(listex)):
        for ii in range(len(listsearch)):
            #print(listsearch[ii])
            if re.findall(listex[i], re.sub(' +', ' ', listsearch[ii])):
                college_name.append(listex[i])
    return college_name

In [77]:
resume = "TimothyNguyen2022.pdf"
resume_text = extract_text(resume, os.path.splitext(resume)[1])
# print(resume_text)

In [78]:
extract_education(resume_text)

[]

In [79]:
extract_university(resume_text)

86
9276


TypeError: 'set' object is not subscriptable