In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
#@title Importing required packages

%pip install pdfminer.six
%pip install docx2txt
%pip install resume_parser


from pdfminer.high_level import extract_text
import pandas as pd
import docx2txt
import nltk
import re
import spacy
import os

In [None]:
#@title Downloading required modules


# Download the NLTK data used by the nltk-based extractors in this notebook.
# They call sent_tokenize / word_tokenize (punkt), pos_tag (perceptron tagger)
# and ne_chunk (maxent chunker + words), so fetching only 'punkt' and 'words'
# leaves pos_tag / ne_chunk failing with LookupError on a fresh kernel.
# Both the classic and the newer (`*_tab` / `*_eng`) resource ids are requested
# because the id that is looked up depends on the installed NLTK version;
# nltk.download() skips packages that are already up to date.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'words',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
#@title Extracting text from PDF

def extract_text_from_pdf(pdf_path):
    """Return whatever pdfminer's ``extract_text`` produces for ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

In [None]:
#@title Extracting text from Doc

def extract_text_from_docx(docx_path):
    """Read a .docx file with ``docx2txt`` and return its text.

    Tab characters are replaced by single spaces. ``None`` is returned when
    ``docx2txt.process`` yields nothing (empty string or ``None``).
    """
    raw_text = docx2txt.process(docx_path)
    return raw_text.replace('\t', ' ') if raw_text else None

In [None]:
#@title Extracting name using nltk

def extract_names(txt):
    """Collect every PERSON entity that nltk's named-entity chunker finds.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and passed to ``nltk.ne_chunk``. For every chunk labelled
    ``PERSON`` the words of its leaves are joined with single spaces.

    Returns the names as a list, in order of appearance (duplicates kept).
    """
    found = []
    for sentence in nltk.sent_tokenize(txt):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        found.extend(
            ' '.join(leaf[0] for leaf in subtree.leaves())
            for subtree in nltk.ne_chunk(tagged_tokens)
            # plain (word, tag) tuples have no label(); only chunk subtrees do
            if hasattr(subtree, 'label') and subtree.label() == 'PERSON'
        )
    return found

In [None]:
#@title Extracting name using Spacy

# Load the `en_core_web_sm` spaCy pipeline once at module level; the
# spaCy-based extractors in this notebook call it as NER(text) and read `.ents`.
NER = spacy.load("en_core_web_sm")
def person(txt):
    """Return the text of every spaCy entity in ``txt`` labelled ``PERSON``.

    The module-level ``NER`` pipeline is run on ``txt``; matching entities
    are returned as a list in document order (duplicates kept).
    """
    doc = NER(txt)
    return [entity.text for entity in doc.ents if entity.label_ == "PERSON"]

In [None]:
#@title Extracting Phone Number

# Compiled once at import time instead of on every call.
# Capture groups, in order: country code, area code written in parentheses,
# bare area code, exchange (3 digits), line number (4 digits), extension.
PHONE_REG = re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?')


def mobile_number(text):
    """Return the first phone number found in ``text`` as a digit string.

    The digits of the country code, area code, exchange and line number are
    concatenated. When that yields more than 10 digits a country code is
    assumed to be present and the result is prefixed with ``'+'``.

    A trailing extension (``x12``, ``ext. 12``, ``#12`` ...) is recognised by
    the pattern but deliberately left out of the result: appending its digits
    made a plain 10-digit number look longer than 10 digits and produced a
    bogus ``'+'``-prefixed value.

    Args:
        text: Text to search; ``None`` or an empty string is accepted.

    Returns:
        The normalised number, or ``None`` when no phone number is found.
    """
    if not text:
        return None

    match = PHONE_REG.search(text)
    if match is None:
        return None

    # Groups that did not take part in the match become '' so they join cleanly.
    country, area_paren, area_bare, exchange, line, _extension = match.groups(default='')
    number = country + area_paren + area_bare + exchange + line

    if len(number) > 10:
        return '+' + number
    return number

In [None]:
#@title Extracting Email

# re.IGNORECASE is required: the character classes only list a-z, so without
# the flag an address such as "John.Doe@gmail.com" was cut down to
# "oe@gmail.com" and fully upper-case addresses were missed altogether.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)
def extract_emails(resume_text):
    """Return every e-mail address found in ``resume_text``.

    Args:
        resume_text: Text to search; ``None`` or an empty string is accepted.

    Returns:
        A list of the matched addresses in order of appearance, with their
        original letter case; empty when nothing matches.
    """
    if not resume_text:
        return []
    return EMAIL_REG.findall(resume_text)

In [None]:
#@title Extracting Education and School using nltk


# Substrings that mark an organisation name as an educational institution.
RESERVED_WORDS = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'faculdades',
    'Schola',
    'schule',
    'lise',
    'lyceum',
    'lycee',
    'polytechnic',
    'kolej',
    'ünivers',
    'okul',
]


def extract_education(input_text):
    """Return the educational institutions that nltk finds in ``input_text``.

    Every chunk that ``nltk.ne_chunk`` labels ``ORGANIZATION`` is collected,
    then only the names containing one of ``RESERVED_WORDS`` (compared
    case-insensitively) are kept.

    Args:
        input_text: Resume text; ``None`` or an empty string is accepted.

    Returns:
        A set of organisation names (original casing); empty when none match.
    """
    if not input_text:
        return set()

    organizations = []

    # first get all the organization names using nltk
    for sent in nltk.sent_tokenize(input_text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # plain (word, tag) tuples have no label(); only chunk subtrees do
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                organizations.append(' '.join(c[0] for c in chunk.leaves()))

    # Keep only the organisations whose name contains a reserved word
    # (college, university etc...). A substring test is used on purpose:
    # the previous `if org.lower().find(word):` relied on the truthiness of
    # str.find, which is -1 (truthy) on a miss and 0 (falsy) on a hit at the
    # start, so almost every organisation was reported as education.
    keywords = [word.lower() for word in RESERVED_WORDS]
    education = set()
    for org in organizations:
        org_lower = org.lower()
        if any(keyword in org_lower for keyword in keywords):
            education.add(org)

    return education


In [None]:
#@title Extracting skills using dataset


# Load the `en_core_web_sm` spaCy pipeline at module level; extract_skills()
# below calls it as nlp(resume_text) to tokenize the resume and flag stop words.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): noun-chunk (multi-word) skill matching is disabled; only
# single tokens are compared against the skills list.
# noun_chunks = nlp.noun_chunks

def extract_skills(resume_text, skills_file="/content/skills.csv"):
    """Return the known skills that appear as single tokens in ``resume_text``.

    The skill vocabulary is the header row of ``skills_file`` (one skill per
    column name). Headers are stripped and lower-cased before comparison, so
    a header such as ``"Python"`` or ``" SQL"`` still matches the lower-cased
    resume tokens; previously such headers could never match.

    Args:
        resume_text: Resume text to scan.
        skills_file: Path of the CSV whose column names are the skills.
            Defaults to the location the notebook originally hard-coded.

    Returns:
        A sorted list of the distinct matched skills, each capitalised with
        ``str.capitalize`` (e.g. ``"sql"`` -> ``"Sql"``). Sorting makes the
        output reproducible instead of depending on set iteration order.
    """
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # only the column names of the csv file are used as the skill vocabulary
    data = pd.read_csv(skills_file)
    known_skills = {str(column).strip().lower() for column in data.columns}

    # check for one-grams (example: python)
    matched = {token.lower() for token in tokens if token.lower() in known_skills}

    return sorted(skill.capitalize() for skill in matched)


In [None]:
#@title Extracting education and school using Spacy


# Substrings that mark an organisation name as an educational institution.
RESERVED_WORDS = [
    'school', 'college', 'univers', 'academy',
    'faculty', 'institute', 'faculdades', 'Schola',
    'schule', 'lise', 'lyceum', 'lycee',
    'polytechnic', 'kolej', 'ünivers', 'okul',
]


def extract_education_Spacy(input_text, skills):
    """Return the educational institutions that spaCy finds in ``input_text``.

    The module-level ``NER`` pipeline is run on the text. Entities labelled
    ``ORG`` whose exact text is not contained in ``skills`` are candidates;
    a candidate is kept when its lower-cased name contains any lower-cased
    entry of ``RESERVED_WORDS``.

    Returns a set of the matching entity texts.
    """
    doc = NER(input_text)

    candidates = [
        entity.text
        for entity in doc.ents
        if entity.label_ == "ORG" and entity.text not in skills
    ]

    return {
        name
        for name in candidates
        if any(keyword.lower() in name.lower() for keyword in RESERVED_WORDS)
    }


In [None]:
#@title Driver Code


def _first_or_none(items):
    """Return the first element of ``items``, or ``None`` when it is empty."""
    return items[0] if items else None


def _print_resume_report(resume_text):
    """Run every extractor on ``resume_text`` and print one labelled line per field.

    Shared by the PDF and DOCX paths so both produce the same report; an
    extractor that finds nothing prints ``None`` / an empty list instead of
    raising ``IndexError``.
    """
    # Extracting name using nltk
    print("NAME : ", _first_or_none(extract_names(resume_text)))

    # Extracting name using spacy
    print("Name : ", _first_or_none(person(resume_text)))

    # Extracting phone number
    print("Phone number : ", mobile_number(resume_text))

    # Extracting email
    email = extract_emails(resume_text)
    if email:
        print("Email is : ", email)

    # Extracting skills using the skills dataset
    skills = extract_skills(resume_text)
    print("Skills : ", skills)

    # Extracting education nltk
    print("Education : ", list(extract_education(resume_text))[:2])

    # Extracting education spacy (entities that are really skills are excluded)
    print("Education : ", list(extract_education_Spacy(resume_text, skills))[:2])


if __name__ == '__main__':

    file_path = '/content/Dishant_Thakkar_Resume.docx'  # replace with your file path

    # Pick the text reader from the (case-insensitive) file extension.
    text_readers = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
    }
    reader = text_readers.get(os.path.splitext(file_path)[1].lower())

    if reader is None:
        print("Unsupported file type")
    else:
        resume_text = reader(file_path)
        if resume_text:
            _print_resume_report(resume_text)
        else:
            # extract_text_from_docx returns None for an empty document;
            # passing that on to the extractors would raise.
            print("No text could be extracted from", file_path)
