# Resume Parser for Extracting Contact Information

In [16]:
import re
import pandas as pd

import spacy
NER = spacy.load("en_core_web_lg")

import PyPDF2
import docx2txt
import glob

### Returning the Number

In [17]:
# potential regex text
# "^\\+?\\d{1,4}?[-.\\s]?\\(?\\d{1,3}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}$"
# "\b\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}\b"
# "\+?\d{1,3}[\s.-]\d{3,4}[\s.-]\d{3,8}"
# "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
# r"\+?\d{1,4}(?:[-.]\d{3}){2,3}"

# These match bare runs of digits only, with no dashes, spaces, or other separators.
#r"\d{8,14}"
#r"\+?\d{8,14}"

# this one works but only with numbers with dashes
# "\+?\d{1,4}[\s.-]\d{3,4}[\s.-]\d{3,8}"

# best one so far
# "\+?\d{1,4}[-.\s]?\d{3,4}[-.\s]?\d{3,8}"
# Some CVs format phone numbers unusually, so I came up with this more permissive pattern
# "\+?\d{2,4}[-.\s]?\d{2,4}[-.\s]?\d{2,4}[-.\s]?\d{2,4}"

In [3]:
def get_phone_numbers(text):
    """Return the first phone-number-like substring found in ``text``.

    The pattern accepts an optional leading ``+`` followed by four groups of
    2-4 digits, each optionally separated by a dash, dot, or whitespace
    character.

    Parameters
    ----------
    text : str or None
        Text to search.

    Returns
    -------
    str or None
        The matched substring exactly as it appears in ``text``, or ``None``
        when ``text`` is empty/``None`` or nothing matches.
    """
    # Nothing to search; also avoids the TypeError re.search raises on None.
    if not text:
        return None

    # Raw string: "\+", "\d" and "\s" are not valid escapes in a normal string
    # literal and trigger an invalid-escape-sequence warning on modern Python.
    phone_regex = r"\+?\d{2,4}[-.\s]?\d{2,4}[-.\s]?\d{2,4}[-.\s]?\d{2,4}"
    match = re.search(phone_regex, text)
    return match.group(0) if match else None

In [18]:
# Quick check: the number at the start of the sentence should be returned with its spacing intact.
get_phone_numbers('+962 78 7184 888, this is the phone number for zaid hani allwansah')

'+962 78 7184 888'

### Returning the Email

In [19]:
def get_email(text):
    """Return the first e-mail address found in ``text``.

    Parameters
    ----------
    text : str or None
        Text to search.

    Returns
    -------
    str or None
        The matched address, or ``None`` when ``text`` is empty/``None`` or
        contains no address.
    """
    # Nothing to search; also avoids the TypeError re.search raises on None.
    if not text:
        return None

    # Local part: word characters plus ".", "+" and "-" (so tagged addresses
    # such as "name+jobs@..." are kept whole).
    # Domain: one or more dot-separated labels after the first one. Every dot
    # must be followed by a label, so a sentence-ending "." directly after the
    # address is not swallowed into the match.
    email_regex = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'
    match = re.search(email_regex, text)
    return match.group(0) if match else None

In [20]:
# Quick check: the address should be returned without the trailing comma.
get_email('allwazaid1@gmail.com, this is the email for zaid hani allwansah')

'allwazaid1@gmail.com'

### Returning the Names using Spacy

Returning names with regex is extremely hard and very inefficient, which is why I've opted for spaCy instead.

In [21]:
def get_name(text):
    """Return the text of the first named entity spaCy detects in ``text``.

    The module-level ``NER`` pipeline is run over ``text`` and the first
    entity is used as the candidate name, whatever its label is.
    NOTE(review): consider restricting this to entities labelled PERSON if
    non-name entities show up in the results.

    Parameters
    ----------
    text : str or None
        Text to analyse.

    Returns
    -------
    str or None
        The first entity's text, or ``None`` when ``text`` is empty/``None``
        or no entity is detected.
    """
    if not text:
        return None

    entities = NER(text).ents
    # Indexing [0] on an empty entity sequence raises IndexError (e.g. when a
    # scanned PDF yields no extractable text), so report "no name" instead.
    if not entities:
        return None
    return entities[0].text

### Reading PDF Files

In [22]:
def read_pdf(file, max_pages=5):
    """Extract contact details from a PDF and append them as a row of ``df``.

    Text is pulled from at most the first ``max_pages`` pages, newlines are
    replaced with two spaces, and the name, phone number and e-mail address
    are looked up with ``get_name``, ``get_phone_numbers`` and ``get_email``.
    The result is appended to the module-level ``df`` in the order
    name, phone, email.

    Parameters
    ----------
    file : str
        Path of the PDF file to read.
    max_pages : int, optional
        Maximum number of leading pages to read (default 5).

    Returns
    -------
    None
        The function works by side effect: one row is added to ``df`` and a
        progress line is printed.
    """
    pages = PyPDF2.PdfReader(file).pages
    text = ''

    # Bound the loop by the real page count instead of relying on an
    # IndexError to stop it. Previously the row was only appended inside the
    # ``except`` branch, so PDFs with ``max_pages`` or more pages were
    # silently skipped.
    for page_number in range(min(len(pages), max_pages)):
        try:
            # Guard against a page yielding no text object at all.
            text += pages[page_number].extract_text() or ''
        except Exception as e:
            # Best effort: report the problem and keep whatever text was
            # collected from the earlier pages.
            print(e)
            break

    text = text.replace('\n','  ')
    email = get_email(text)
    name = get_name(text)
    phone = get_phone_numbers(text)

    # Column order of ``df``: candidate name, phone number, e-mail address.
    df.loc[len(df.index)] = [name, phone, email]
    print('-'*30,f'Scanned the {file} PDF File!','-'*30)

### Reading Word Files

In [23]:
def read_word(file):
    """Pull contact details out of a Word document and record them in ``df``.

    The document text is obtained with ``docx2txt.process``, each newline is
    turned into two spaces, and the e-mail address, name and phone number are
    looked up with the module's helper functions. A new row holding
    name, phone, email (in that order) is appended to the module-level ``df``
    and a progress line is printed.

    Parameters
    ----------
    file : str
        Path of the ``.docx`` file to read.
    """
    flattened = docx2txt.process(file).replace('\n','  ')

    email = get_email(flattened)
    name = get_name(flattened)
    phone = get_phone_numbers(flattened)

    # Append at the next free integer label.
    next_row = len(df.index)
    df.loc[next_row] = [name, phone, email]

    banner = '-'*30
    print(banner,f'Scanned the {file} Word File!',banner)

### Reading Text Files

In [24]:
def read_txt(file):
    """Extract contact details from a plain-text file and append them to ``df``.

    The file's contents are read, newlines are replaced with two spaces, and
    the name, phone number and e-mail address are looked up with
    ``get_name``, ``get_phone_numbers`` and ``get_email``. The result is
    appended to the module-level ``df`` in the order name, phone, email.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    None
        The function works by side effect: one row is added to ``df`` and a
        progress line is printed.
    """
    # Open the path that was passed in. The previous version always opened
    # the hard-coded 'resumes/file.txt', so every text file produced the same
    # row. The ``with`` block closes the handle, so no explicit close is needed.
    with open(file, 'r') as f:
        text = f.read()

    text = text.replace('\n','  ')
    email = get_email(text)
    name = get_name(text)
    phone = get_phone_numbers(text)

    # Column order of ``df``: candidate name, phone number, e-mail address.
    df.loc[len(df.index)] = [name, phone, email]
    print('-'*30,f'Scanned the {file} Text File!','-'*30)

### Processing the files

In [25]:
# Collect every path directly inside the resumes/ folder; the bare name on the
# last line displays the resulting list as the cell output.
files = glob.glob('resumes/*')
files

['resumes\\CV-4.pdf',
 'resumes\\file.txt',
 'resumes\\functionalSample.pdf',
 'resumes\\MAHMOUD-CSV.pdf',
 'resumes\\Resume.docx',
 'resumes\\Resume.pdf']

In [26]:
# Start from an empty table; read_pdf, read_word and read_txt each append one
# row to it in the column order name, phone, email.
df = pd.DataFrame(columns=['Candidate Name', 'Phone Number', 'Email Address'])
df

Unnamed: 0,Candidate Name,Phone Number,Email Address


In [27]:
# Dispatch each path to the reader that matches its extension. The comparison
# is done on a lower-cased copy of the path so that files such as "CV.PDF" or
# "Resume.DOCX" are not rejected, and the leading dot is required so that only
# real extensions match.
for file in files:
    lowered_path = file.lower()
    if lowered_path.endswith('.pdf'):
        read_pdf(file)
    elif lowered_path.endswith('.docx'):
        read_word(file)
    elif lowered_path.endswith('.txt'):
        read_txt(file)
    else:
        print('*'*30,f'{file} Is an Invalid File Format','*'*30)

------------------------------ Scanned the resumes\CV-4.pdf PDF File! ------------------------------
------------------------------ Scanned the resumes\file.txt Text File! ------------------------------
------------------------------ Scanned the resumes\functionalSample.pdf PDF File! ------------------------------
------------------------------ Scanned the resumes\MAHMOUD-CSV.pdf PDF File! ------------------------------
------------------------------ Scanned the resumes\Resume.docx Word File! ------------------------------
------------------------------ Scanned the resumes\Resume.pdf PDF File! ------------------------------


In [28]:
# Display the rows appended by the processing loop.
df

Unnamed: 0,Candidate Name,Phone Number,Email Address
0,Mohammad Khaldoun,+962 792152523,kaljermy@gmail.com
1,John Faisal Doe,+962471545454,johndoe1@gmail.com
2,John W. Smith,123 4567 8910,jwsmith@colostate.edu
3,Mahmoud,0791916343,
4,MOHESEN,+9627 8718 4888,allgjdid1@gmail.com
5,Zaid Hani Allwansah,+962787184888,allwazaid1@gmail.com


In [29]:
# Save the table to Contact_information.csv, leaving out the index column.
df.to_csv('Contact_information.csv', index=False)