## **Importing Libraries**

In [None]:
# Installing packages to read pdf and document(doc/docx)
# Installing Spacy  for Extracting the important text

!pip install pdfminer.six 
!pip install docx2txt
!pip install python-docx
!pip install spacy

In [3]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

import docx2txt
import io
import spacy
from spacy.matcher import Matcher
import en_core_web_sm

import re

import pandas as pd

In [4]:
# Mount Google Drive into the Colab filesystem; the resume files read below
# are addressed through paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Reading PDF as Text**

In [22]:
def extract_text_from_pdf(pdf_path):
    """Yield the text of a PDF file, one string per page.

    This is a generator: the file stays open while pages are being consumed
    and is closed when the generator finishes or is closed.

    Args:
        pdf_path: Path of the PDF file to read.

    Yields:
        The text extracted from each page, in the order the pages are
        returned by ``PDFPage.get_pages``.
    """
    # These two objects do not depend on the page, so build them once instead
    # of once per page.
    resource_manager = PDFResourceManager()
    layout_params = LAParams()

    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of the PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            # fresh in-memory buffer that receives the text of this page only
            file_handle = io.StringIO()

            # converter writing the extracted text into the buffer
            converter = TextConverter(
                resource_manager,
                file_handle,
                codec='utf-8',
                laparams=layout_params
            )

            try:
                # interpreter that feeds the page content to the converter
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )

                # process current page and capture its text
                page_interpreter.process_page(page)
                text = file_handle.getvalue()
            finally:
                # Release the handles *before* yielding: code placed after a
                # yield never runs if the consumer stops iterating early, and
                # is skipped as well when process_page raises.
                converter.close()
                file_handle.close()

            yield text


# Extract every page of the sample resume and merge the pages into one string.
# Each page is prefixed with a single space (so the result starts with a
# space); str.join builds the string in one pass instead of re-copying it on
# every `+=`.
pdfText = ''.join(
    ' ' + page
    for page in extract_text_from_pdf('/content/drive/My Drive/Kaggle/ResumeRanker/Resume/AnushaViraj_Resume.pdf')
)

In [23]:
pdfText

' Anusha Viraj \nSenior Automation Engineer \n\n \n\nTotal 8+ years of experience in Quality Engineering backed with experience on creation and maintenance of \nvarious frameworks, understanding client requirements to provide end to end automation solutions. \nProficient knowledge of different automation tools and types of testing across multiple platforms. Hands on \nexperience in ML with knowledge in different types supervised and unsupervised models, model \ndeployments, EDA. Hard working and passionate to learn latest technologies and implement it in the best \npossible way. Ability to work independently as well as in a team environment.  \n \n\nWORK EXPERIENCE \nJP Morgan & Chase \nProject:  Investor Services Test Engineering  \n•  Developed a new framework that can be used within the firm to automate JS based \n\n \n\n \n\n \n\n \n\n               May’ 19 - present \n\n  \n\n• \n\napplications using Cypress and NodeJS server environment. \nEnhanced, maintained, provided support a

## **Reading document as Text**

In [None]:
def extract_text_from_doc(doc_path):
    """Return the text of a .docx file as a single space-separated string.

    The document is read with ``docx2txt.process``. Empty lines are dropped,
    tabs inside the remaining lines become single spaces, and the lines are
    joined with one space.

    Args:
        doc_path: Path of the document to read.

    Returns:
        The flattened document text.
    """
    raw_text = docx2txt.process(doc_path)

    cleaned_lines = []
    for raw_line in raw_text.split('\n'):
        # skip empty lines so they do not turn into doubled separators
        if not raw_line:
            continue
        cleaned_lines.append(raw_line.replace('\t', ' '))

    return ' '.join(cleaned_lines)

# Read a sample .docx resume and display the flattened text as the cell output.
text = extract_text_from_doc("/content/drive/My Drive/Kaggle/ResumeRanker/Resume/Rashmi_Java_5Yrs.docx")
text

'Rashmi Kumari Email: er.rash.rashmi@gmail.com Mobile: +91 8904664464       EXPERIENCE SUMMARY I have almost 5 years of diversified and progressively challenging experience Currently working as senior member of technical staff with Oracle India Pvt Ltd (June 2017 to current).  Have worked as Senior System Engineer with Infosys with TOYOTA (May 2016 to June 2017) Have worked with Cognizant(Dec 2013 to May2016) Good knowledge of Core Java and web services. Domain knowledge of BFS (CIB) and MFGADM Successfully delivered projects under strict schedules and quality control.  Versatile team player and individual contributor with strong analytical and problem solving skills        TECHNICAL  SUMMARY Programming Languages –Java, PL/SQL DBMS and Tools – SQL (Oracle 11g), Eclipse, SQL Server Web Based Technologies - JSP, Angular JS. Web/Application Servers – SVN, Maven, Tomcat, GIT Operating Systems – Windows 7, Windows 10,  Unix Basics, Linux Have knowledge of Data structure and algorithm. Have

## **Extracting name from text**

In [24]:
# load pre-trained model
# `nlp` is module-level state: extract_name, extract_skills and
# extract_companies all call it to parse their input text.
nlp = spacy.load('en_core_web_sm')

# initialize matcher with a vocab
# `matcher` is module-level too; extract_name registers its 'NAME' pattern on it.
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    """Guess the candidate's name from resume text.

    Heuristic: the name is taken to be the first span of two consecutive
    proper nouns reported by the module-level ``matcher``. Any other pair of
    proper nouns that the matcher reports first is returned instead.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        The text of the first matching two-token span, or ``None`` when the
        text contains no such span.
    """
    nlp_text = nlp(resume_text)

    # `matcher` is shared module-level state, so register the pattern only
    # once; re-adding it on every call would keep growing the rules stored
    # under the 'NAME' key.
    if 'NAME' not in matcher:
        # First name and last name are assumed to be tagged as proper nouns
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
        matcher.add('NAME', [pattern])

    # only the first reported match is used
    for _match_id, start, end in matcher(nlp_text):
        return nlp_text[start:end].text

    return None

# Candidate name guessed from the text of the PDF resume.
name = extract_name(pdfText)

In [34]:
name

'Anusha Viraj'

## **Extracting Skills**

In [25]:
# Skills the candidate is screened against. extract_skills compares lower-cased
# resume tokens and noun chunks with these entries.
requiredSkills = ['automation framework development', 'automation testing', 'jenkins', 'maven', 'java', 'python', 'agile', 'CI/CD', 'postman', 'api', 'restapi', 'rest','testing', 'spring boot framework']

In [26]:
def extract_skills(resume_text, required_skills=None):
    """Return the required skills that are mentioned in the resume text.

    Single-word skills are looked up among the non-stop-word tokens of the
    text, multi-word skills among its noun chunks. The comparison is
    case-insensitive on both sides, so an entry such as 'CI/CD' in the skill
    list can match as well.

    Args:
        resume_text: Plain text of the resume.
        required_skills: Optional iterable of skill names to look for.
            Defaults to the module-level ``requiredSkills`` list.

    Returns:
        A sorted list of the distinct matched skills, each formatted with
        ``str.capitalize`` (e.g. 'Automation testing').
    """
    if required_skills is None:
        required_skills = requiredSkills

    # Normalise the wanted skills once. The resume side is lower-cased below,
    # so comparing against the raw list would silently drop mixed-case entries.
    wanted = {skill.lower().strip() for skill in required_skills}

    nlp_text = nlp(resume_text)
    found = set()

    # check for one-grams (example: python), ignoring stop words
    for token in nlp_text:
        if token.is_stop:
            continue
        word = token.text.lower()
        if word in wanted:
            found.add(word)

    # check for bi-grams and tri-grams (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        phrase = chunk.text.lower().strip()
        if phrase in wanted:
            found.add(phrase)

    # sorted so the result no longer depends on set iteration order
    return sorted(skill.capitalize() for skill in found)

# Required skills found in the text of the PDF resume.
candidateSkillset = extract_skills(pdfText)

In [27]:
candidateSkillset

['Agile',
 'Jenkins',
 'Maven',
 'Automation testing',
 'Testing',
 'Java',
 'Python']

## **Extract Mobile Number**

In [28]:
def extract_mobile_number(text):
    """Return every phone-number-like substring found in the text.

    A candidate is an optional leading '+', a digit, then 8 to 13 characters
    that are digits, spaces or hyphens, and a closing digit.

    Args:
        text: Text to search.

    Returns:
        List of matched substrings in order of appearance (possibly empty).
    """
    phone_pattern = re.compile(r'\+?\d[\d -]{8,13}\d')
    return phone_pattern.findall(text)

# Phone-number-like strings found in the text of the PDF resume (a list).
phoneNo = extract_mobile_number(pdfText)

In [29]:
phoneNo

['9543601488']

## **Number of matching skills**

In [30]:
# Compare the size of the required skill list with the number of distinct
# skills that extract_skills found in the resume.
print("Total skills required: {}".format(len(requiredSkills)))
print("Out of total, no of skill Candidate meet: {} ".format(len(candidateSkillset)))

Total skills required: 14
Out of total, no of skill Candidate meet: 7 


In [35]:
# Module-level accumulator: every extract_companies call adds its matches here.
organizations = []


def extract_companies(text):
    """Collect the entities that the spaCy model labels as organisations.

    The matches are appended to the module-level ``organizations`` list and
    also returned, so the result of a single call can be used directly.

    Args:
        text: Plain text to analyse.

    Returns:
        List with the text of every entity labelled "ORG", in document order.
    """
    doc = nlp(text)
    found = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    # keep filling the shared list, which existing code reads after the call
    organizations.extend(found)
    return found

# extract_companies records its matches in the module-level `organizations`
# list, which is what gets printed here.
org = extract_companies(pdfText)
print(organizations)

['Anusha Viraj', 'EDA', 'JP Morgan & Chase', 'JS', 'the LOB for Automation of the UI', 'LOB', 'UI', 'LOB', 'UI', 'Oracle India Pvt Ltd.', 'Web Fusion Application', 'IDE', 'Visual Builder Cloud Service', 'MS Excel', 'Automated ADFDI Excel', 'VBA', 'Sony India Software Center Pvt.', 'Health Check', 'Adobe CQ', 'Project', 'Sprint', 'Sprint Demo', 'NGPOS', 'Self Service', 'Daily Status Report', 'Weekly Status Report', 'JavaScript', 'Python', 'SVN', 'BugDB', 'Adobe CQ5', 'TensorFlow', 'Keras', 'NumPy, Pandas', 'EDA', 'Tableau', 'CERTIFICATIONS', '▪ IBM Certification for', '▪ ISTQB Certified Professional', 'Sun', 'Java Programmer', 'DETAILS', 'ML \n', 'Sathyabama University     ']


In [None]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import timedelta


# 4,52,568 unique companies found / 30 per page = 15086 pages
total_number_of_webpages = 15086

# Give up on a single request after this many seconds instead of hanging forever.
request_timeout_seconds = 30

# The request header is the same for every page, so build it once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

start_time = time.time()

# Collect plain values and build the DataFrame once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every page. The list is deliberately not called `name`, so the candidate
# name extracted earlier in the notebook is not overwritten.
company_names = []

# Pages that could not be downloaded; one network error no longer discards
# everything scraped so far.
failed_pages = []

for page in range(1, total_number_of_webpages+1):
    print("scraping webpage number: {page} of {total}".format(page=page, total=total_number_of_webpages))
    loop_time = time.time()

    # set page url
    url = "https://www.ambitionbox.com/list-of-companies?page={}".format(page)

    # get page response from the website
    try:
        response = requests.get(url, headers=header, timeout=request_timeout_seconds)
        response.raise_for_status()
    except requests.RequestException as exc:
        failed_pages.append(page)
        print("failed:", exc)
        print()
        continue

    # pass the page to BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')

    # find all the company cards from the webpage
    company_cards = soup.find_all("div", class_="company-content-wrapper")

    # the company name is the text of the card's <h2>; a card without one
    # is recorded as None
    for card in company_cards:
        heading = card.find("h2")
        company_names.append(heading.text.strip() if heading is not None else None)

    # success
    print("success!")
    print("time taken:", round((time.time()-loop_time)*1000, 2), "ms")
    print("total time elapsed:", str(timedelta(seconds=(time.time()-start_time))))
    print()

end_time = time.time()

# one row per company, index 0..N-1
dataframe_final = pd.DataFrame(company_names, columns=['Company Name'])

if failed_pages:
    print("pages that could not be scraped:", failed_pages)
else:
    print("full website scraped successfully!")
print("total time taken:", str(timedelta(seconds=(end_time - start_time))))
print()


# export the data to external csv
dataframe_final.to_csv("List_of_companies_in_India.csv", encoding="utf-8")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
success!
time taken: 401.64 ms
total time elapsed: 0:53:34.056981

scraping webpage number: 7820 of 15086
success!
time taken: 408.02 ms
total time elapsed: 0:53:34.465225

scraping webpage number: 7821 of 15086
success!
time taken: 368.5 ms
total time elapsed: 0:53:34.834062

scraping webpage number: 7822 of 15086
success!
time taken: 360.71 ms
total time elapsed: 0:53:35.196599

scraping webpage number: 7823 of 15086
success!
time taken: 365.64 ms
total time elapsed: 0:53:35.562990

scraping webpage number: 7824 of 15086
success!
time taken: 341.39 ms
total time elapsed: 0:53:35.906343

scraping webpage number: 7825 of 15086
success!
time taken: 343.79 ms
total time elapsed: 0:53:36.250860

scraping webpage number: 7826 of 15086
success!
time taken: 440.86 ms
total time elapsed: 0:53:36.692552

scraping webpage number: 7827 of 15086
success!
time taken: 381.91 ms
total time elapsed: 0:53:37.075172

scraping webpage numb