## Indeed.ca scraping

In [1]:
!pip install beautifulsoup4



In [147]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import re
import pandas
from html.parser import HTMLParser
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [148]:
class IndeedAnalyse(object):
    """Scrape job postings from www.indeed.ca for a keyword and location.

    Constructing an instance runs the whole scrape: the result-page URLs are
    built, every result page is downloaded and parsed, every job's detail
    page is downloaded for its long summary, and (optionally) everything is
    written to a CSV file and loaded back into a DataFrame.

    Attributes:
        keyword: Job query keywords (required, no default).
        location: Job query location. Default is 'Toronto'.
        pagenum: Number of result pages to query. Result offsets advance by
            20 per page. Default is 1.
        flag: Whether the jobs are written to a CSV file and loaded into
            ``df``. Default is True.
        jobs: Jobs pool, a dictionary keyed by job id. Every value is a
            dictionary with the keys 'title', 'link', 'company',
            'short_summary' and 'long_summary'.
        df: Only set when ``flag`` is True: the DataFrame read back from the
            CSV file written by write_into_csv(). When ``flag`` is False
            the attribute does not exist.
    """
    def __init__(self, keyword, location='Toronto', pagenum=1, flag=True):
        """Initialize the query information and run the query.

        First, store the query information (keywords, location, number of
        pages). Then use link_construct() to build the result-page links.
        Finally, traverse and analyse all the links to collect the jobs and
        hand them to write_into_csv().

        Args:
            keyword: Job query keywords (required, no default).
            location: Job query location. Default is 'Toronto'.
            pagenum: Number of result pages to query. Default is 1.
            flag: Whether to write the job information into a CSV file and
                load it into a DataFrame. Default is True.
        """
        self.keyword = keyword
        self.location = location
        self.pagenum = pagenum
        self.flag = flag
        self.jobs = {}
        self.__indeed = "https://www.indeed.ca"
        self.__links = []
        self.link_construct()
        for link in self.__links:
            self.page_analyse(link)
        self.write_into_csv(self.flag)

    def link_construct(self):
        """Build the list of result-page URLs to analyse.

        One URL per requested page is appended to the private link list,
        with the 'start' offset advancing by 20 per page. Keyword and
        location are URL-encoded so that characters such as '+', '&' or '#'
        are sent literally instead of corrupting the query string; spaces
        still become '+'.
        """
        # Local import: quote_plus is not among the file's top-level imports.
        from urllib.parse import quote_plus

        query = quote_plus(self.keyword)
        place = quote_plus(self.location)
        for page in range(self.pagenum):
            link = (
                self.__indeed + '/jobs?q=' + query
                + '&l=' + place
                + '&start=' + str(page * 20)
            )
            self.__links.append(link)
        print('Link Construct Finish')

    def page_analyse(self, link):
        """Analyse a single result page.

        Downloads the page, locates the element with id 'resultsCol' and
        passes every job <div> found inside it to div_analyse(). A page
        without that element is reported and skipped so that the jobs
        collected from the other pages are not lost.

        Args:
            link: One of the result-page links built by link_construct().
        """
        print('Analysing link: ' + link + '\nPlease wait...')
        print(link)
        # Close the HTTP response as soon as the body has been read.
        with urlopen(link) as response:
            content = response.read()
        soup = BeautifulSoup(content, 'lxml')
        results = soup.find(attrs={'id': 'resultsCol'})
        if results is None:
            print('No resultsCol element found, skipping link: ' + link)
            return
        # Raw string: '\s' is a regex escape, not a Python string escape.
        job_divs = results.find_all(
            'div', {'class': re.compile(r"\srow result")}
        )
        for item in job_divs:
            self.div_analyse(item)
        print('Finish')

    def div_analyse(self, jobdiv):
        """Analyse a single job's html tag block.

        Extracts the job title, company name, job link, id and short summary
        from ``jobdiv``. The long summary is extracted from the job's own
        page: the element with id 'job_summary' is used when present,
        otherwise the element with class
        'jobsearch-JobComponent-description'. The job is stored in
        ``self.jobs`` under its id unless that id is already present.

        Args:
            jobdiv: html tag block containing a single job's information.

        Raises:
            AttributeError: If the job page contains neither of the two
                long-summary elements.
        """
        job = {}
        # The first two characters of the div id are dropped to get the key.
        key = jobdiv['id'][2:]
        title_anchor = jobdiv.find(class_='jobtitle').a
        job['title'] = title_anchor['title']
        job['link'] = self.__indeed + title_anchor['href']
        job['company'] = jobdiv.find(class_='company').get_text().strip(' \t\n\r')
        job['short_summary'] = jobdiv.find(class_='summary').get_text().strip(' \t\n\r')
        with urlopen(job['link']) as response:
            job_content = response.read()
        job_soup = BeautifulSoup(job_content, 'lxml')

        summary_node = job_soup.find(id='job_summary')
        if summary_node is None:
            summary_node = job_soup.find(class_='jobsearch-JobComponent-description')
            if summary_node is None:
                raise AttributeError(
                    'no long summary element found at ' + job['link']
                )
            print('no span:job_summary switch to class:jobsearch-JobComponent-description')
        job['long_summary'] = summary_node.get_text().strip('\t\n\r').replace('\n', ' ')

        if key in self.jobs:
            # Keep the first job seen for an id; only report a conflict.
            if self.jobs[key]['link'] != job['link']:
                print('WRONG: Same ID with different link')
        else:
            self.jobs[key] = job

    def write_into_csv(self, flag):
        """Write the collected jobs to a CSV file and load it into ``self.df``.

        The file is named '<keyword>In<location>FromIndeed.csv' and is
        created in the current working directory.

        Args:
            flag: When false nothing is written and ``self.df`` is not set.
        """
        if not flag:
            return
        filename = self.keyword + "In" + self.location + "FromIndeed"
        csv_path = filename + ".csv"
        # newline='' lets the csv module control line endings, as its
        # documentation requires; otherwise some platforms get blank rows.
        with open(csv_path, "w", encoding="utf-8", newline="") as to_write:
            writer = csv.writer(to_write, delimiter=",")
            writer.writerow(["Id", "Job Title", 'company', "Short Summary", "Long Summary", "Link"])
            for key, job in self.jobs.items():
                writer.writerow([
                    key,
                    job['title'],
                    job['company'],
                    job["short_summary"],
                    job["long_summary"],
                    job["link"],
                ])
        print("Already write job information into a csv file:" + filename + '.csv')
        print("You can use self.df for further processing.")
        self.df = pandas.read_csv(csv_path, encoding="utf-8")
    
# Scrape five result pages of 'data' jobs in Toronto. Constructing the object
# performs all the downloads and, because flag defaults to True, also writes
# the CSV file and sets s.df.
s = IndeedAnalyse('data', 'Toronto', 5)
# Number of entries in the jobs dictionary (one per job id).
print(len(s.jobs))

Link Construct Finish
Analysing link: https://www.indeed.ca/jobs?q=data&l=Toronto&start=0
Please wait...
https://www.indeed.ca/jobs?q=data&l=Toronto&start=0
Finish
Analysing link: https://www.indeed.ca/jobs?q=data&l=Toronto&start=20
Please wait...
https://www.indeed.ca/jobs?q=data&l=Toronto&start=20
Finish
Analysing link: https://www.indeed.ca/jobs?q=data&l=Toronto&start=40
Please wait...
https://www.indeed.ca/jobs?q=data&l=Toronto&start=40
Finish
Analysing link: https://www.indeed.ca/jobs?q=data&l=Toronto&start=60
Please wait...
https://www.indeed.ca/jobs?q=data&l=Toronto&start=60
Finish
Analysing link: https://www.indeed.ca/jobs?q=data&l=Toronto&start=80
Please wait...
https://www.indeed.ca/jobs?q=data&l=Toronto&start=80
Finish
Already write job information into a csv file:dataInTorontoFromIndeed.csv
You can use self.df for further processing.
93


In [149]:
# Preview the first rows of the DataFrame that was read back from the CSV.
s.df.head()

Unnamed: 0,Id,Job Title,company,Short Summary,Long Summary,Link
0,b5737fe55f7b3eaa,Junior Treasury Analyst,Liquid Capital Corp.,2-3 years of work experience in finance and pr...,Junior Treasury AnalystLiquid Capital Corp. – ...,https://www.indeed.ca/company/Liquid-Capital-C...
1,ad62ddc40d502a7d,Data Analytics Analyst (Market Risk Technology),Veritaaq,Given the current focus of both US and Canadia...,**Currently hiring for a top 4 financial insti...,https://www.indeed.ca/company/TBD/jobs/Data-An...
2,c0edbeca8721a306,Data Analyst,Health Quality Ontario,Assist with the design of data verification re...,"Reporting to the Manager, QI Strategies and QI...",https://www.indeed.ca/rc/clk?jk=c0edbeca8721a3...
3,aef737ef8f91785a,Digital Data Analyst (Digital and Data Science),The Globe and Mail,Providing a centralized authority on how to pu...,Overview This role will be instrumental in hel...,https://www.indeed.ca/rc/clk?jk=aef737ef8f9178...
4,ea8e29eaef60858d,"IT Manager, Data/Information Management",TD Bank,Experience in data management and data governa...,About This Role We are looking for someone to ...,https://www.indeed.ca/rc/clk?jk=ea8e29eaef6085...


In [150]:
def data_cleaning(job_df, stop_words_path='stop_words.txt'):
    """
    (DataFrame, str) -> list of strings
    Takes in the DataFrame of job postings and produces the list of cleaned
    job summaries, one string per row of the 'Long Summary' column.

    Each summary has html tags and urls removed, is lowercased, split into
    word tokens (which drops punctuation), stripped of stop words and joined
    back with single spaces. A missing summary (NaN) is cleaned to ''.

    job_df: DataFrame with a 'Long Summary' column.
    stop_words_path: text file with one stop word per line. Defaults to
        'stop_words.txt' in the current working directory.
    """
    # A set gives constant-time membership tests; strip() also removes
    # trailing spaces that would otherwise keep a stop word from matching.
    with open(stop_words_path, "r", encoding="utf-8") as stop_file:
        stop_words = {line.strip() for line in stop_file}
    stop_words.discard('')

    # Compile the patterns once instead of once per summary.
    tag_pattern = re.compile('<.*?>')
    url_pattern = re.compile(r'http\S+')
    # Same \w+ tokenisation that find_top_word applies to the result.
    word_pattern = re.compile(r'\w+')

    cleaned = []
    for raw in job_df['Long Summary'].tolist():
        # An empty CSV cell is read back as NaN (a float), not as a string.
        text = '' if pandas.isna(raw) else str(raw)
        text = tag_pattern.sub('', text)
        text = url_pattern.sub('', text)
        tokens = word_pattern.findall(text.lower())
        cleaned.append(" ".join(tok for tok in tokens if tok not in stop_words))
    return cleaned

def find_top_word(data, n):
    '''
    (list of strings, int) -> (DataFrame, DataFrame)

    Counts the words (\\w+ tokens) over all job summaries in data and keeps
    the n most frequent ones.

    frequency_df:
    One row per job summary and one column per top word; each cell is how
    often that word occurs in that summary (0 when it does not occur).

    frequency_summary_df:
    A single row (index 0) holding the overall count of every top word,
    with the columns ordered by descending count.
    '''
    # Tokenise every summary once and reuse the tokens for both tables.
    tokenized = [re.findall(r'\w+', text) for text in data]

    overall = Counter()
    for tokens in tokenized:
        overall.update(tokens)

    # (word, count) pairs of the n most frequent words
    top_word = overall.most_common(n)
    header = [word for word, _ in top_word]

    # A Counter yields 0 for a word that is absent from a summary.
    per_summary_rows = []
    for tokens in tokenized:
        summary_counts = Counter(tokens)
        per_summary_rows.append([summary_counts[word] for word in header])

    frequency_df = pandas.DataFrame(per_summary_rows)
    frequency_df.columns = header

    frequency_summary_df = pandas.DataFrame(dict(top_word), index = [0])
    transposed = frequency_summary_df.T
    frequency_summary_df = transposed.sort_values(0, ascending=False).T
    return frequency_df, frequency_summary_df

# Clean the scraped long summaries, then compute the per-summary and overall
# frequencies of the 100 most common words.
top_word,top_word_summary = find_top_word(data_cleaning(s.df),100)
# Bare expression so the notebook displays the overall frequency table.
top_word_summary



Unnamed: 0,data,experience,work,business,will,team,skills,working,knowledge,ability,...,engineering,duties,type,risk,driven,ll,operations,providing,1,statistical
0,682,314,234,218,218,195,188,136,125,111,...,38,38,37,37,37,37,37,37,37,36
