In [28]:
!pip install beautifulsoup4



In [29]:
import csv
import re
from urllib.parse import quote_plus
from urllib.request import urlopen

import pandas
from bs4 import BeautifulSoup

In [None]:
class MonsterAnalyse(object):
    """Collect job postings from www.monster.ca for a keyword and location.

    Constructing an instance runs the whole pipeline immediately: the search
    URL is built, the result page is downloaded and parsed, every job found
    is stored in ``jobs`` and, when ``flag`` is true, the jobs are written to
    a CSV file and loaded back into ``df``.

    Attributes:
        keyword: Job query keywords. No default value.
        location: Job query location. Default value is 'Toronto'.
        pagenum: Value sent as the ``page`` parameter of the search URL.
            Default is 10.
        flag: Whether to write the jobs into a CSV file and load it into
            ``df``. Default is True.
        jobs: Jobs pool. Maps each job id to a dictionary with the keys
            'company', 'title', 'link' and 'summary'.
        df: pandas DataFrame read back from the CSV file when ``flag`` is
            true; ``None`` when no CSV file was written.
    """

    def __init__(self, keyword, location='Toronto', pagenum=10, flag=True):
        """Initialize the query information and run the query.

        Stores the query parameters, builds the search link(s) with
        link_construct(), analyses every link with page_analyse() and
        finally calls write_into_csv().

        Args:
            keyword: Job query keywords. No default value.
            location: Job query location. Default value is 'Toronto'.
            pagenum: Value sent as the ``page`` parameter of the search URL.
                Default is 10.
            flag: Whether to write job information into a CSV file and load
                the data into a DataFrame. Default value is True.
        """
        self.keyword = keyword
        self.location = location
        self.pagenum = pagenum
        self.flag = flag
        self.jobs = {}
        # Always defined so callers can test `self.df is None` instead of
        # hitting AttributeError when flag is False.
        self.df = None
        self.__monster = "https://www.monster.ca"
        self.__links = []
        self.link_construct()
        for link in self.__links:
            self.page_analyse(link)
        self.write_into_csv(self.flag)

    def link_construct(self):
        """Build the Monster search URL and append it to the private link list.

        A single URL is built; ``pagenum`` is passed as the ``page`` query
        parameter. Example of the URL shape:
        https://www.monster.ca/jobs/search/?q=data&where=Toronto&page=2
        """
        # quote_plus turns spaces into '+' (as before) and additionally
        # escapes characters such as '+', '&' or '#' that would otherwise
        # corrupt the query string (e.g. keyword 'C++').
        link = (
            self.__monster + '/jobs/search/?q=' + quote_plus(self.keyword)
            + '&where=' + quote_plus(self.location)
            + '&page=' + str(self.pagenum)
        )
        self.__links.append(link)
        print('Link Construct Finish')

    def page_analyse(self, link):
        """Download one search-result page and analyse every job block in it.

        Every ``div`` with class 'flex-row' is handed to div_analyse().

        Args:
            link: Search-result URL, as built by link_construct().
        """
        print('Analysing link: ' + link + '\nPlease wait...')
        # Context manager guarantees the HTTP response is closed.
        with urlopen(link) as response:
            content = response.read()
        soup = BeautifulSoup(content, 'lxml')

        for item in soup.find_all('div', {'class': 'flex-row'}):
            self.div_analyse(item)
        print('Finish')

    def div_analyse(self, jobdiv):
        """Extract one job from an html tag block and store it in ``jobs``.

        The job id, title, company and link are read from ``jobdiv``; the
        summary is read from the page the job link points to. Blocks that do
        not carry the expected title/company/id/link markup are skipped.

        Args:
            jobdiv: html tag block expected to contain a single job.
        """
        title_tag = jobdiv.find(class_="title")
        company_tag = jobdiv.find(class_="company")
        anchor = title_tag.a if title_tag is not None else None
        if anchor is None or company_tag is None:
            # Not a job card (layout row, ad, ...): nothing to extract.
            return

        key = anchor.get('data-m_impr_j_jobid')
        link = anchor.get('href')
        if key is None or link is None:
            return

        # Check for duplicates before downloading the detail page so that an
        # already-known job does not cost another HTTP request.
        if key in self.jobs:
            if self.jobs[key]['link'] != link:
                print('WRONG: Same ID with different link')
            return

        # Prefer the inner <span> text; fall back to the tag's own text when
        # the span is absent.
        company_node = company_tag.span if company_tag.span is not None else company_tag
        title_node = title_tag.span if title_tag.span is not None else title_tag
        job = {
            'company': company_node.get_text(),
            'title': title_node.get_text(),
            'link': link,
        }

        with urlopen(link) as response:
            job_content = response.read()
        job_soup = BeautifulSoup(job_content, 'lxml')

        summary_tag = job_soup.find(id='JobDescription')
        if summary_tag is None:
            summary_tag = job_soup.find(class_='jobsearch-JobComponent-description')
            print('no span:job_summary switch to class:jobsearch-JobComponent-description')
        if summary_tag is None:
            # Neither container exists: keep the job with an empty summary
            # rather than aborting the whole run.
            job['summary'] = ''
        else:
            job['summary'] = summary_tag.get_text().strip('\t\n\r').replace('\n', ' ')

        self.jobs[key] = job

    def write_into_csv(self, flag):
        """Write ``jobs`` to '<keyword>In<location>FromMonster.csv' and load it.

        Args:
            flag: When false, nothing is written and ``df`` is left untouched.
                When true, the CSV file is (over)written in the current
                working directory and read back into ``self.df``.
        """
        if not flag:
            return

        filename = self.keyword + "In" + self.location + "FromMonster"
        # newline="" is required by the csv module; without it extra blank
        # rows appear on platforms that translate '\n' to '\r\n'.
        with open(filename + ".csv", "w", encoding="utf-8", newline="") as toWrite:
            writer = csv.writer(toWrite, delimiter=",")
            writer.writerow(["Id", "Job Title", 'company', "Summary", "Link"])
            for key, job in self.jobs.items():
                writer.writerow([key, job['title'], job['company'], job["summary"], job["link"]])
        print("Already write job information into a csv file:" + filename + '.csv')
        print("You can use self.df for further processing.")
        self.df = pandas.read_csv(filename + '.csv')
    
# Run a sample query. The class defined in this file is MonsterAnalyse;
# the previous name IndeedAnalyse is not defined here and raised NameError.
s = MonsterAnalyse('data', 'Toronto', 2)
print(len(s.jobs))