In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import date, datetime, timedelta
from tqdm import tqdm
import time

In [11]:
class PickNZCrawler:
    """Scrape job adverts from the PickNZ job board (jobs.picknz.co.nz).

    Typical use::

        spider = PickNZCrawler()   # one request: reads the number of listing pages
        jobs = spider.start()      # list of dicts, one per job advert

    The class attributes below are tunables; override them on a subclass or
    on the class itself before crawling.
    """

    BASE_URL = 'https://jobs.picknz.co.nz'
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines. 'html.parser' ships with Python.
    PARSER = 'html.parser'
    # Seconds to wait for the server before giving up on a request; without a
    # timeout a stalled connection would hang the crawl forever.
    REQUEST_TIMEOUT = 30
    # Pause between two consecutive listing pages in ``start``.
    PAGE_DELAY_SECONDS = 60

    def __init__(self):
        """Fetch the front page once to learn how many listing pages exist."""
        # NOTE: the attribute keeps its historical name and therefore shadows
        # the ``total_page`` method on the instance: after construction
        # ``spider.total_page`` is an int, not a callable.
        self.total_page = self.total_page()

    def start(self):
        """Crawl every listing page and return all parsed jobs.

        Returns:
            list[dict]: one dict per job, in listing order (see
            ``parse_each_job`` for the keys).
        """
        result_list = []
        for page_number in tqdm(range(1, self.total_page + 1)):
            # Sleep *between* pages only: nothing follows the last page, so a
            # trailing pause would just waste time.
            if page_number > 1:
                time.sleep(self.PAGE_DELAY_SECONDS)
            start_url = f'{self.BASE_URL}/page/{page_number}/'
            result_list.extend(self.get_all_urls(start_url))
        return result_list

    def total_page(self):
        """Return the number of listing pages advertised on the front page.

        The count is the largest numeric ``a.page-numbers`` link. When the
        page has no numeric pagination link (e.g. everything fits on a single
        page) the result is 1.

        Raises:
            requests.RequestException: the request failed, timed out or
                returned an HTTP error status.
        """
        soup = self._get_soup(self.BASE_URL)
        link_texts = (link.get_text().strip()
                      for link in soup.find_all('a', class_="page-numbers"))
        # Non-numeric links (such as a "next" arrow) are ignored.
        page_numbers = [int(text) for text in link_texts if text.isdigit()]
        return max(page_numbers, default=1)

    def get_all_urls(self, input_url):
        """Parse every job linked from one listing page.

        Args:
            input_url: URL of a listing page.

        Returns:
            list[dict]: the ``parse_each_job`` result for each job on the
            page, in page order.
        """
        soup = self._get_soup(input_url)
        title_cells = soup.find_all("td", class_="wpjb-column-title")
        page_list = []
        for title_cell in tqdm(title_cells):
            job_url = title_cell.find('a').get('href')
            page_list.append(self.parse_each_job(job_url))
        return page_list

    def parse_each_job(self, input_url):
        """Download one job page and extract its fields.

        Args:
            input_url: URL of a single job advert.

        Returns:
            dict with keys ``job_title``, ``view_count`` (int),
            ``employer_name``, ``location``, ``post_date`` (date),
            ``job_category``, ``job_type``, ``number_of_vacancies``,
            ``start_date`` (date), ``end_date`` (date), ``rate``,
            ``benefits`` (``None`` when the page has no tenth info row) and
            ``description``.

        Raises:
            requests.RequestException: the request failed, timed out or
                returned an HTTP error status.
            AttributeError, IndexError, ValueError: the page does not have
                the expected layout or date/number formats.
        """
        soup = self._get_soup(input_url)
        # Look the info-table labels up once; every field below is read
        # positionally from this list (index 0 is not used).
        info_labels = soup.find_all('td', {'class': 'wpjb-info-label'})

        def value_after_label(index):
            # The value is the text of the element that follows the label cell.
            return info_labels[index].find_next().get_text().strip()

        title = soup.title.get_text().split(' - Jobs')[0]
        view_count = soup.find('div', class_='wpjb-grid-col wpjb-col-65').get_text().strip()
        employer_name = (
            soup.find('span', class_='vcard author')
            .find('span', class_='fn')
            .get_text()
            .strip()
        )
        description = soup.find('meta', {'property': 'og:description'}).get('content')

        # Key order is kept stable because it becomes the DataFrame column order.
        return {
            'job_title': title,
            'view_count': int(view_count),
            'employer_name': employer_name,
            'location': value_after_label(1),
            # The posting date is written day/month/year ...
            'post_date': datetime.strptime(value_after_label(2), '%d/%m/%Y').date(),
            'job_category': value_after_label(3),
            'job_type': value_after_label(4),
            'number_of_vacancies': value_after_label(5),
            # ... whereas start/end dates are written year/month/day.
            'start_date': datetime.strptime(value_after_label(6), '%Y/%m/%d').date(),
            'end_date': datetime.strptime(value_after_label(7), '%Y/%m/%d').date(),
            'rate': value_after_label(8),
            # The benefits row is optional.
            'benefits': value_after_label(9) if len(info_labels) > 9 else None,
            'description': description,
        }

    def _get_soup(self, url):
        """GET ``url`` and return the parsed document.

        Fails loudly on HTTP errors instead of letting an error page be
        parsed as if it were real content.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, self.PARSER)

In [None]:
if __name__ == "__main__":
    # Full crawl: ``start`` walks every listing page and returns a plain list
    # of per-job dicts.
    spider = PickNZCrawler()
    result_list = spider.start()
    # Build the DataFrame explicitly: the later cells call DataFrame methods
    # on ``df`` (``df.post_date``, ``df.to_csv``), which a list does not have.
    df = pd.DataFrame(result_list)

In [13]:
# Crawl every listing page in a single loop rather than one hand-copied cell
# per page. The page count comes from ``spider.total_page`` (read from the
# site's pagination when the crawler is constructed), so pages added to the
# site later are picked up without editing the notebook.
spider = PickNZCrawler()
result_list = []
for page_number in range(1, spider.total_page + 1):
    result_list.extend(
        spider.get_all_urls(f'https://jobs.picknz.co.nz/page/{page_number}')
    )

In [43]:
# One row per scraped job; the columns are the keys of the per-job dicts
# collected in ``result_list``.
df = pd.DataFrame(result_list)

In [49]:
# Earliest posting date present in the scraped data.
df['post_date'].min()

datetime.date(2021, 9, 2)

In [47]:
# Write a dated snapshot plus a "latest" copy.
# The stamp is zero-padded YYYYMMDD: unpadded month/day values are ambiguous
# (e.g. "2021111" could be 1 Nov or 11 Jan) and do not sort chronologically.
# The date is read once so all parts of the stamp come from the same day.
today_stamp = date.today().strftime('%Y%m%d')
df.to_csv(f'pick_nz_{today_stamp}.csv')
df.to_csv('pick_nz.csv')