# LinkedIn Scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import regex as re
import pandas as pd
import os
import json

import sys
import warnings

# Suppress the specific warning message
warning_message = "indexing past lexsort depth may impact performance."
warnings.filterwarnings("ignore", message=warning_message)

## Iterate all States

#### URL Parsing
Navigate to the second page of search results for "data analyst" positions in the United States, filter the results by the last month, and include all work types.

![LinkedIn Job Listings](images/linkedin_job_list.png)<br><br>

Here is the URL of this page:<br>
<span style="background-color: #66ff66;">https://&#8203;www&#8203;.linkedin.com/jobs/search/</span><span style="background-color: #ff6666;">?</span><span style="background-color: yellow;">currentJobId=3737393523</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">f_TPR=r2592000</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">f_WT=1%2C3%2C2</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">geoId=103977389</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">keywords=data%20analyst</span>
<span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">location=Washington%2C%20United%20States</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">origin=JOB_SEARCH_PAGE_JOB_FILTER</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">refresh=true</span><span style="background-color: #ff6666;">&</span><span style="background-color: yellow;">start=25</span>







In [40]:
test = SetFilter('Data Analyst', location='Washington State', day=30, job_num=0)
test.create_url()

'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?origin=JOB_SEARCH_PAGE_JOB_FILTER&refresh=true&position=1&pageNum=0&keywords=Data%20Analyst&location=Washington%20State&f_TPR=r2592000'

In [2]:
def get_states():
    states = [
        "California", "Texas", "New York", "Washington",
        "Alabama", "Arizona", "Arkansas", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", 
        "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", 
        "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", 
        "Nevada", "New Hampshire", "New Jersey", "New Mexico", "North Carolina", "North Dakota", "Ohio", 
        "Oklahoma", "Oregon", "Pennsylvania", "South Carolina", "South Dakota", "Tennessee", "Utah", 
        "Vermont", "Virginia", "West Virginia", "Wisconsin", "Wyoming", "District of Columbia"]
    
    return states

In [39]:
class SetFilter:
    
    def __init__(self, title=None, location=None, day=None, 
                 worktype=None, exp=None, job_num=None, early_applicant=False):
        
        assert (isinstance(day, int) and day > 0) or not day, 'day has to be an integer > 0 ()'
        assert (worktype in ['On-site', 'Remote', 'Hybrid']) or not worktype, "worktype is 'On-site', 'Remote', or 'Hybrid"
        assert (exp in ['Internship', 'Entry level', 'Associate', 'Mid-Senior level', 'Director']) or (not exp),\
                    "exp is 'Internship', 'Entry level', 'Associate', 'Mid-Senior level', or 'Director'"
        assert (isinstance(job_num, int) and 0 < job_num < 1000) or not job_num, 'job_num has to be an integer bewteen 0 and 999'
        assert isinstance(early_applicant, bool), 'early_applicant has to be boolean'
        
        self.title = title if title else None
        self.location = location if location else 'United States'
        self.day = day if day else None
        self.worktype = worktype if worktype else None
        self.exp = exp if exp else None
        self.job_num = job_num if job_num else None
        self.early_applicant = early_applicant

    
    def __call__(self):
        return self.__dict__
 
    
    def update(self, **kwargs):
        if 'title' in kwargs:
            assert kwargs['title'] is None or isinstance(kwargs['title'], str), 'title must be a string or None'
        if 'day' in kwargs:
            assert (isinstance(kwargs['day'], int) and kwargs['day'] > 0) or (not kwargs['day']), 'day has to be an integer > 0'
        if 'worktype' in kwargs:
            assert (kwargs['worktype'] in ['On-site', 'Remote', 'Hybrid']) or (not kwargs['worktype']),\
                    "worktype is 'On-site', 'Remote', or 'Hybrid'"
        if 'exp' in kwargs:
            assert (kwargs['exp'] in ['Internship', 'Entry level', 'Associate', 'Mid-Senior level', 'Director']) \
                    or (not exp), "exp is 'Internship', 'Entry level', 'Associate', 'Mid-Senior level', or 'Director'"
        if 'job_num' in kwargs:
            assert (isinstance(kwargs['job_num'], int) and 0 <= kwargs['job_num'] < 1000) or not kwargs['job_num'],\
                    'job_num has to be an integer between 0 and 999'
        if 'early_applicant' in kwargs:
            assert isinstance(kwargs['early_applicant'], bool), 'early_applicant has to be boolean'
        
        for key, value in kwargs.items():
            if hasattr(self, key):                
                setattr(self, key, value)
            else:
                raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'")
    
    
    def get_basic_url(self):
        ''' Return basic url for every url'''
        return 'origin=JOB_SEARCH_PAGE_JOB_FILTER&refresh=true&position=1&pageNum=0'
    
    
    def get_basic_webpage_url(self):
        ''' Return basic url for usual search page'''
        return 'https://www.linkedin.com/jobs/search?' + self.get_basic_url()      
    
    
    def get_basic_api_url(self):
        ''' Return basic url for backend API endpoint for loading more job postings'''
        return 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?' + self.get_basic_url()
    
    
    def get_title_url(self):
        ''' Return title url'''  
        title_header = '&keywords='
        return f"{title_header}{self.title.replace(' ', '%20')}" if self.title else ''
    
    
    def get_location_url(self):
        ''' Return district url'''
        location_header = '&location='
        return f"{location_header}{self.location.replace(' ', '%20')}"
    
    
    def get_day_url(self):
        ''' Return day url'''
        seconds_a_day = 86400
        day_header = '&f_TPR=r'
        return f'{day_header}{self.day*seconds_a_day}' if self.day else ''
    
    
    def get_all_worktype_options_url(self):
        ''' Return a dictionary of urls for all worktype options'''
        return {'On-site': '&f_WT=1',
                'Remote': '&f_WT=2',
                'Hybrid': '&f_WT=3'}
    
    
    def get_worktype_url(self):
        ''' Return worktype url'''
        options_url = self.get_all_worktype_options_url()        
        return options_url[self.worktype] if self.worktype else ''
        
    
    def get_all_exp_options_url(self):
        ''' Return a dictionary of urls for all experience level options'''
        return {'Internship': '&f_E=1',
                'Entry level': '&f_E=2',
                'Associate': '&f_E=3',
                'Mid-Senior level': '&f_E=4',
                'Director': '&f_E=5'}
    
    
    def get_exp_url(self):
        ''' Return experience level url'''
        options_url = self.get_all_exp_options_url()
        return options_url[self.exp] if self.exp else ''
         
        
    def get_job_num_url(self):
        ''' Return api page url'''
        job_num_header = '&start='
        return f'{job_num_header}{self.job_num}' if self.job_num else ''
    
    
    def get_early_applicant_url(self):
        return '&f_EA=true' if self.early_applicant else ''
        
    def create_url(self):    
        ''' Return API page url if job_num is set else get usual webpage url'''
        basic_url = self.get_basic_api_url() if self.job_num == None else self.get_basic_webpage_url()
        return basic_url + self.get_title_url() + self.get_location_url() + self.get_day_url() + \
               self.get_worktype_url() + self.get_exp_url() + self.get_job_num_url() + self.get_early_applicant_url()
    
    
    def get_filter_box_elements(self):
            ''' Return the dictionary of filter dropdown boxes' elements on webpage'''
            job_num = self.job_num
            self.job_num = None
            url = self.create_url()
            self.job_num = job_num

            # Try to get filter dropdown box elements
            for try_ in range(20):
                response = requests.get(url)
                html = BeautifulSoup(response.content, 'html.parser')  
                filter_boxes = {'day_element': [self.day, html.find(attrs={'aria-label': 'Date posted filter options'})],
                    'worktype_element': [self.worktype, html.find(attrs={'aria-label': 'Remote filter options'})],
                    'exp_element': [self.exp, html.find(attrs={'aria-label': 'Experience level filter options'})]}       
                if None not in filter_boxes['day_element']:
                    break
                if try_ == 19:
                    raise Exception('!!!!!!!!!!Filter Element Not Found!!!!!!!!!!')
                time.sleep(random.randint(10,20))
            return filter_boxes

    
    def get_job_counts(self):
        # Get filter dropdown box elements
        filter_boxes = self.get_filter_box_elements()
        # Get job counts in filter elements
        job_counts = {}   
        check_symbol = '\u2713'
        # Iterate all filter dropdown box elements
        for filter_name, filter_element in filter_boxes.items():
            job_counts[filter_name] = []            
            # Find the 'label' elements which include job counts information
            # !!! Website may change aria-label names, go to filter_boxes in get_filter_box_elements()
            job_count_elements = filter_element[1].find_all('label') 
            # Iterate all options in the dropdown box element
            for option in job_count_elements:
                string = '  ' + option.text.strip()
                # Add a check symbol for current selected filter option
                is_filter_selected = option.find_previous_sibling().has_attr('checked')
                if is_filter_selected: 
                    string = check_symbol + string[1:]
                    # If the day is not in filter options, webpage will still show 'Any Time'
                    if filter_name == 'self.day' and self.day not in [1, 7, 30, None]:
                        string = string.replace('Any Time', f'Past {self.day} Days') 
                job_counts[filter_name].append(string) 
        return job_counts
    
    
    def show_job_counts(self):
        job_counts = self.get_job_counts()   
        # Find the maximum length among the lists
        max_length = max(map(len, job_counts.values()))
        # Pad the shorter lists with zero-space to make them equal in length
        job_counts_df = {key: values + [' '] * (max_length - len(values)) for key, values in job_counts.items()}
        # Calculate space needed between strings
        job_counts_df = pd.DataFrame(job_counts_df)    
        txt_len_df = job_counts_df.apply(lambda col: col.str.len())
        pad = 10
        space_df = txt_len_df.apply(lambda col: max(col)+pad-col)
        
        start = 20
        longest_row = txt_len_df.iloc[:, -1].argmax()
        longest_len = txt_len_df.iloc[longest_row, :].sum() + space_df.iloc[longest_row, :-1].sum()
        print('\n', ' '*(start-10), '-'*(longest_len+start))
        header = f'*****  {self.location} ({self.title})  *****'
        pre = len(header) // 2
        mid = start + txt_len_df.iloc[0,0] + space_df.iloc[0,0] + txt_len_df.iloc[0,1] // 2 + 1
        print(' '*(mid-pre), header, '\n')
        for idx, row in job_counts_df.iterrows():
            print(' '*20, end='')
            formatted_text = ''.join(f"{string}{' '*space}" for string, space in zip(row, space_df.iloc[idx]))
            print(formatted_text)
        print(' '*(start-10), '-'*(longest_len+start))
        return job_counts

In [4]:
def decide_scrape_scope(job_counts):    
    # [[day, worktype], [], [],...]
    filter_ = []
    day_jobs = []
    worktypes = ['On-site', 'Remote', 'Hybrid']
    for element in job_counts['day_element'][1:]:
        pattern = r'([\w\s]+)\(([0-9,]+)\)'
        day_jobs.append(int(re.search(pattern, element)[2].replace(',', '')))        


    if day_jobs[0] <= 1200:
        for worktype in worktypes:  
                filter_.append([worktype, 30])
    elif 1200 < day_jobs[0] <= 2000:
        for worktype in worktypes:
            for day in [7, 30]:
                filter_.append([worktype, day])
    elif 2000 < day_jobs[0] <= 4000:
        if day_jobs[1] <= 2000:
            for worktype in worktypes:
                for day in [3, 7, 14, 21, 30]:
                    filter_.append([worktype, day])
        else:
            for worktype in worktypes:
                for day in [3, 5, 7, 14, 30]:
                    filter_.append([worktype, day])
    elif 4000 < day_jobs[0] <= 6000:
        if day_jobs[1] <= 2000:
           for worktype in worktypes: 
                for day in [3, 7, 10, 14, 21, 30]:
                    filter_.append([worktype, day])
        else:
            for worktype in worktypes:
                for day in [3, 5, 7, 14, 21, 30]:
                    filter_.append([worktype, day])
    elif 6000 < day_jobs[0] <= 8000:
        if day_jobs[1] <= 2000:
            for worktype in worktypes:
                for day in [3, 7, 10, 15, 20, 25, 30]:
                    filter_.append([worktype, day])
        elif 1500 < day_jobs[1] <= 4000:
           for worktype in worktypes: 
                for day in [1, 3, 7, 10, 14, 21, 30]:
                    filter_.append([worktype, day])  
        else:
           for worktype in worktypes: 
                for day in [1, 3, 5, 7, 14, 21, 30]:
                    filter_.append([worktype, day])
    elif 8000 < day_jobs[0] <= 12000:
        if day_jobs[1] <= 2000:
           for worktype in worktypes: 
                for day in [3, 7, 10, 14, 18, 22, 26, 30]:
                    filter_.append([worktype, day])
        elif 2000 < day_jobs[1] <= 4000:
            for worktype in worktypes:
                for day in [1, 3, 7, 10, 15, 20, 25, 30]:
                    filter_.append([worktype, day])  
        elif 4000 < day_jobs[1] <= 8000:
           for worktype in worktypes: 
                for day in [1, 2, 3, 5, 7, 14, 21, 30]:
                    filter_.append([worktype, day]) 
        else:
           for worktype in worktypes: 
                for day in [1, 2, 3, 4, 5, 6, 7, 30]:
                    filter_.append([worktype, day]) 
    else:
        if day_jobs[1] <= 2000:
           for worktype in worktypes: 
                for day in [3, 7, 10, 15, 18, 21, 24, 27, 30]:
                    filter_.append([worktype, day])
        elif 2000 < day_jobs[1] <= 4000:
           for worktype in worktypes: 
                for day in [1, 3, 7, 10, 14, 18, 22, 26, 30]:
                    filter_.append([worktype, day])  
        elif 4000 < day_jobs[1] <= 8000:
         for worktype in worktypes:   
                for day in [1, 3, 5, 7, 10, 15, 20, 25, 30]:
                    filter_.append([worktype, day]) 
        else:
           for worktype in worktypes: 
                for day in [1, 2, 3, 4, 5, 6, 7, 10, 20, 30]:
                    filter_.append([worktype, day]) 

    return filter_        

In [5]:
def print_progress(message):
    sys.stdout.write("\r" + message)
    sys.stdout.flush()

def scrape_this_page(url, current_scraped=None, total_job=None):
    '''
    Scrape all jobs in this page given an url
    '''     
    # Try to get job card elements for current page, return None if none  
    for try_ in range(8):
        response = requests.get(url)
        html = BeautifulSoup(response.content, 'html.parser')  
        job_card_elements = html.find_all('div', class_='base-search-card--link')
        if job_card_elements:
            jobs_in_page = len(job_card_elements)
            break            
        elif try_ == 7:
            return None

        time.sleep(random.uniform(0.5,1.5))

    # Scrape the job information for each job
    df_page = pd.DataFrame() 
    col_name = ['title', 'company', 'location', 'posted_date']
    for job in job_card_elements:
        total_job += 1
        df_job = {}
        df_job['title'] = job.find('h3', class_='base-search-card__title').text.strip()
        df_job['company'] = job.find('h4', class_='base-search-card__subtitle').text.strip()
        location = job.find('span', class_='job-search-card__location')
        df_job['location'] = location.text.strip() if location else None        
        date_1 = job.find('time', class_='job-search-card__listdate')
        date_2 = job.find('time', class_='job-search-card__listdate--new')
        df_job['posted_date'] = date_1['datetime'] if date_1 else date_2['datetime']
        
     
        # If job listings already exist, skip
        if not df_page.empty:
            df_set = set(df_page[col_name].itertuples(index=False, name=None))
            if tuple(df_job.values()) in df_set:
                if current_scraped != None and total_job != None:
                    print_progress(progress_msg1 + f'{current_scraped}/{total_job}  (Saved / Total Jobs)')
                continue
        if os.path.exists(csv_path):
            scraped_df = pd.read_csv(csv_path, usecols=col_name, on_bad_lines='skip')
            df_set = set(scraped_df.itertuples(index=False, name=None))
            if tuple(df_job.values()) in df_set:
                if current_scraped != None and total_job != None:    
                    print_progress(progress_msg1 + f'{current_scraped}/{total_job}  (Saved / Total Jobs)')
                continue

                
        df_job['worktype'] = worktype if worktype else None
        salary = job.find('span', class_='job-search-card__salary-info')
        df_job['salary'] = salary.text.strip() if salary else None
        df_job['job_link'] = job.find('a', class_='base-card__full-link')['href']  
        
        
        
        # Go inside the job link and scrape job description and other information
        # Discard all information about the job if job description is empty
        skip = False
        for try_ in range(5):
            job_response = requests.get(df_job['job_link'])
            job_html = BeautifulSoup(job_response.content, 'html.parser')
            job_description = job_html.find('div', class_='show-more-less-html__markup')                
            if job_description:
                break
            if try_ == 4:
                skip = True
                break
            
            time.sleep(random.uniform(1,3))                          
        if skip:
            if current_scraped != None and total_job != None:
                print_progress(progress_msg1 + f'{current_scraped}/{total_job}  (Saved / Total Jobs)')
            continue
        df_job['job_description'] = job_description.text
                               
        applicants_1 = job_html.find('figcaption', class_='num-applicants__caption')
        applicants_2 = job_html.find('span', class_='num-applicants__caption')
        df_job['applicants'] = applicants_1.text if applicants_1 else applicants_2.text

        criteria = job_html.find_all('li', class_='description__job-criteria-item')
        for criterion in criteria:
            feature = criterion.find('h3', class_='description__job-criteria-subheader').text.strip()
            value = criterion.find('span', class_='description__job-criteria-text--criteria').text.strip()
            df_job[feature] = value

        df_job = pd.DataFrame([df_job]) 
        df_page = df_job if df_page.empty else pd.concat([df_page, df_job], ignore_index=True)  
        current_scraped += 1
        if current_scraped != None and total_job != None:
            print_progress(progress_msg1 + f'{current_scraped}/{total_job}  (Saved / Total Jobs)')
    
    return current_scraped, total_job, df_page

In [6]:
state = 'California'
worktype = 'On-site'
day = 1
def get_start_filter(state=state, worktype=worktype, day=day):
    state_index = states.index(state)
    worktype_index = ['On-site', 'Remote', 'Hybrid'].index(worktype)
    return state_index, worktype_index, day

In [7]:
# Start filter ---------------------------------------------------
title = 'Business Analyst' 
states = get_states()
start_filter = get_start_filter('California', 'On-site', 1)
# Start filter ---------------------------------------------------



# Create a dataset folder if it doesn't exist
if not os.path.exists('datasets'):
    os.makedirs('datasets')
# Create a title folder if it doesn't exist
title_folder = title.lower().replace(' ', '_')
if not os.path.exists(f'datasets/{title_folder}'):
    os.makedirs(f'datasets/{title_folder}')
# Create a JSON file that record job counts for each state
json_path = f'datasets/{title_folder}/job_counts.json'
if not os.path.exists(json_path):
    with open(json_path, 'w') as  json_file:
        json.dump({'california': None}, json_file, indent=4)
    
# Iterate each state
for state in states:
    
    # Start from a specific state -----------------------------------------------
    if states.index(state) < start_filter[0]:
        continue
    # Skip used states -----------------------------------------------

    
    # Get job counts for this state
    location = state
    if state != 'District of Columbia':
        location = state + ' State'
    job_filter = SetFilter(title=title, location=location, day=30)
    job_counts = job_filter.show_job_counts()
    
    # Set the path of csv file         
    csv_path = f"datasets/{title_folder}/{state.lower().replace(' ', '_')}.csv"
    print(f"\nData Will Save to '{csv_path}'")
    
    # Update job counts data in the JSON file
    with open(json_path, 'r') as json_file:
        job_counts_json = json.load(json_file)
    job_counts_json.update({state.lower().replace(' ', '_'): job_counts})
    with open(json_path, 'w') as json_file:
        json.dump(job_counts_json, json_file, indent=4)
    
    filter_iter = decide_scrape_scope(job_counts)
    
    # Iterate each filter set
    for worktype, day in filter_iter:        
        
        # Skip filters already used--------------------------------------------------------
        if states.index(state) < start_filter[0]:
            continue
        if ['On-site', 'Remote', 'Hybrid'].index(worktype) < start_filter[1]:
            continue
        if day < start_filter[2]:
            continue
        # Skip filters already used--------------------------------------------------------
        
        
        # Print Progress Message
        progress_msg1 = f"#####   |  Worktype: {worktype if worktype else 'All'}, Day: {day}  |   #####     "
        print_progress(progress_msg1)
        # Iterate each page for this filter to scrape jobs
        total_job = current_scraped = empty_page = 0        
        while total_job < 1000:
            # Navigate to different api pages and scraped job listings by adjusting job_num, maximum 1000
            job_filter.update(day=day, worktype=worktype, job_num=total_job)
            url = job_filter.create_url()
            result = scrape_this_page(url, current_scraped, total_job)
            
            if result != None:
                empty_page = 0
                current_scraped, total_job, df_page = result
            else:
                if empty_page > 3:
                    break
                empty_page += 1
                # Print Progress Message
                print_progress(progress_msg1 + f'{current_scraped}/{total_job+10}  (Saved / Total Jobs)')
                total_job += 10
                continue
                

            # Create a new csv file if it doesn't exist, else add new data to the file
            if not os.path.exists(csv_path):               
                df_page.to_csv(f'{csv_path}', index=False)           
            else:
                if df_page is None:
                    continue
                elif df_page.empty:
                    continue
                else:
                    df_page.to_csv(f'{csv_path}', mode='a', header=False, index=False)           
            
        print()


            ----------------------------------------------------------------------------------------------
                                 *****  California State (Business Analyst)  ***** 

                      Any time (13)                On-site (1)            Mid-Senior level (1)          
                    ✓ Past month (2)               Hybrid (1)                                           
                      Past week (1)                                                                     
                      Past 24 hours (1)                                                                 
           ----------------------------------------------------------------------------------------------

Data Will Save to 'datasets/business_analyst/california.csv'
#####   |  Worktype: On-site, Day: 30  |   #####     1/41  (Saved / Total Jobs)
#####   |  Worktype: Remote, Day: 30  |   #####     0/40  (Saved / Total Jobs)
#####   |  Worktype: Hybrid, Day: 30  |   #####     1/41  (S

KeyboardInterrupt: 

## Check the Scraped Jobs and Scraped again if Needed

In [17]:
def extract_job_count(job_series):
    job_series = job_series.str.replace(',', '')
    job_df = job_series.str.extract(r'\((?P<' + job_series.name + r'>[\d,]+)\)')
    return job_df

In [19]:
def extract_day_count(day_list):
    day_counts = str(day_list).replace(',', '')
    past_month = re.search(r'Past month \((\d+)\)', day_counts)
    past_month = str(0) if not past_month else past_month[1]
    past_week = re.search(r'Past week \((\d+)\)', day_counts)
    past_week = str(0) if not past_week else past_week[1]
    past_day = re.search(r'Past 24 hours \((\d+)\)', day_counts) 
    past_day = str(0) if not past_day else past_day[1]
    
    return f' {past_day} - {past_week} - {past_month} '

In [20]:
# ------------------------------------------------------------------------------- 
title = 'Data Analyst'
print(title)
title = title.lower().replace(' ', '_')
# ------------------------------------------------------------------------------- 


json_path = f'datasets/{title}/job_counts.json'
# Get target scraping jobs from json
df = pd.read_json(json_path).T.sort_index()
df_total = df['day_element'].apply(lambda lst: lst[1])
df_total = extract_job_count(df_total)
df_day = df['day_element'].apply(lambda lst: extract_day_count(lst[1:]))
df_worktype = df['worktype_element'].explode()
df_worktype = extract_job_count(df_worktype)
df_worktype = df_worktype.groupby(df_worktype.index)['worktype_element'].apply(lambda x: pd.Series(x.values)).unstack()
df_target = pd.concat([df_total, df_worktype], axis=1)
df_target.columns = ['total_jobs', 'onsite', 'hybrid', 'remote']

# Get already scraped jobs from the csv file
df_scraped = pd.DataFrame()
for csv_path in os.listdir(f'datasets/{title}'):
    if csv_path == 'job_counts.json':
        continue
    df = pd.read_csv(f'datasets/{title}/{csv_path}', usecols=['worktype'])
    df_part = df.value_counts().to_frame()
    df_part.index.names = [None]
    flat_index = df_part.index.get_level_values(0)
    flat_index = pd.Index([index.lower().replace('-', '') for index in flat_index])
    df_part = df_part.set_index(flat_index).T
    df_part.set_index(pd.Index([csv_path[:-4]]), inplace=True)
    if df_part.empty:
        total = len(df)
    else:
        total = df_part.agg('sum', axis=1)[0]
    
    df_part['total_jobs'] = total 
    df_part = df_part.astype(str)
    df_scraped = df_part if df_scraped.empty else pd.concat([df_scraped, df_part])
    
df_mid = df_scraped.copy()
df_mid[:] = ' / '
df_result = df_scraped + df_mid + df_target
df_result['total_jobs'] = df_result['total_jobs'] + ' @ (' + df_day + ')'
df_result

Data Analyst


  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]
  total = df_part.agg('sum', axis=1)[0]


Unnamed: 0,hybrid,onsite,remote,total_jobs
alabama,144 / 274,442 / 590,48 / 100,634 / 987 @ ( 118 - 373 - 987 )
arizona,572 / 529,1085 / 1184,386 / 367,2043 / 2058 @ ( 229 - 718 - 2058 )
arkansas,,105 / 368,,105 / 544 @ ( 78 - 186 - 544 )
california,4037 / 3089,10956 / 8961,2278 / 1877,17271 / 14257 @ ( 840 - 3937 - 14257 )
colorado,523 / 572,1535 / 1544,284 / 259,2342 / 2432 @ ( 195 - 778 - 2432 )
connecticut,224 / 270,772 / 784,135 / 131,1131 / 1178 @ ( 138 - 554 - 1178 )
delaware,45 / 93,116 / 757,15 / 62,176 / 905 @ ( 138 - 647 - 905 )
district_of_columbia,511 / 556,2147 / 1704,273 / 286,2931 / 2588 @ ( 267 - 1169 - 2588 )
florida,1369 / 1132,3149 / 2651,638 / 597,5156 / 4324 @ ( 120 - 1586 - 4324 )
georgia,1292 / 1214,3024 / 3038,424 / 480,4740 / 4657 @ ( 105 - 2109 - 4657 )


In [35]:
def get_only_filter():
    return [
            # ('Alabama', 7, 'On-site'), 
            # ('Arizona', 7, 'On-site'), ('Arizona', 7, 'Remote'),
            # ('Arkansas', 30, 'On-site'),
            # ('California', 1, 'On-site'), ('California', 2, 'On-site'), ('California', 3, 'On-site'),
            # ('California', 4, 'On-site'), ('California', 5, 'On-site'), ('California', 6, 'On-site'),
            # ('California', 7, 'On-site'), #('California', 7, 'On-site'), ('California', 7, 'On-site'),
            # ('California', 14, 'On-site'), ('California', 21, 'On-site'), ('California', 28, 'On-site'),
            # ('California', 30, 'On-site'), ('California', 30, 'On-site'), ('California', 30, 'On-site'),
            # ('California', 7, 'Hybrid'), ('California', 7, 'Remote'),
            # ('Colorado', 7, 'On-site'), ('Colorado', 7, 'Remote'), ('Colorado', 7, 'Hybrid'),
            # ('Connecticut', 30, 'On-site'), ('Delaware', 30, 'On-site'),
            # ('District of Columbia', 7, 'On-site'), #('District of Columbia', 30, 'On-site'),
            # ('District of Columbia', 7, 'Hybrid'), ('District of Columbia', 30, 'Remote'),
            # ('Florida', 7, 'On-site'), ('Florida', 7, 'Hybrid'), #('Florida', 21, 'On-site'),
            # ('Georgia', 7, 'On-site'), #('Georgia', 7, 'On-site'), #('Georgia', 21, 'On-site'),
            # ('Georgia', 7, 'Remote'), ('Georgia', 7, 'Hybrid'), 
            # ('Idaho', 7, 'Remote'), ('Idaho', 7, 'On-site'), 
            # ('Illinois', 1, 'On-site'), ('Illinois', 3, 'On-site'), ('Illinois', 7, 'On-site'),
            # ('Indiana', 7, 'On-site'), ('Iowa', 7, 'On-site'), 
            # ('Kansas', 7, 'Hybrid'),
            # ('Kentucky', 7, 'On-site'), ('Kentucky', 7, 'Hybrid'), ('Kentucky', 7, 'Remote'),
            # ('Louisiana', 7, 'On-site'), ('Louisiana', 7, 'Hybrid'), ('Maryland', 7, 'Hybrid'),
            # ('Maryland', 1, 'On-site'), ('Maryland', 2, 'On-site'), ('Maryland', 3, 'On-site'),
            # ('Maryland', 4, 'On-site'), ('Maryland', 5, 'On-site'), ('Maryland', 7, 'On-site'),
            # ('Maryland', 14, 'On-site'), ('Maryland', 21, 'On-site'), ('Maryland', 30, 'On-site'),
            # ('Massachusetts', 7, 'On-site'), ('Massachusetts', 3, 'On-site'), ('Massachusetts', 7, 'On-site'), 
            # ('Massachusetts', 7, 'Hybrid'), #('Massachusetts', 14, 'Hybrid'),
            # ('Michigan', 7, 'On-site'), ('Michigan', 7, 'Hybrid'), #('Michigan', 14, 'On-site'), 
            # ('Michigan', 7, 'Remote'), 
            # ('Minnesota', 7, 'Remote'), #('Minnesota', 7, 'Hybrid'), ('Mississippi', 7, 'On-site'),
            # ('Missouri', 7, 'Hybrid'), #('Missouri', 30, 'Hybrid'), ('Nebraska', 30, 'On-site'),
            # ('Nevada', 7, 'On-site'), #('New Hampshire', 7, 'On-site'),
            # ('New Jersey', 7, 'On-site'), ('New Jersey', 14, 'On-site'), ('New Jersey', 21, 'On-site'),
            # ('New Jersey', 7, 'Hybrid'),    
            # ('New Mexico', 7, 'On-site'), ('New Mexico', 7, 'Hybrid'), 
            # ('New York', 1, 'On-site'), ('New York', 2, 'On-site'), ('New York', 3, 'On-site'),
            # ('New York', 5, 'On-site'), ('New York', 7, 'On-site'), ('New York', 14, 'On-site'),
            # ('New York', 21, 'On-site'), ('New York', 30, 'On-site'), #('New York', 30, 'On-site'),
            # ('New York', 7, 'Remote'), #('New York', 14, 'Hybrid'),        
            # ('North Carolina', 7, 'On-site'), #('North Carolina', 14, 'On-site'), ('North Carolina', 21, 'On-site'),
            # ('North Carolina', 7, 'Hybrid'), ('North Carolina', 7, 'Remote'),
            # ('North Dakota', 7, 'On-site'), ('North Dakota', 7, 'Hybrid'), ('North Dakota', 7, 'Remote'),
            # ('Ohio', 7, 'On-site'), #('Ohio', 14, 'On-site'),('Ohio', 21, 'On-site'),
            # ('Ohio', 7, 'Hybrid'), ('Ohio', 14, 'Remote'),
            # ('Oklahoma', 30, 'On-site'), ('Oklahoma', 30, 'Remote'),
            # ('Oklahoma', 7, 'Hybrid'),
            # ('Oregon', 7, 'On-site'), #('Oregon', 30, 'Remote'),
            # ('Pennsylvania', 1, 'On-site'), ('Pennsylvania', 7, 'On-site'), ('Pennsylvania', 21, 'On-site'),
            # ('Pennsylvania', 7, 'Hybrid'), ('Pennsylvania', 7, 'Remote'),
            # ('South Carolina', 7, 'Remote'), #('South Carolina', 7, 'On-site'), ('South Carolina', 30, 'Hybrid'),
            # ('South Dakota', 7, 'On-site'), ('South Dakota', 7, 'Hybrid'),
            # ('Tennessee', 7, 'On-site'), ('Tennessee', 7, 'Hybrid'), #('Tennessee', 30, 'Remote'),
            # ('Texas', 1, 'On-site'), ('Texas', 2, 'On-site'), ('Texas', 3, 'On-site'),
            # ('Texas', 4, 'On-site'), ('Texas', 5, 'On-site'), ('Texas', 6, 'On-site'),
            # ('Texas', 7, 'On-site'), ('Texas', 14, 'On-site'), ('Texas', 21, 'On-site'),
            # ('Texas', 30, 'On-site'), #('Texas', 30, 'On-site'), ('Texas', 30, 'On-site'),
            # ('Texas', 7, 'Hybrid'), ('Texas', 7, 'Remote'), ('Texas', 21, 'Hybrid'),
            # ('Utah', 7, 'Remote'), #('Utah', 30, 'Hybrid'),
            # ('Virginia', 1, 'On-site'), ('Virginia', 2, 'On-site'), ('Virginia', 3, 'On-site'),
            # ('Virginia', 4, 'On-site'), ('Virginia', 5, 'On-site'), ('Virginia', 6, 'On-site'), 
            # ('Virginia', 7, 'On-site'), ('Virginia', 14, 'On-site'), ('Virginia', 21, 'On-site'),
            # ('Virginia', 30, 'On-site'), #('Virginia', 30, 'On-site'), ('Virginia', 30, 'On-site'),
            # ('Virginia', 7, 'Hybrid'), ('Virginia', 14, 'Hybrid'), ('Virginia', 14, 'Remote'),
            ('Washington', 1, 'On-site'), ('Washington', 3, 'On-site'), ('Washington', 7, 'On-site'),
            ('Washington', 10, 'On-site'), ('Washington', 12, 'On-site'), ('Washington', 14, 'On-site'),
            # ('Washington', 21, 'On-site'), ('Washington', 30, 'On-site'), ('Washington', 30, 'On-site')
            # ('Washington', 7, 'Hybrid'), ('Washington', 7, 'Remote'), #('Washington', 14, 'Remote'),
            # ('West Virginia', 7, 'On-site'), ('West Virginia', 7, 'Hybrid'), ('West Virginia', 7, 'Remote')
            # ('Wisconsin', 7, 'On-site') ('Wisconsin', 30, 'Hybrid'), ('Wisconsin', 30, 'Remote'),
            ]

In [36]:
# Start filter ---------------------------------------------------
only_filter = get_only_filter()
# Start filter ---------------------------------------------------

    
# Iterate each filter
last_state = None
for state, day, worktype in only_filter:
    csv_path = f"datasets/{title}/{state.lower().replace(' ', '_')}.csv"
    # Get job counts for this state
    location = state
    if state != 'District of Columbia':
        location = state + ' State'
    job_filter = SetFilter(title=title.replace('_', ' ').title(), location=location, day=30)
    if state == last_state:
        pass
    else:
        job_counts = job_filter.show_job_counts()
        print(f'\nSave Data to {csv_path}')
        last_state = state
    
    # Print Progress Message
    progress_msg1 = f"#####   |Day: {day}, Worktype: {worktype if worktype else 'All'}|   #####     "
    print_progress(progress_msg1)
        

    # Iterate each page for this filter to scrape jobs
    total_job = current_scraped = empty_page = 0        
    while total_job < 1000:
        # Navigate to different api pages and scraped job listings by adjusting job_num, maximum 1000
        job_filter.update(day=day, worktype=worktype, job_num=total_job)
        url = job_filter.create_url()
        result = scrape_this_page(url, current_scraped, total_job)

        if result != None:
            empty_page = 0
            current_scraped, total_job, df_page = result
        else:
            if empty_page > 3:
                break
            empty_page += 1
            # Print Progress Message
            print_progress(progress_msg1 + f'{current_scraped}/{total_job+10}  (Saved / Total Jobs)')
            total_job += 10
            continue
            
        # Create a new csv file if it doesn't exist, else add new data to the file
        if not os.path.exists(csv_path):               
            df_page.to_csv(f'{csv_path}', index=False)           
        else:
            if df_page is None:
                continue
            elif df_page.empty:
                continue
            else:
                df_page.to_csv(f'{csv_path}', mode='a', header=False, index=False)           

    print()


            --------------------------------------------------------------------------------------------------------
                                       *****  Washington State (Data Analyst)  ***** 

                      Any time (9,449)               On-site (3,512)            Internship (37)                   
                    ✓ Past month (4,943)             Hybrid (788)               Entry level (575)                 
                      Past week (1,832)              Remote (651)               Associate (91)                    
                      Past 24 hours (405)                                       Mid-Senior level (2,731)          
                                                                                Director (18)                     
           --------------------------------------------------------------------------------------------------------

Save Data to datasets/data_analyst/washington.csv
#####   |Day: 1, Worktype: On-site|   #####     38/32

In [318]:
for csv_path in os.listdir('datasets'):
    
    df_wrong = pd.read_csv(f'datasets/{csv_path}')
    is_wrong_row = df_wrong['posted_date'].str.contains(r'On-site|Hybrid|Remote')
    if is_wrong_row.sum() > 0:    
        df_wrong.loc[is_wrong_row, 'worktype'], df_wrong.loc[is_wrong_row, 'posted_date'] = \
            df_wrong.loc[is_wrong_row, 'posted_date'], df_wrong.loc[is_wrong_row, 'worktype']

    cols = list(df_wrong.columns)
    worktype_index = cols.index('worktype')
    date_index = cols.index('posted_date')
    cols[worktype_index], cols[date_index] = cols[date_index], cols[worktype_index]
    df_fix = df_wrong[cols]
    df_fix.drop_duplicates(subset=['title', 'company', 'location', 'posted_date'], keep='last', inplace=True)
    df_fix.to_csv(f'datasets/{csv_path}', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fix.drop_duplicates(subset=['title', 'company', 'location', 'posted_date'], keep='last', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fix.drop_duplicates(subset=['title', 'company', 'location', 'posted_date'], keep='last', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fix.drop_duplicates(subset=['title', 'company', 'location', 'posted_date'], keep='last', inplace=True)
A value is trying to be set on a copy of a slice from a Da

# Test

In [59]:
def scrape_this_page(url):
    '''
    Scrape all jobs in this page given an url
    '''     
    # Try to get job card elements for current page, return None if none  
    for try_ in range(5):
        response = requests.get(url)
        html = BeautifulSoup(response.content, 'html.parser')  
        job_card_elements = html.find_all('div', class_='base-search-card--link')
        if job_card_elements:
            jobs_in_page = len(job_card_elements)
            break            
        elif try_ == 4:
            print(f'  *** EMPTY PAGE ***', end='')
            return 0, 0, None
        if try_ < 3:
            time.sleep(random.randint(3,9))
        else:
            time.sleep(random.randint(15,30))

    # Scrape the job information for each job
    df_page = pd.DataFrame()
    job_scraped = 0
    for job in job_card_elements:
        df_job = {}
        df_job['title'] = job.find('h3', class_='base-search-card__title').text.strip()
        df_job['company'] = job.find('h4', class_='base-search-card__subtitle').text.strip()
        df_job['location'] = job.find('span', class_='job-search-card__location').text.strip()
        df_job['worktype'] = worktype if worktype else None
        date_1 = job.find('time', class_='job-search-card__listdate')
        date_2 = job.find('time', class_='job-search-card__listdate--new')
        df_job['posted_date'] = date_1['datetime'] if date_1 else date_2['datetime']            
        salary = job.find('span', class_='job-search-card__salary-info')
        df_job['salary'] = salary.text.strip() if salary else None
        df_job['job_link'] = job.find('a', class_='base-card__full-link')['href']            

        # Go inside the job link and scrape job description and other information
        # Discard all information about the job if job description is empty
        skip = False
        for try_ in range(5):
            job_response = requests.get(df_job['job_link'])
            job_html = BeautifulSoup(job_response.content, 'html.parser')
            job_description = job_html.find('div', class_='show-more-less-html__markup')                
            if job_description:
                break
            if try_ == 4:
                skip = True
                break
            time.sleep(random.randint(1,5))                               
        if skip:
            continue
        df_job['job_description'] = job_description.text
                               
        applicants_1 = job_html.find('figcaption', class_='num-applicants__caption')
        applicants_2 = job_html.find('span', class_='num-applicants__caption')
        df_job['applicants'] = applicants_1.text if applicants_1 else applicants_2.text

        criteria = job_html.find_all('li', class_='description__job-criteria-item')
        for criterion in criteria:
            feature = criterion.find('h3', class_='description__job-criteria-subheader').text.strip()
            value = criterion.find('span', class_='description__job-criteria-text--criteria').text.strip()
            df_job[feature] = value

        df_job = pd.DataFrame([df_job]) 
        df_page = df_job if df_page.empty else pd.concat([df_page, df_job], ignore_index=True)  
        job_scraped += 1
        print('.', end='')
    
    return job_scraped, jobs_in_page, df_page

In [18]:
title = 'Data Analyst' 
states = get_states()

# Get used filters ---------------------------------------------------
used_filter = get_used_filter()
# Get used filters ---------------------------------------------------

# Create a dataset file if it doesn't exist
dataset_folder = 'datasets'
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)
    
# Iterate each state
for state in states:
    
    # Skip used states -----------------------------------------------
    if state in used_filter:
        continue
    # Skip used states -----------------------------------------------
    
    # Create a State file if it doesn't exist
    state_folder = state.lower().replace(' ', '_')
    if not os.path.exists(f'{dataset_folder}/{state_folder}'):
        os.makedirs(f'{dataset_folder}/{state_folder}')
    
    location = state
    if state != 'District of Columbia':
        location = state + ' State'
    job_filter = SetFilter(title=title, location=location, day=30)
    job_counts = job_filter.show_job_counts()
    filter_iter = decide_scrape_scope(job_counts)
    
    # Iterate each filter set
    for day, worktype in filter_iter:
        
        # Skip filters already used--------------------------------------------------------
        if [day, worktype] in used_filter:
            continue
        # Skip filters already used--------------------------------------------------------
        
        print(f"\n#####   |Day: {day}, Worktype: {worktype if worktype else 'All'}|   #####     ", end='')
        # Set the path of csv file         
        string = f"{state.lower()} {day}d {worktype.lower().replace('-', '')}".strip().replace(' ', '_')
        csv_file = f'{string}.csv'
        csv_path = f'{dataset_folder}/{state_folder}/{csv_file}'
        
        # Iterate each page for this filter to scrape jobs
        total_job = current_scraped = empty_page = 0        
        while total_job < 1000:
            # Navigate to different api pages and scraped job listings by adjusting job_num, maximum 1000
            job_filter.update(day=day, worktype=worktype, job_num=total_job)
            url = job_filter.create_url()
            result = scrape_this_page(url)
            
            if result != (0, 0, None):
                empty_page = 0
                job_scraped, jobs_in_page, df_page = result
            else:
                if empty_page > 3:
                    break
                empty_page += 1
                total_job += 5
                print(f'{current_scraped}/{total_job}+10 ', end='')
                continue
                
            current_scraped += job_scraped
            total_job += jobs_in_page
            # Create a new csv file if it doesn't exist, else add new data to the file
            if not os.path.exists(csv_path):               
                df_page.to_csv(f'{csv_path}', index=False)           
            else:
                df_page.to_csv(f'{csv_path}', mode='a', header=False, index=False)           
            
            print(f'{current_scraped}/{total_job} ', end='')


            --------------------------------------------------------------------------------------------------------
                                              *****  Washington State  ***** 

                      Any time (9,004)               On-site (4,299)            Internship (38)                   
                    ✓ Past month (5,999)             Hybrid (911)               Entry level (691)                 
                      Past week (1,693)              Remote (823)               Associate (155)                   
                      Past 24 hours (333)                                       Mid-Senior level (3,432)          
                                                                                Director (38)                     
           --------------------------------------------------------------------------------------------------------

#####   |Day: 7, Worktype: On-site|   #####     ..................................................50/60 .......

KeyboardInterrupt: 

In [27]:
title = 'Data Analyst' 
states = get_states()

# Get used filters ---------------------------------------------------
used_filter = get_used_filter()
# Get used filters ---------------------------------------------------

# Create a dataset file if it doesn't exist
dataset_folder = 'datasets'
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)
    
# Iterate each state
for state in states:
    
    # Skip used states -----------------------------------------------
    if state in used_filter:
        continue
    # Skip used states -----------------------------------------------
    
    # Create a State file if it doesn't exist
    state_folder = state.lower().replace(' ', '_')
    if not os.path.exists(f'{dataset_folder}/{state_folder}'):
        os.makedirs(f'{dataset_folder}/{state_folder}')
    
    location = state
    if state != 'District of Columbia':
        location = state + ' State'
    job_filter = SetFilter(title=title, location=location, day=30)
    job_counts = job_filter.show_job_counts()
    filter_iter = decide_scrape_scope(job_counts)
    
    # Iterate each filter set
    for day, worktype in filter_iter:
        
        # Skip filters already used--------------------------------------------------------
        if [day, worktype] in used_filter:
            continue
        # Skip filters already used--------------------------------------------------------
        
        print(f"\n#####   |Day: {day}, Worktype: {worktype if worktype else 'All'}|   #####     ", end='')
        # Set the path of csv file         
        string = f"{state.lower()} {day}d {worktype.lower().replace('-', '')}".strip().replace(' ', '_')
        csv_file = f'{string}.csv'
        csv_path = f'{dataset_folder}/{state_folder}/{csv_file}'
        
        # Iterate each page for this filter to scrape jobs
        total_job = current_scraped = empty_page = 0        
        while total_job < 1000:
            # Navigate to different api pages and scraped job listings by adjusting job_num, maximum 1000
            job_filter.update(day=day, worktype=worktype, job_num=total_job)
            url = job_filter.create_url()
            result = scrape_this_page(url)
            
            if result != (0, 0, None):
                empty_page = 0
                job_scraped, jobs_in_page, df_page = result
            else:
                if empty_page > 3:
                    break
                empty_page += 1
                print(f'{current_scraped}/{total_job}+5 ', end='')
                total_job += 5
                continue
                
            current_scraped += job_scraped
            total_job += jobs_in_page
            # Create a new csv file if it doesn't exist, else add new data to the file
            if not os.path.exists(csv_path):               
                df_page.to_csv(f'{csv_path}', index=False)           
            else:
                df_page.to_csv(f'{csv_path}', mode='a', header=False, index=False)           
            
            print(f'{current_scraped}/{total_job} ', end='')


            --------------------------------------------------------------------------------------------------------
                                              *****  Washington State  ***** 

                      Any time (8,979)               On-site (4,304)            Internship (37)                   
                    ✓ Past month (5,952)             Hybrid (899)               Entry level (690)                 
                      Past week (1,649)              Remote (816)               Associate (148)                   
                      Past 24 hours (290)                                       Mid-Senior level (3,412)          
                                                                                Director (38)                     
           --------------------------------------------------------------------------------------------------------

#####   |Day: 7, Worktype: On-site|   #####     ......................................................54/60 ...

KeyboardInterrupt: 

In [22]:
import torch

In [24]:
torch.cuda.is_available()

True