In [166]:
import datetime as dt
import re
import string
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment, NavigableString, SoupStrainer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) (optional) Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum your final dataframe of job listings should contain
- Job Title
- Job Description

In [227]:
class IndeedScraper:
    """Scrape Indeed.com search results for given job keywords and locations.

    Typical flow: ``query_generator`` -> ``search_results_numeric`` ->
    ``search_page_generator`` -> ``extract_job_postings`` ->
    ``process_job_posting`` (once per posting).
    """

    # Prefix for the relative job links found on search-result pages.
    BASE_URL = "https://www.indeed.com"
    # Placeholder stored for any field that is absent from a posting.
    MISSING = 'NA'

    def query_generator(self, keyword, location):
        """Build the advanced-search URL for a keyword (or keywords) and a location.

        Both values are URL-encoded, so inputs containing spaces, commas or
        ``&`` (e.g. "data scientist", "Columbia, MD") produce a valid query.

        Args:
            keyword: search term(s) placed in the ``as_and`` parameter.
            location: value placed in the ``l`` parameter.

        Returns:
            str: the search URL, without paging parameters.
        """
        keyword = quote_plus(str(keyword))
        location = quote_plus(str(location))

        string1 = f"https://www.indeed.com/jobs?as_and={keyword}&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&"
        string2 = f"sr=directhire&as_src=&salary=&radius=50&l={location}&fromage=any&sort=&psf=advsrch"

        return string1 + string2

    def search_results_numeric(self, query):
        """Fetch a search page and return the number of results it reports.

        Only the element with id ``searchCount`` is parsed; the last number in
        its text is taken as the total.

        Args:
            query: search URL, e.g. from query_generator().

        Returns:
            int: the reported number of results, or 0 when the page contains
            no count.
        """
        search_count = SoupStrainer(id="searchCount")
        page = requests.get(query)
        soup = BeautifulSoup(page.text, "html.parser", parse_only=search_count)
        return self._parse_result_count(soup.get_text())

    @staticmethod
    def _parse_result_count(text):
        """Return the last number found in ``text`` as an int (0 if there is none)."""
        # Accept thousands separators: "1,234" must be read as 1234, not as 1.
        numbers = re.findall(r'\d[\d,]*', text)
        if not numbers:
            return 0
        return int(numbers[-1].replace(',', ''))

    def search_page_generator(self, query, num_search_results, limit=50):
        """Build one paged URL per page of results.

        Args:
            query: search URL, e.g. from query_generator().
            num_search_results: total number of results (int or numeric string).
            limit: results per page; also used as the step between ``start``
                offsets so that no results are skipped or requested twice.

        Returns:
            list[str]: the URLs to scrape; empty when there are no results.

        Raises:
            ValueError: if ``limit`` is not positive.
        """
        if limit <= 0:
            raise ValueError("limit must be a positive integer")

        total = int(num_search_results)
        # Ceiling division: an exact multiple of `limit` must not add an empty page.
        page_count = -(-total // limit)

        return [
            f'{query}&limit={limit}&start={page_number * limit}'
            for page_number in range(page_count)
        ]

    def make_soup(self, query, parser='html.parser'):
        """Fetch ``query`` and return its content as a BeautifulSoup object.

        Args:
            query: URL to request.
            parser: BeautifulSoup parser name; defaults to 'html.parser'.
        """
        page = requests.get(query)
        return BeautifulSoup(page.text, parser)

    def extract_job_postings(self, url_list):
        """Collect the job-posting elements from every search-results page.

        Args:
            url_list: page URLs, e.g. from search_page_generator().

        Returns:
            list: one element per ``div`` whose ``data-tn-component`` attribute
            is ``organicJob``, in page order.
        """
        job_postings = []

        for url in url_list:
            # Use this instance rather than a notebook-level `scraper` global.
            soup = self.make_soup(url)
            job_postings.extend(
                soup.find_all('div', attrs={'data-tn-component': 'organicJob'})
            )

        return job_postings

    @classmethod
    def _text_or_na(cls, element, strip=True):
        """Return the text of ``element`` (optionally stripped), or 'NA' if it is None."""
        if element is None:
            return cls.MISSING
        text = element.get_text()
        return text.strip() if strip else text

    def process_job_posting(self, job):
        """Extract the fields of one search-result posting.

        The full description is not part of the search-result element, so the
        posting's own page is fetched (one extra HTTP request per call).

        Args:
            job: element for a single posting, as returned by
                extract_job_postings().

        Returns:
            list: ``[job_id, job_title, company, location, post_date, salary,
            summary, job_link, description]``. Any field whose element is
            missing is returned as 'NA'.
        """
        heading = job.find('h2', attrs={"class": "jobtitle"})

        # job id -- a missing heading or missing attribute both yield 'NA'
        job_id = self.MISSING if heading is None else heading.get('id', self.MISSING)

        # job title -- capitalized only when present, so the placeholder stays 'NA'
        title_element = job.find('a', attrs={'data-tn-element': "jobTitle"})
        if title_element is None:
            job_title = self.MISSING
        else:
            job_title = title_element.get_text().strip().capitalize()

        company = self._text_or_na(job.find('span', class_='company'))
        # location and date are returned without stripping surrounding whitespace
        location = self._text_or_na(job.find('span', class_='location'), strip=False)
        post_date = self._text_or_na(job.find('span', class_='date'), strip=False)
        salary = self._text_or_na(job.find('span', class_='salary no-wrap'))
        summary = self._text_or_na(job.find('span', class_='summary'))

        # job link -- the href on the result page is relative to the site root
        anchor = None if heading is None else heading.find('a')
        href = None if anchor is None else anchor.get('href')
        job_link = (self.BASE_URL + href) if href else self.MISSING

        # full description -- only fetched when there is a link to follow
        description = self.MISSING
        if job_link != self.MISSING:
            posting_soup = self.make_soup(job_link)
            description = self._text_or_na(
                posting_soup.find('div', attrs={"class": "jobsearch-JobComponent-description"})
            )

        return [job_id, job_title, company, location, post_date, salary, summary, job_link, description]
        



https://www.indeed.com/jobs?as_and=tensorflow&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&sr=directhire&as_src=&salary=&radius=50&l=21044&fromage=any&sort=&psf=advsrch
230
237


In [233]:
        
# =========================== EXAMPLE USAGE ==============================================
# Create the scraper. Later cells refer to this object by the name `scraper`.
scraper = IndeedScraper()
# Build the search URL from a keyword (or keywords) and a location.
# NOTE(review): "21044" looks like a ZIP code -- confirm.
test_query = scraper.query_generator("tensorflow", "21044")
# Fetch the search page once and read the number of results it reports.
num_search_results = scraper.search_results_numeric(test_query)
# Build one paged URL per page of results.
urls = scraper.search_page_generator(test_query, num_search_results)

# Optional: extract_job_postings() already calls make_soup() for every URL, so this is only
# useful for checking make_soup() on its own.
#soup = scraper.make_soup(test_query)

# Fetch every search-results page and collect the job-posting elements found on them.
# The full description of a posting is fetched later, in process_job_posting().
job_postings = scraper.extract_job_postings(urls)

#============================================== debug ==============================================
#print(test_query)
#print(num_search_results)
#print(urls)
#print(soup)
#print(len(job_postings))
#============================================== debug ==============================================

# Sanity check: parse the first posting (this requests that posting's page) and show the fields.
test = scraper.process_job_posting(job_postings[0])
print(test)

['jl_51b0735e7d2e01d8', 'Geospatial data scientist', 'Chesapeake Conservancy Inc', 'Annapolis, MD 21401', '1 day ago', '$100,000 - $125,000 a year', 'Experience with CNTK, Tensorflow, Keras. Chesapeake Conservancy, a nonprofit organization based in Annapolis, Maryland is seeking a Geospatial Data Scientist...', 'https://www.indeed.com/rc/clk?jk=51b0735e7d2e01d8&fccid=51c576784f5d7c9d&vjs=3', '$100,000 - $125,000 a yearDescription:\nChesapeake Conservancy, a nonprofit organization based in Annapolis, Maryland is seeking a Geospatial Data Scientist with experience leveraging machine learning and large and complex data sets in the environmental world.\n\nABOUT THE CONSERVANCY\nChesapeake Conservancy is a non-profit organization based in Annapolis, Maryland, dedicated to ensuring a healthier Chesapeake Bay watershed where fish and wildlife thrive, with healthy waters and abundant forests, wetlands, shorelines, and open spaces. With the human population in the Chesapeake watershed approachi

# Move data to dataframe for processing

 This may take a while to run depending on your query, so narrow down what you want or you might be here a while.  
 If you get NA values, it's because I added some crude handling for Indeed's search limits that stops the
 code from breaking.

In [228]:
# Column labels, in the same order as the list returned by process_job_posting().
columns = [
    'ID', 'Title', 'Company', 'Location', 'Date',
    'Salary', 'Summary', 'Link', 'FullText',
]
# Empty frame that already carries the final schema.
dataframe = pd.DataFrame(columns=columns)

In [229]:


# Parse every scraped posting and build the frame in one step.
# Iterating over job_postings itself (rather than a separately tracked count)
# guarantees every posting is included and no index can run past the list.
# Rebuilding the frame from scratch also keeps this cell safe to re-run:
# it never appends duplicate rows to an existing frame.
# Each process_job_posting() call requests that posting's page, so this can be slow.
records = [scraper.process_job_posting(posting) for posting in job_postings]
dataframe = pd.DataFrame(records, columns=columns)

In [232]:
# Preview the first five rows of the scraped postings.
dataframe.head()

Unnamed: 0,ID,Title,Company,Location,Date,Salary,Summary,Link,FullText
0,jl_51b0735e7d2e01d8,Geospatial data scientist,Chesapeake Conservancy Inc,"Annapolis, MD 21401",1 day ago,"$100,000 - $125,000 a year","Experience with CNTK, Tensorflow, Keras. Chesa...",https://www.indeed.com/rc/clk?jk=51b0735e7d2e0...,"$100,000 - $125,000 a yearDescription:\nChesap..."
1,jl_a276ea2539679009,Yolo developer,K&M Systems,"Tysons Corner, VA",14 hours ago,,"Job Description K&M Systems, Inc. is looking f...",https://www.indeed.com/rc/clk?jk=a276ea2539679...,"ContractJob Description\nK&M Systems, Inc. is ..."
2,jl_ffecc05983f8169b,Summer intern - big data & machine learning,Intelligent Automation,"Rockville, MD 20855",6 hours ago,,"Intelligent Automation, Inc. (IAI) is seeking ...",https://www.indeed.com/rc/clk?jk=ffecc05983f81...,"Temporary, InternshipIntelligent Automation, I..."
3,jl_f790b176479fd4e4,Senior python developer - data engineering and...,B23 LLC,"McLean, VA",30+ days ago,"$85,000 - $175,000 a year","Working knowledge of TensorFlow, TensorFLow Li...",https://www.indeed.com/company/B23/jobs/Senior...,"$85,000 - $175,000 a yearB23 is a software com..."
4,jl_52ad992137a033cf,Intern - artificial intelligence,Alion Science and Technology,"Annapolis Junction, MD 20701",30+ days ago,,Experience with deep learning frameworks and l...,https://www.indeed.com/rc/clk?jk=52ad992137a03...,"Part-time, Temporary, InternshipResponsibiliti..."


In [0]:
##### Your Code Here #####

# 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [0]:
##### Your Code Here #####

# 4) Visualize the most common word counts

In [0]:
##### Your Code Here #####

 # 5) Use Scikit-Learn's TfidfVectorizer to get a TF-IDF feature matrix

In [0]:
##### Your Code Here #####

## Stretch Goals

 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings?
 - Try to identify requirements for experience with specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.