# Web Scraping Indeed.com for Key Web Developer Job Skills - Phase II

## The question is - Do different job clusters require different subsets of web skills?

Which web development skills should an aspiring web developer learn to be hired in Palm Beach County, Florida?

## The client:

Palm Beach Code School (PBCS) in Palm Beach Gardens, FL. - palmbeachcodeschool.com .

## The data set:

The data set starts with a list of the target skills that are included in particular want ads scraped from Indeed.com. 

## Other potential data sets:

We could do a trend analysis by comparing results over several months, or compare Palm Beach County's results with other Florida or US locations.

The process steps are as follows:
* Decide what specific job(s) to be searched - web developer or programmer to start.
* Make a list of all the skills - see the function below.
* Determine the geographical limits of the search - within 100 miles of West Palm Beach.
* Use k-means to cluster the jobs.

To run the scraper, load the following libraries and define the two data extraction / cleaning functions, html_tabulate and web_skills_info (both below).

In [1]:
from bs4 import BeautifulSoup as soup # For HTML parsing
from selenium import webdriver
import requests # Website connection
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
import pandas as pd # For converting results to a dataframe
%matplotlib inline

In [2]:
# Layout of the results table: six descriptive fields for each posting,
# followed by one column per skill keyword looked for in the job text.
columns = [
    "city", "job_title", "company_name", "location", "salary", "website",
] + [
    "Agile", "Android", "Angular", "Bootstrap", "C++", "Cloud", "CSS",
    "Excel", "Fullstack", "HTML", "iOS", "Java", "Javascript", "JQuery",
    "JSON", "Linux", "Mobile", "Python", "MongoDB", "MySQL", "Node",
    "NoSQL", "PHP", "Ruby", "SQL", "Windows", "WordPress", "XML",
]

In [3]:
# Common English words that carry no skill information. Built once at import
# time instead of on every call.
_MANUAL_STOPWORDS = frozenset({
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about',
    'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some',
    'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
    'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
    'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were',
    'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
    'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before',
    'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
    'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now',
    'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
    'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my',
    'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than',
})


def _skill_flags(text, skills):
    '''
    Report which skills occur as whole words in a block of text.
    Inputs: the job description text and the list of skill names
    Outputs: list of booleans, one per skill, in the same order as skills
    '''
    # Keep only letters and '+' (so "C++" survives); everything else becomes a
    # word break. Matching is case-insensitive.
    words = re.sub("[^a-zA-Z+]", " ", text).lower().split()
    # A set makes each skill lookup constant-time; stopwords are dropped as before.
    vocabulary = set(words) - _MANUAL_STOPWORDS
    return [skill.lower() in vocabulary for skill in skills]


def html_tabulate(website, skills):
    '''
    This function loads a job posting and tabulates which skills it mentions.
    Inputs: a URL to investigate and the list of skill names to look for
    Outputs: list of booleans (one per skill, True when the skill appears as a
             word in the job summary), or None when the page has no
             'job_summary' element to read
    '''
    driver = webdriver.Firefox()
    try:
        driver.get(website)
        html = driver.page_source
    finally:
        # Always close the browser, even when the page fails to load.
        driver.quit()

    soup_obj = soup(html, "lxml")

    desc_area = soup_obj.find('span', {'id': 'job_summary'})
    if desc_area is None:
        return None

    # Join text nodes with a space so words in neighbouring tags or lines are
    # not fused together (e.g. "HTML" + "CSS" must not become "HTMLCSS").
    # The text stays a str throughout: the character filter in _skill_flags
    # already discards every non-ASCII character, so no bytes round-trip is
    # needed.
    text = desc_area.get_text(separator=" ")

    return _skill_flags(text, skills)

In [4]:
def _company_name(row):
    '''Return the company shown in one search-result row, or 'Not_found'.'''
    for css_class in ('company', 'result-link-source'):
        span = row.find(name='span', attrs={'class': css_class})
        if span is not None:
            return span.text.strip()
    return 'Not_found'


def _salary_text(row):
    '''Return the salary text shown in one search-result row, or 'Not_found'.'''
    nobr = row.find('nobr')
    if nobr is not None:
        return nobr.text
    # Fallback location: the first <div> inside the row's 'sjcl' block.
    sjcl = row.find(name='div', attrs={'class': 'sjcl'})
    inner = sjcl.find('div') if sjcl is not None else None
    if inner is not None:
        return inner.text.strip()
    return 'Not_found'


def web_skills_info(url, city, out_path="WPBWebJobs.csv"):
    '''
    This function will take a url for web development job postings on Indeed.com.
    It will crawl all of the job postings, tracking key data from each post:
    'city', 'job_title', 'company_name', 'location', 'salary' and 'website'.
    It will then determine which posts use a preset list of typical web skills with
    each term having its own column.

    Input: Indeed first page url for web development search, the city label to
           store with every row, and (optionally) the path of the output file

    Output: .csv file of the pandas dataframe, written to out_path. Nothing is
            returned. Postings for which html_tabulate returns None are skipped.
    '''
    results_per_page = 10  # the '&start=' offset advances in steps of 10
    timeout = 30           # seconds; without it a stalled request blocks forever

    try:
        page = requests.get(url, timeout=timeout)  # Open up the front page of our search first
    except requests.RequestException as err:
        print('Url error - Exiting . . .', err)
        return
    page_obj = soup(page.text, 'lxml')  # Get the html from the site

    count_area = page_obj.find(id='searchCount')  # The total number of jobs found
    if count_area is None:
        print('Could not find the job count on the search page - Exiting . . .')
        return
    # Drop thousands separators so "1,234" is read as 1234; the last number is the total.
    job_numbers = re.findall(r'\d+', count_area.get_text().replace(',', ''))
    if not job_numbers:
        print('Could not read the job count on the search page - Exiting . . .')
        return
    total_num_jobs = int(job_numbers[-1])
    # Round up so a final, partly filled page is still visited; always read page 1.
    num_pages = max(1, (total_num_jobs + results_per_page - 1) // results_per_page)

    skills = columns[6:]  # everything after the six descriptive fields
    rows = []

    for i in range(1, num_pages + 1):  # Loop through search result pages (starting with the current one)
        print('Getting page ' + str(i))
        if i == 1:
            current_page = url
        else:
            start_num = str((i - 1) * results_per_page)
            current_page = ''.join([url, '&start=', start_num])  # Set the next page of 10 jobs
            try:
                html_page = requests.get(current_page, timeout=timeout)
            except requests.RequestException as err:
                print('Could not load page', i, '- skipping it:', err)
                continue
            page_obj = soup(html_page.text, "lxml")

        job_link_area = page_obj.find(id='resultsCol')  # Locate all of the results for this page
        if job_link_area is None:
            print('No results area on page', i, '- skipping it.')
            continue

        # if an <a>'s data-tn-element attribute is 'jobTitle', then its title attribute is the job title
        job_titles = [a.get('title')
                      for a in job_link_area.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})]
        # if a div's id starts with p, then its data-jk attribute is a subpage name
        job_links = [div.get('data-jk')
                     for div in job_link_area.find_all(name='div', attrs={'id': re.compile("^p")})]

        # Exactly one company and one salary per result row keeps the lists aligned.
        result_rows = page_obj.find_all(name='div', attrs={'class': 'row'})
        companies = [_company_name(row) for row in result_rows]
        salaries = [_salary_text(row) for row in result_rows]
        locations = [span.text for span in page_obj.find_all('span', attrs={'class': 'location'})]

        fields = (job_links, job_titles, companies, locations, salaries)
        if len({len(field) for field in fields}) > 1:
            print('Warning: page', i, 'has unequal field counts; unmatched entries are ignored.')

        for link, title, company, location, salary in zip(*fields):
            if link is None:  # a matching div without a data-jk attribute is not a job
                continue
            job_site = current_page + '&vjk=' + link
            skill_flags = html_tabulate(job_site, skills)
            sleep(1)  # Slow down server hits
            if skill_flags is None:  # posting could not be parsed
                continue
            rows.append([city, title, company, location, salary, job_site] + skill_flags)

    # Build the frame once at the end instead of growing it row by row.
    jobs_df = pd.DataFrame(rows, columns=columns)
    jobs_df.index = range(1, len(jobs_df) + 1)  # keep the 1-based row labels in the output file

    print('Done with collecting the job postings!')
    print('There were', len(jobs_df), 'jobs successfully found.')

    jobs_df.to_csv(out_path, encoding='utf-8')
    return

The search within 100 miles of West Palm Beach:

In [5]:
# Indeed search URL, written one query parameter per line; the adjacent
# literals are concatenated into a single string.
web_site = (
    'https://www.indeed.com/jobs'
    '?q=web+%28developer+or+programmer%29'
    '&as_ttl=web'
    '&l=West+Palm+Beach%2C+FL'
    '&radius=100'
)
# Crawl the search and write the results file.
web_skills_info(web_site, 'West Palm Beach')

Getting page 1
Getting page 2
Getting page 3
Getting page 4
Getting page 5
Done with collecting the job postings!
There were 75 jobs successfully found.


In [5]:
# Reload the scraped postings from disk. The file was written together with
# the DataFrame index, which read_csv brings back as the "Unnamed: 0" column.
df_all = pd.read_csv('WPBWebJobs.csv')
df_all.head(5)  # preview the first five postings

Unnamed: 0.1,Unnamed: 0,city,job_title,company_name,location,salary,website,Agile,Android,Angular,...,MongoDB,MySQL,Node,NoSQL,PHP,Ruby,SQL,Windows,WordPress,XML
0,1,West Palm Beach,"Web Programmer Boca Raton, Florida","NCCI Holdings, Inc.","Boca Raton, FL",Not_found,https://www.indeed.com/jobs?q=web+%28developer...,False,False,False,...,False,False,False,False,False,False,True,True,False,False
1,2,West Palm Beach,Backend Web Developer,Vazkor Technologies,"Boynton Beach, FL 33426","$65,000 - $85,000 a year",https://www.indeed.com/jobs?q=web+%28developer...,False,False,False,...,False,True,False,False,True,False,True,False,False,False
2,3,West Palm Beach,Web Designer/Developer,"DDG, Inc.","West Palm Beach, FL",Not_found,https://www.indeed.com/jobs?q=web+%28developer...,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,4,West Palm Beach,Web Design / Programmer,George's Music,"West Palm Beach, FL",Not_found,https://www.indeed.com/jobs?q=web+%28developer...,False,False,False,...,False,True,False,False,True,False,True,False,False,False
4,5,West Palm Beach,"Web Programmer Boca Raton, Florida","NCCI Holdings, Inc.","Boca Raton, FL",Not_found,https://www.indeed.com/jobs?q=web+%28developer...,False,False,False,...,False,False,False,False,False,False,True,True,False,False


## Clean WPBWebJobs.csv to JustKeywords.csv:



In [6]:
# Load the keyword-only table used for clustering.
# NOTE(review): presumably produced by cleaning WPBWebJobs.csv by hand - the
# cleaning step itself is not shown here.
df_clust = pd.read_csv('JustKeywords.csv')
df_clust.head(5)  # preview the first five rows

Unnamed: 0,Agile,Android,Angular,Bootstrap,C++,Cloud,CSS,Excel,HTML,iOS,...,Mobile,Python,MongoDB,MySQL,Node,PHP,SQL,Windows,WordPress,XML
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,1,0,...,1,0,0,1,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
3,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0,0,1,0,0,1,0


In [7]:
# (number of rows, number of columns) of the keyword table
df_clust.shape

(46, 25)

In [8]:
from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so without a fixed seed the
# clusters (and which one is labelled 0 or 1) can change from run to run.
# random_state makes the analysis reproducible; n_init is pinned because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(df_clust)
# Cluster label (0 or 1) for every row of df_clust
y_kmeans = kmeans.predict(df_clust)

In [9]:
# Cluster label predicted for each row of df_clust
y_kmeans

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1])

In [10]:
# With two clusters the labels are 0 or 1, so their sum counts the rows labelled 1.
sum(y_kmeans) # number in cluster 1

22

In [11]:
# Every row not labelled 1 is labelled 0, so subtract the cluster-1 count from the total.
len(y_kmeans) - sum(y_kmeans) # number in cluster 0

24

In [12]:
# Flip the table: each row becomes a keyword, each column a job posting.
df_cl_tots = df_clust.transpose()
df_cl_tots

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
Agile,0,0,0,0,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,0,0
Android,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Angular,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Bootstrap,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
C++,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Cloud,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSS,0,1,0,1,1,1,0,1,1,1,...,1,1,0,0,1,1,1,0,0,1
Excel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HTML,0,1,0,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,0,1,1
iOS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Create two new columns holding each cluster's total per keyword.
# Only the per-job columns are used, so re-running this cell does not fold
# previously computed totals back into the sums.
job_cols = df_cl_tots.drop(columns=['clust0', 'clust1'], errors='ignore')
job_values = job_cols.to_numpy()

# Each job's label is 0 or 1, so a matrix product with the labels adds up the
# cluster-1 jobs for every keyword, and the complement (1 - label) does the
# same for cluster 0. Assigning whole columns avoids the chained
# df[col][i] = ... assignment, which pandas may apply to a temporary copy.
df_cl_tots['clust0'] = job_values @ (1 - y_kmeans)
df_cl_tots['clust1'] = job_values @ y_kmeans
df_cl_tots

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,clust0,clust1
Agile,0,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,5,8
Android,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,1
Angular,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,2,3
Bootstrap,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,4
C++,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
Cloud,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1
CSS,0,1,0,1,1,1,0,1,1,1,...,0,0,1,1,1,0,0,1,3,22
Excel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
HTML,0,1,0,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,1,8,22
iOS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [15]:
# Ten keywords with the highest totals in cluster 1
df_cl_tots.sort_values(by='clust1', ascending=False).iloc[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,clust0,clust1
CSS,0,1,0,1,1,1,0,1,1,1,...,0,0,1,1,1,0,0,1,3,22
HTML,0,1,0,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,1,8,22
Javascript,1,1,0,0,1,1,0,1,1,1,...,0,0,1,1,0,1,0,1,6,19
JQuery,1,1,0,0,1,1,0,1,0,1,...,0,0,1,0,0,0,0,0,3,14
Mobile,0,1,0,1,1,1,0,1,0,0,...,1,1,1,0,0,0,0,0,5,13
Agile,0,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,5,8
PHP,0,1,1,0,1,0,1,1,0,0,...,0,0,0,1,1,1,1,1,12,7
WordPress,0,0,0,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,5,6
Node,0,0,0,0,0,1,0,1,1,0,...,0,0,1,0,0,0,0,0,1,5
Linux,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,3,4


In [16]:
# Ten keywords with the highest totals in cluster 0
df_cl_tots.sort_values(by='clust0', ascending=False).iloc[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,clust0,clust1
SQL,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,13,4
PHP,0,1,1,0,1,0,1,1,0,0,...,0,0,0,1,1,1,1,1,12,7
MySQL,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,9,2
HTML,0,1,0,1,1,1,1,1,1,1,...,1,0,1,1,1,0,1,1,8,22
Java,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,7,2
Javascript,1,1,0,0,1,1,0,1,1,1,...,0,0,1,1,0,1,0,1,6,19
Agile,0,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,5,8
WordPress,0,0,0,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,5,6
Mobile,0,1,0,1,1,1,0,1,0,0,...,1,1,1,0,0,0,0,0,5,13
Cloud,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1
