<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br>

# Vector Representations
## *Data Science Unit 4 Sprint 2 Assignment 2*

In [1]:
import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

## 1) *Optional:* Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum your final dataframe of job listings should contain
- Job Title
- Job Description

If you choose not to scrape the data, there is a CSV with outdated data in the directory. Remember, if you scrape Indeed, you're helping yourself find a job. ;)

In [21]:
import re
import json
from bs4 import BeautifulSoup
from selenium import webdriver



def get_soup(url):
    """
    Load a page in a Chrome WebDriver session and parse the rendered HTML.

    Parameters:
        url: the link to get soup object for

    Returns:
        soup: BeautifulSoup object built from the page source with 'html.parser'
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always release the browser, even when the page fails to load.
        # quit() ends the whole WebDriver session instead of only closing
        # the current window, so no driver process is left behind.
        driver.quit()

    return BeautifulSoup(html, 'html.parser')



def grab_job_links(soup):
    """
    Grab the job posting links from an Indeed search result page.

    Every <div class="title"> element is inspected; entries that have no
    anchor, or whose anchor carries no "href" attribute (the markup the
    original author observed for sponsored postings), are skipped.

    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20

    Returns:
        urls: a python list of absolute job posting urls

    """
    urls = []

    for title_div in soup.find_all('div', {'class': 'title'}):
        anchor = title_div.a
        partial_url = anchor.get('href') if anchor is not None else None
        if not partial_url:
            # No usable link in this entry: skip it instead of crashing
            # while concatenating None to the prefix.
            continue
        # The href is site-relative, so attach the host prefix
        urls.append('https://indeed.com' + partial_url)

    return urls



def _parse_posting_count(count_text, default=330):
    """Return the total posting count found in the search-count text, or `default`."""
    # The total follows the word "of" (e.g. "Page 1 of 1,234 jobs"); when
    # there is no "of", look for a number in the whole string instead.
    _, separator, tail = count_text.partition('of')
    if not separator:
        tail = count_text
    # Drop thousands separators so "1,234" is read as 1234, not 1.
    match = re.search(r'\d+', tail.replace(',', ''))
    return int(match.group(0)) if match else default


def get_urls(query, num_pages, location):
    """
    Get all the job posting URLs resulted from a specific search.

    Parameters:
        query: job title to query (already in Indeed's '+'-joined format)
        num_pages: number of result pages needed
        location: city to search in

    Returns:
        urls: a list of job posting URL's (when num_pages valid)
        max_pages: maximum number of pages allowed (when num_pages invalid)
    """
    # We always need the first page
    base_url = 'https://indeed.com/jobs?q={}&l={}'.format(query, location)
    soup = get_soup(base_url)
    urls = grab_job_links(soup)

    # Get the total number of postings found; fall back to 330 only when
    # the count element is missing or holds no number.
    count_div = soup.find(name='div', attrs={'id': "searchCount"})
    count_text = count_div.get_text() if count_div is not None else ''
    posting_count = _parse_posting_count(count_text)

    # Limit the number of pages to get. The first page has already been
    # fetched above, so one page is always a valid request.
    max_pages = max(1, round(posting_count / 10) - 3)
    if num_pages > max_pages:
        print('returning max_pages!!')
        return max_pages

    # Page 1 was handled above; the range is empty when num_pages < 2
    for page in range(2, num_pages + 1):
        start = (page - 1) * 10
        page_url = 'https://indeed.com/jobs?q={}&l={}&start={}'.format(query, location, start)
        try:
            # We always combine the results back to the list
            urls += grab_job_links(get_soup(page_url))
        except Exception as err:
            # Best effort: a page that fails to load is reported and skipped
            print('Skipping results page {} ({}): {}'.format(page, page_url, err))

    return urls



def get_posting(url):
    """
    Get the title and the full text of the job posting at a given url.

    Both values are lower-cased. No filtering on the title is applied.

    Parameters:
        url: The job posting link

    Returns:
        title: the job title (text of the first h3 tag)
        posting: the job posting content (text of the "jobsearch-JobComponent" div)

    Raises:
        ValueError: when the page has no h3 tag or no job component div
    """
    # Get the url content as BS object
    soup = get_soup(url)

    title_tag = soup.find(name='h3')
    body_tag = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"})
    if title_tag is None or body_tag is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on None.
        raise ValueError('No job title or description found at {}'.format(url))

    return title_tag.get_text().lower(), body_tag.get_text().lower()

        
    #if 'data scientist' in title:  # We'll proceed to grab the job posting text if the title is correct
        # All the text info is contained in the div element with the below class, extract the text.
        #posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"}).get_text()
        #return title, posting.lower()
    #else:
        #return False
    
        # Get rid of numbers and symbols other than given
        #text = re.sub("[^a-zA-Z'+#&]", " ", text)
        # Convert to lower case and split to list and then set
        #text = text.lower().strip()
    
        #return text



def get_data(query, num_pages, location='United States'):
    """
    Get all the job posting data and save in a json file using below structure:

    {<count>: {'title': ..., 'posting':..., 'url':...}...}

    The json file name has this format: "<query>.json" (spaces in the query
    become underscores).

    Parameters:
        query: Indeed query keyword such as 'Data Scientist'
        num_pages: Number of search result pages needed
        location: location to search for

    Returns:
        None. The postings are written to the json file; progress and the
        final status are printed.

    """
    # Convert the queried title to Indeed format
    query = '+'.join(query.lower().split())

    urls = get_urls(query, num_pages, location)

    # When the requested number of pages is invalid, a number is returned instead of a list
    if not isinstance(urls, list):
        print("Due to similar results, maximum number of pages is only {}. Please try again!".format(urls))
        return

    postings_dict = {}
    num_urls = len(urls)
    failed = 0
    for i, url in enumerate(urls):
        try:
            title, posting = get_posting(url)
        except Exception:
            # Best effort: a posting that cannot be read is counted and skipped
            failed += 1
        else:
            postings_dict[i] = {'title': title, 'posting': posting, 'url': url}

        percent = (i+1) / num_urls
        # Print the progress; the "end" arg keeps the message in the same line
        print("Progress: {:2.0f}%".format(100*percent), end='\r')

    # Save the dict as json file
    file_name = query.replace('+', '_') + '.json'
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(postings_dict, f)

    if failed:
        print('{} of {} postings have been scraped and saved ({} could not be read).'.format(
            len(postings_dict), num_urls, failed))
    else:
        print('All {} postings have been scraped and saved!'.format(num_urls))


In [22]:
 # Scrape one results page of "data journalist" postings; get_data writes
 # the result to data_journalist.json.
 get_data('data journalist', 1, location='United States')

All 15 postings have been scraped and saved!


In [None]:
# Load the provided job listings CSV into a DataFrame
df = pd.read_csv('data/job_listings.csv', encoding='utf-8')

In [None]:
# Preview the first rows of the loaded listings
df.head()

## 2) Use spaCy to tokenize / clean the listings

In [None]:
from bs4 import BeautifulSoup

def clean_html(text):
    """
    Strip the HTML markup from a string.

    Parameters:
        text: string that may contain HTML tags

    Returns:
        the text content with all tags removed
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ per machine.
    # 'html.parser' matches what get_soup uses.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [None]:
# Store the tag-free version of every description in a new column
df['no_html'] = df['description'].apply(clean_html)

In [None]:
import re
def clean_hex(text):
    """
    Normalise a listing that was stored as the printed form of a bytes object.

    Steps: drop the surrounding b'...' / b"..." wrapper, remove escaped
    bytes such as \\xe2, turn newlines/tabs (real or escaped) into spaces,
    remove punctuation, digits, backslashes and double quotes, lower-case
    the text and collapse runs of spaces.

    Parameters:
        text: the string to clean

    Returns:
        the cleaned, lower-cased string (leading/trailing spaces are kept)
    """
    # Remove the bytes-literal wrapper by position. Stripping the characters
    # "'" and "b" from both ends would also eat real letters, e.g. the first
    # letter of "bachelor" or the last letter of "job".
    if len(text) >= 3 and text[0] == 'b' and text[1] in ('"', "'") and text[-1] == text[1]:
        text = text[2:-1]

    # Escaped bytes are a backslash, "x" and exactly two hex digits; this
    # also covers pairs without a decimal digit such as \xef or \xbb.
    new_text = re.sub(r"\\x[0-9a-fA-F]{2}", '', text)
    # Real and escaped line breaks / tabs separate words
    new_text = re.sub(r"\\[nrt]|[\n\r\t]", ' ', new_text)
    new_text = re.sub(r"[!@#$+%*:()'-]", '', new_text)  # remove punc.
    new_text = re.sub(r'\d+', '', new_text)  # remove numbers
    new_text = new_text.replace('\\', '').replace('"', '')
    new_text = new_text.lower()
    return re.sub(' +', ' ', new_text)

In [None]:
# Clean the tag-free text in place with clean_hex
df['no_html'] = df['no_html'].apply(clean_hex)

In [None]:
# Display the cleaned text column
df['no_html']

In [None]:
import spacy

# Load the 'en_core_web_lg' spaCy pipeline; get_lemmas below uses this
# `nlp` object (presumably the model package must already be installed).
nlp = spacy.load('en_core_web_lg')

In [None]:
def get_lemmas(text):
    """
    Return the lemmas of a text, leaving out uninformative tokens.

    The text is run through the module-level `nlp` pipeline. Stop words,
    punctuation and tokens tagged as pronouns are dropped.

    Parameters:
        text: the string to process

    Returns:
        a list with the lemma of every remaining token, in document order
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct) and token.pos_ != 'PRON'
    ]

In [None]:
# Lemmatise every cleaned listing; each row of 'tokens' holds a list of lemmas
df['tokens'] = df['no_html'].apply(get_lemmas)

In [None]:
# Display the token lists
df['tokens']

## 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

data = df['no_html']

# Ignore English stop words and keep only terms that occur in at least 5%
# and at most 90% of the listings
vect = CountVectorizer(stop_words='english', min_df=0.05, max_df=0.90)

# Learn the vocabulary and count the terms in one pass
sparse_dtm = vect.fit_transform(data)

# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when available and fall back on older versions.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# toarray() yields a plain ndarray rather than the np.matrix from todense()
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=feature_names)

In [None]:
# Display the document-term matrix of word counts
dtm

## 4) Visualize the most common word counts

In [None]:
from collections import Counter

def count(docs):
    """
    Summarise word frequencies over a collection of tokenised documents.

    Parameters:
        docs: a sized iterable of token lists (one list per document)

    Returns:
        a DataFrame sorted by rank with one row per word and the columns
        'word', 'appears_in' (number of documents containing the word),
        'count' (total occurrences), 'rank' (1 = most frequent, ties broken
        by first appearance), 'pct_total' (share of all occurrences),
        'cul_pct_total' (running sum of pct_total in rank order) and
        'appears_in_pct' (share of documents containing the word)
    """
    term_totals = Counter()
    doc_hits = Counter()
    n_docs = len(docs)

    for tokens in docs:
        term_totals.update(tokens)
        # A set makes each word count once per document
        doc_hits.update(set(tokens))

    wc = pd.DataFrame(list(term_totals.items()), columns=['word', 'count'])
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    wc['pct_total'] = wc['count'] / wc['count'].sum()

    # The cumulative share only makes sense in rank order
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    ac = pd.DataFrame(list(doc_hits.items()), columns=['word', 'appears_in'])
    wc = ac.merge(wc, on='word')
    wc['appears_in_pct'] = wc['appears_in'] / n_docs

    return wc.sort_values(by='rank')

In [None]:
# Build the word-frequency table from the lemmatised listings
wc = count(df['tokens'])

In [None]:
# Show the first 20 rows (count returns the table sorted by rank)
wc.head(20)

In [None]:
import squarify
import matplotlib.pyplot as plt

# Keep the 20 best-ranked words
wc_top20 = wc[wc['rank'] <= 20]

# Treemap: each tile is sized by the word's share of all occurrences
squarify.plot(
    sizes=wc_top20['pct_total'],
    label=wc_top20['word'],
    alpha=.8,
)
plt.axis('off')
plt.show()

## 5) Use Scikit-Learn's TfidfVectorizer to get a TF-IDF feature matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = df['no_html']
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the TF-IDF weights per document
sparse = tfidf.fit_transform(data)

# Get feature names to use as dataframe column headers.
# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when available and fall back on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a plain ndarray rather than the np.matrix from todense()
dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

# View Feature Matrix as DataFrame
dtm.head()

## 6) Create a NearestNeighbors model. Write the description of your ideal data science job and query your job listings.

In [None]:
# Instantiate
from sklearn.neighbors import NearestNeighbors


# Fit on TF-IDF Vectors: the model returns the 5 closest listings per query
# and searches with a ball tree. No metric is passed, so the estimator's
# default distance is used.
nn  = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm)

In [None]:
# Sanity check: query with the first listing's own vector (wrapped in a
# list so it is passed as a single-sample batch)
nn.kneighbors([dtm.iloc[0]])

In [None]:
# Free-text description of the ideal job, kept in a one-element list
# because it is passed to tfidf.transform as a collection of documents
ideal_job = [ """Data journalist. Looking to hire an analytical journalist with strong grasp of data analysis, data storytelling and data visualizations. Is able to find news
stories in large volumes of data and create thought-provoking and revealing visuals with the data. Strong reporting skills. Is proficient in Python, pandas, d3, graphing libraries,
html, javascript, matplotlib."""]


In [None]:
# Project the description into the TF-IDF space learned from the listings
new = tfidf.transform(ideal_job)

# Pass a plain ndarray: todense() returns np.matrix, which recent
# scikit-learn releases reject as estimator input
nn.kneighbors(new.toarray())

In [None]:
# Read the cleaned text of the listing labelled 147 (presumably one of the
# neighbors returned by the query above; its output is not shown here)
data[147]

## Stretch Goals

 - Try different visualizations for words and frequencies - what story do you want to tell with the data?
 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings?
 - Try to identify the experience requirements and the specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.
 - Create a labeled dataset - which jobs will you apply for? Train a model to select the jobs you are most likely to apply for. :) 