# Job Search Page Scraper
This project aims to find out the most popular skills required in a certain role.

Proposed method is to use web scraping (BeautifulSoup + Selenium) to extract many job listings from popular job search websites, then find the most popular words (keywords) used in their description using Tf-Idf techniques as provided by Scikit-Learn's ```CountVectorizer``` and ```TfIdfTransformer```.

## 1. Web scraping
Details of experimentation can be found in ```Scraping_Test.ipynb```.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from sklearn.feature_extraction.text import TfidfTransformer
from bs4 import BeautifulSoup
from datetime import datetime
import os
import sys
import pandas as pd
import requests
import pprint
import re
import unicodedata

In [2]:
job_ad_sites = pd.read_csv('Config\job_ad_sites.csv')
job_ad_sites.fillna(-999999, inplace=True)
job_ad_sites.Result_item = job_ad_sites.Result_item.astype(int)
job_ad_sites.Title_item = job_ad_sites.Title_item.astype(int)
job_ad_sites.Company_item = job_ad_sites.Company_item.astype(int)
job_ad_sites.Location_item = job_ad_sites.Location_item.astype(int)
job_ad_sites.Description_item = job_ad_sites.Description_item.astype(int)
job_ad_sites.URL_item = job_ad_sites.URL_item.astype(int)

In [6]:
current_datetime = datetime.now()
normal_form = "NFKC"
# Headless option meaning not show browser window
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")

DRIVER_PATH = '<set up Chrome web driver path>'
# driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
# Store stdout as object (restore later)
stdout_obj = sys.stdout
with open("JobReport_" + current_datetime.strftime("%Y%m%d_%H%M%S") + ".txt", "w", encoding = 'utf-8') as report_file:
    # Set everything printed to stdout to file instead
    sys.stdout = report_file
    count = 0
    print("Job Report on " + current_datetime.strftime("%d/%m/%Y %H:%M:%S"))
    print("=================================")
    for index, row in job_ad_sites.iterrows():
        page = requests.get(row['Listing_URL'])
        soup = BeautifulSoup(page.content, 'html.parser')
        if row['Result_attribute'] == "id":
            results = soup.find_all(row['Result_name'], id=row['Result_tag'])[int(row['Result_item'])]
        elif row['Result_attribute'] == 'class':
            results = soup.find_all(row['Result_name'], class_=row['Result_tag'])[int(row['Result_item'])]
        if row['Element_attribute'] == "id":
            job_elems = results.find_all(row['Element_name'], id=row['Element_tag'])
        elif row['Element_attribute'] == 'class':
            job_elems = results.find_all(row['Element_name'], class_=row['Element_tag'])

        for job_elem in job_elems:
            title_elem = None
            company_elem = None
            location_elem = None
            URL = None
            description_elem = None
            content = None
            if row['Title_attribute'] == "id":
                title_elem = job_elem.find_all(row['Title_name'], id=row['Title_tag'])[int(row['Title_item'])]
            elif row['Title_attribute'] == 'class':
                if len(job_elem.find_all(row['Title_name'], class_=row['Title_tag'])) > 0:
                    title_elem = job_elem.find_all(row['Title_name'], class_=row['Title_tag'])[int(row['Title_item'])]
            elif row['Title_attribute'] == 'none':
                title_elem = job_elem.find_all(row['Title_name'])[row['Title_item']]
            if row['Company_attribute'] == "id":
                company_elem = job_elem.find_all(row['Company_name'], id=row['Company_tag'])[int(row['Company_item'])]
            elif row['Company_attribute'] == 'class':
                if len(job_elem.find_all(row['Company_name'], class_=row['Company_tag'])) > 0:
                    company_elem = job_elem.find_all(row['Company_name'], class_=row['Company_tag'])[int(row['Company_item'])]
            if row['Location_attribute'] == "id":
                location_elem = job_elem.find_all(row['Location_name'], id=row['Location_tag'])[int(row['Location_item'])]
            elif row['Location_attribute'] == 'class':
                if len(job_elem.find_all(row['Location_name'], class_=row['Location_tag'])) > 0:
                    location_elem = job_elem.find_all(row['Location_name'], class_=row['Location_tag'])[int(row['Location_item'])]
            if row['URL_attribute'] == "id":
                URL = job_elem.find('a', id=row['URL_tag'])
            elif row['URL_attribute'] == 'class':
                URL = job_elem.find('a', class_=row['URL_tag'])
            elif row['URL_attribute'] == "none":
                URL = job_elem.find('a')
            elif row['URL_attribute'] == "self":
                URL = job_elem

            if not any((title_elem, company_elem, location_elem, URL)):
                count += 1
                continue

            URL = URL['href']

            if str.startswith(URL, '/'):
                URL = row['Prefix_URL'] + URL

            try:
                page_content = driver.get(URL)
                if row['Description_attribute'] == 'id':
                    content = driver.find_element_by_id(row['Description_tag'])
                elif row['Description_attribute'] == 'class':
                    content = driver.find_element_by_class_name(row['Description_tag'])
                if not content is None:
                    description_elem = content.text
            except KeyError:
                continue
            print('Source: ' + row['Site'])
            if not title_elem is None:
                print('Title: ' + unicodedata.normalize(normal_form, title_elem.text.strip()))
            else:
                print('Title: Not found')
            if not company_elem is None:
                print('Company: ' + unicodedata.normalize(normal_form, company_elem.text.strip()))
            else:
                print('Company: Not found')
            if not location_elem is None:
                print('Location: ' + unicodedata.normalize(normal_form, location_elem.text.strip()))
            else:
                print('Location: Not found')
            if not description_elem is None:
                print()
                print('Description')
                print('===========')
                print(description_elem)
                print()
            else:
                print('Description: Not found')
            print("Link: " + unicodedata.normalize(normal_form, URL))
            print('=======================================================================================================================================================')
            print()

driver.quit()
# Restore stdout
sys.stdout = stdout_obj
print("Finished successfully")

Finished successfully


Try to refactor the code into functions:

In [3]:
def find_all_by(row, upper_level_object, extraction_level):
    if row['{}_attribute'.format(extraction_level)] == "id":
        if len(upper_level_object.find_all(row['{}_name'.format(extraction_level)], id=row['{}_tag'.format(extraction_level)])) > 0:
            return upper_level_object.find_all(row['{}_name'.format(extraction_level)], id=row['{}_tag'.format(extraction_level)])[row['{}_item'.format(extraction_level)]]
    elif row['{}_attribute'.format(extraction_level)] == "class":
        if len(upper_level_object.find_all(row['{}_name'.format(extraction_level)], class_=row['{}_tag'.format(extraction_level)])) > 0:
            return upper_level_object.find_all(row['{}_name'.format(extraction_level)], class_=row['{}_tag'.format(extraction_level)])[row['{}_item'.format(extraction_level)]]
    elif row['{}_attribute'.format(extraction_level)] == "none":
        if len(upper_level_object.find_all(row['{}_name'.format(extraction_level)])) > 0:
            return upper_level_object.find_all(row['{}_name'.format(extraction_level)])[int(row['{}_item'.format(extraction_level)])]
    elif row['{}_attribute'.format(extraction_level)] == "self":
        return upper_level_object
    return None

In [4]:
def set_up_headless_driver():
    # Headless option meaning not show browser window
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")

    DRIVER_PATH = 'C:\\Users\\Adriel\\Downloads\\chromedriver_win32\\chromedriver'
#     driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    return webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

In [5]:
def print_format_unicode(heading, bs_object, report_file, normal_form="NFKC"):
    if not bs_object is None:
        report_file.write(heading + unicodedata.normalize(normal_form, bs_object.text.strip()) + '\n')
        return heading + unicodedata.normalize(normal_form, bs_object.text.strip()) + '\n'
    else:
        report_file.write(heading + 'Not found\n')
        return heading + 'Not found\n'

In [22]:
current_datetime = datetime.now()
driver = set_up_headless_driver()
description_list = []
count = 1

with open("JobReport_" + current_datetime.strftime("%Y%m%d_%H%M%S") + ".txt", "w", encoding = 'utf-8') as report_file:
    report_file.write("Job Report on " + current_datetime.strftime("%d/%m/%Y %H:%M:%S") + '\n')
    report_file.write("=================================\n")
    for index, row in job_ad_sites.iterrows():
        page = requests.get(row['Listing_URL'])
        soup = BeautifulSoup(page.content, 'html.parser')
        results = find_all_by(soup, 'Result')
        if row['Element_attribute'] == 'class':
            job_elems = results.find_all(row['Element_name'], class_=row['Element_tag'])
        elif row['Element_attribute'] == 'none':
            job_elems = results.find_all(row['Element_name'])
        
        for job_elem in job_elems:
            description_elem = None
            content = None
            title_elem = find_all_by(job_elem, 'Title')
            company_elem = find_all_by(job_elem, 'Company')
            location_elem = find_all_by(job_elem, 'Location')
            URL = find_all_by(job_elem, 'URL')

            if not any((title_elem, company_elem, location_elem, URL)):
                continue

            URL = URL['href']

            if str.startswith(URL, '/'):
                URL = row['Prefix_URL'] + URL

            try:
                page_content = driver.get(URL)
                if row['Description_attribute'] == 'id':
                    content = driver.find_element_by_id(row['Description_tag'])
                elif row['Description_attribute'] == 'class':
                    content = driver.find_element_by_class_name(row['Description_tag'])
                elif row['Description_attribute'] == 'xpath':
                    content = driver.find_element_by_xpath(row['Description_tag'])
                if not content is None:
                    description_elem = content.text
            except KeyError:
                continue
            report_file.write("Job number: " + str(count) + "\n")
            count += 1
            report_file.write('Source: ' + row['Site'] + '\n')
            print_format_unicode('Title: ', title_elem, report_file)
            print_format_unicode('Company: ', company_elem, report_file)
            print_format_unicode('Location: ', location_elem, report_file)
            if not description_elem is None:
                report_file.write('\n')
                report_file.write('Description\n')
                report_file.write('===========\n')
                report_file.write(description_elem)
                description_list.append(description_elem)
                report_file.write('\n')
            else:
                report_file.write('Description: Not found\n')
            report_file.write("Link: " + unicodedata.normalize("NFKC", URL) + '\n')
            report_file.write('=======================================================================================================================================================\n')
            report_file.write('\n')

driver.quit()
print("Finished successfully")


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".FYwKg _3gJU3_6 _1yPon_6"}
  (Session info: headless chrome=87.0.4280.88)


In [12]:
len(description_elem)

1716

## 2. Keyword extraction using TfIdf values
### 2.1 Vocabulary list and count (Tf values)
We now aim to extract the most used keyword considering all the job postings. This will hopefully give us some idea as to the most important skills associated with a certain role.

Credits to tutorial on [FreeCodeCamp](https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/).

First of all, we will aim to create a vocabulary list of all job listings of a certain date, together with the number of occurences in each listing, using the ```CountVectorizer``` of ```scikit-learn```.

In [6]:
import re
def pre_process(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters that are of no use
    # Mind the apostrophe ’ is not the same as the quote '
    text = re.sub('(\\d|[^+#.’\\w\'])+',' ', text)
    # Remove full stop (recognised by space after it or end of string)
    text = re.sub('\\.\s', ' ', text)
    text = re.sub('\\.$', ' ', text)
    text = re.sub('’','\'', text)
    return text

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stopwords_file_path):
    # load stop words
    with open(stopwords_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)
        
# load stopwords
stopwords = get_stop_words("Config\stopwords.txt")

In [8]:
count_vectorizer = CountVectorizer(max_df=0.85, stop_words = stopwords)

In [22]:
word_count_vector = count_vectorizer.fit_transform(description_list)

In [21]:
word_count_vector.shape

(67, 3935)

In [22]:
list(count_vectorizer.vocabulary_.keys())[:10]

['belong',
 'twilio',
 'who',
 'rapidly',
 'growing',
 'leader',
 'cloud',
 'communications',
 'software',
 'market']

We see that company names can mix into our vocabulary list, let's try adding the company name into our stop word list, calling it ```stopwords_temp.txt```.

In [8]:
# Set up temp stop word list with company names from current search
with open("stopwords_temp.txt", "w") as ft:
    f = open("stopwords.txt",'r')
    ft.write(f.read() + '\n')
    f.close()
current_datetime = datetime.now()
driver = set_up_headless_driver()
description_list = []
count = 1

with open("JobReport_" + current_datetime.strftime("%Y%m%d_%H%M%S") + ".txt", "w", encoding = 'utf-8') as report_file:
    report_file.write("Job Report on " + current_datetime.strftime("%d/%m/%Y %H:%M:%S") + '\n')
    report_file.write("=================================\n")
    for index, row in job_ad_sites.iterrows():
        page = requests.get(row['Listing_URL'])
        soup = BeautifulSoup(page.content, 'html.parser')
        results = find_all_by(row, soup, 'Result')
        if row['Element_attribute'] == 'class':
            if row['Element_item'] == 'sub':
                job_elems = results.find(row['Element_name'], class_=row['Element_tag']).find_all(row['Element_name'], recursive=False)
            else:
                job_elems = results.find_all(row['Element_name'], class_=row['Element_tag'])
        elif row['Element_attribute'] == 'none':
            job_elems = results.find_all(row['Element_name'])
        
        for job_elem in job_elems:
            description_elem = None
            content = None
            title_elem = find_all_by(row, job_elem, 'Title')
            company_elem = find_all_by(row, job_elem, 'Company')
            location_elem = find_all_by(row, job_elem, 'Location')
            URL = find_all_by(row, job_elem, 'URL')

            if not any((title_elem, company_elem, location_elem, URL)):
                continue

            URL = URL['href']

            if str.startswith(URL, '/'):
                URL = row['Prefix_URL'] + URL

            try:
                page_content = driver.get(URL)
                if row['Description_attribute'] == 'id':
                    content = driver.find_element_by_id(row['Description_tag'])
                elif row['Description_attribute'] == 'class':
                    content = driver.find_element_by_class_name(row['Description_tag'])
                elif row['Description_attribute'] == 'xpath':
                    content = driver.find_element_by_xpath(row['Description_tag'])
                if not content is None:
                    description_elem = content.text
            except KeyError:
                continue
            report_file.write("Job number: " + str(count) + "\n")
            count += 1
            report_file.write('Source: ' + row['Site'] + '\n')
            print_format_unicode('Title: ', title_elem, report_file)
            print_format_unicode('Company: ', company_elem, report_file)
            print_format_unicode('Location: ', location_elem, report_file)
            if not description_elem is None:
                report_file.write('\n')
                report_file.write('Description\n')
                report_file.write('===========\n')
                report_file.write(description_elem)
                description_list.append(pre_process(description_elem))
                report_file.write('\n')
            else:
                report_file.write('Description: Not found\n')
            report_file.write("Link: " + unicodedata.normalize("NFKC", URL) + '\n')
            report_file.write('=======================================================================================================================================================\n')
            report_file.write('\n')
            
            # Add company name into stopword list
            if not company_elem is None:
                with open("stopwords_temp.txt", "a") as ft:
                    ft.write(re.sub('(\s+|\n+)', '\n', re.sub('\s+\W+\s+', '\n', company_elem.text.strip())) + '\n')

driver.quit()
print("Finished successfully")


Finished successfully


In [9]:
# load stopwords
stopwords_new = get_stop_words("stopwords.txt")
count_vectorizer_new = CountVectorizer(max_df=0.85, stop_words = stopwords_new)
word_count_vector_new = count_vectorizer_new.fit_transform(description_list)



In [11]:
list(count_vectorizer_new.vocabulary_.keys())[:10]

['belong',
 'twilio',
 'who',
 'rapidly',
 'growing',
 'leader',
 'cloud',
 'communications',
 'software',
 'market']

In [12]:
word_count_vector_new.shape

(98, 4141)

### 2.2 Tf-Idf value calculation
#### 2.2.1 Calculate Idf values from the library
The Tf-Idf value will be the importance of each word in the text (in this case, the job descriptions).

Term frequency of **a document** (Tf) is how frequent each vocabulary has appeared in the selected text, whereas the inverse document frequency (Idf) of the **library of documents** is how frequent each vocabulary has appeared in the library  (in this case, job descriptions across different sites).

The Tf value should be directly proportional to the importance of each vocabulary, whereas the Idf value should be inversely proportional.

The Tf-Idf value is the product of the two values.

Now, we will use the ```TfIdf_Transformer``` in ```scikit-learn``` to obtain the Idf values of each word by fitting to the word count matrix we obtained in Section 2.1 .

In [38]:
tf_idf_vector1 = tfidf_transformer.transform(count_vectorizer.transform([description_all]))
sorted_items1 = sort_coo(tf_idf_vector1.tocoo())

# Extract only top 10 results
keywords1 = extract_top_n_from_vector(count_vectorizer.get_feature_names(), sorted_items1, 10)

In [39]:
keywords1

{'team': 0.186,
 'web': 0.128,
 'job': 0.123,
 'technology': 0.113,
 'data': 0.112,
 'working': 0.107,
 'end': 0.107,
 'product': 0.102,
 'company': 0.096,
 'design': 0.095}

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector_new)

TfidfTransformer()

In [13]:
tfidf_transformer.idf_

array([3.95491028, 1.95343028, 2.23214368, ..., 4.87120101, 4.87120101,
       4.87120101])

In [14]:
tfidf_transformer.idf_.shape

(4049,)

In [15]:
word_count_vector.shape

(95, 4049)

#### 2.2.2 Fitting to Tf value matrix
After that, we can use the TfIdf transformer to fit into *each* job listing (which is the selected text) to obtain the TfIdf value of each word. This will tell us the important words in each job listing.

We sort the words in the vector in **descending** order of TfIdf values.

In [13]:
feature_names = count_vectorizer_new.get_feature_names()

In [14]:
print(len(feature_names))

4184


In [17]:
test_doc = description_list[0]
for description in description_list:
    tf_idf_vector = tfidf_transformer.transform(count_vectorizer_new.transform([test_doc]))
    # Sort the TfIdf vector by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Extract only top 10 results
    keywords = extract_top_n_from_vector(feature_names, sorted_items, 10)

In [28]:
print(type(tf_idf_vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [28]:
print(sort_coo(tf_idf_vector.tocoo()))

[(2722, 0.3921116982223662), (1988, 0.25524316672679276), (3413, 0.25079581059176337), (3943, 0.22262785395658632), (1020, 0.17392571777476237), (1804, 0.14750135655592328), (3737, 0.1425711194708222), (770, 0.1425711194708222), (916, 0.14217620235678483), (2340, 0.13968491864324195), (596, 0.13729534882473024), (3801, 0.1307038994074554), (724, 0.1307038994074554), (3646, 0.12478893455795), (1712, 0.12284506537941935), (3960, 0.12228397277554406), (2954, 0.12228397277554406), (1559, 0.12228397277554406), (1900, 0.11575297028923275), (1706, 0.11278558170936821), (1337, 0.11041675271217724), (3509, 0.10590504503660662), (2889, 0.10590504503660662), (3633, 0.10491762417888391), (719, 0.10420395209193148), (3447, 0.10199682608026595), (2904, 0.10199682608026595), (2365, 0.10199682608026595), (3762, 0.0985495326488104), (182, 0.09331020729098502), (3820, 0.0901296060168991), (1534, 0.0901296060168991), (950, 0.0901296060168991), (736, 0.0901296060168991), (2915, 0.08778690276713834), (1274

In [10]:
def sort_coo(coo_vector):
    '''
    Sorts the coordinate vector in descending order of values.
    '''
    tuples = zip(coo_vector.col, coo_vector.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [11]:
def extract_top_n_from_vector(feature_names, sorted_items, topn=10):
    '''
    Get the feature names and TfIdf score of top n items
    '''
    # Extract top n items from vector
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    # Word index and corresponding TfIdf score
    for idx, score in sorted_items:
        # Keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
    # Create a dictionary of (feature, score)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    
    return results

In [41]:
# Sort the TfIdf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Extract only top 10 results
keywords = extract_top_n_from_vector(feature_names, sorted_items, 10)

# Print results
print("\n=====Doc=====")
print(test_doc)
print("\n===Keywords===")
for k in keywords:
    print(k, keywords[k])


=====Doc=====
description because you belong at twilio the who what why and where twilio is a rapidly growing leader in the cloud communications software market and we are looking for top tier proven sales account executives aes who are looking to grow their career in the fast growing cloud communications platform market this role reports to senior manager growth mid market based in singapore who as strategic mid market account executive you will be responsible for selling to mid market customers developing a relationship as a trusted advisor and deeply understanding their unique challenges and goals you will contribute to our business growth in a fast paced collaborative and fun atmosphere the right candidate will have a proven consultative sales process to discover and close new logos our aes develop an understanding of prospects' businesses organize and conduct sales presentations at prospective and current customers' offices and represent twilio in a consistent effective and profe

In [33]:
# Sort the TfIdf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Extract only top 10 results
keywords = extract_top_n_from_vector(count_vectorizer.get_feature_names(), sorted_items, 10)

# Print results
print(keywords)

{'portal': 0.392, 'job': 0.255, 'specified': 0.251, 'web': 0.223, 'develop': 0.174, 'implement': 0.148, 'traffic': 0.143, 'construction': 0.143, 'database': 0.142, 'mobile': 0.14}


We combine with the scraping code to produce a simplified report with 10 keywords of each job instead of whole description.

In [12]:
current_datetime = datetime.now()
driver = set_up_headless_driver()
stopwords = get_stop_words("Config\stopwords.txt")
count_vectorizer = CountVectorizer(max_df=0.85, stop_words = stopwords)
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
site_list = []
title_list = []
company_list = []
location_list = []
description_list = []
link_list = []
count = 1

with open("JobReport_" + current_datetime.strftime("%Y%m%d_%H%M%S") + ".txt", "w", encoding = 'utf-8') as report_file:
    # TODO Change job role to variable
    report_file.write("Software Developer Job Report on " + current_datetime.strftime("%d/%m/%Y %H:%M:%S") + '\n')
    report_file.write("=================================\n")
    
    for index, row in job_ad_sites.iterrows():
        page = requests.get(row['Listing_URL'])
        soup = BeautifulSoup(page.content, 'html.parser')
        results = find_all_by(row, soup, 'Result')
        if row['Element_attribute'] == 'class':
            if row['Element_item'] == 'sub':
                job_elems = results.find(row['Element_name'], class_=row['Element_tag']).find_all(row['Element_name'], recursive=False)
            else:
                job_elems = results.find_all(row['Element_name'], class_=row['Element_tag'])
        elif row['Element_attribute'] == 'none':
            job_elems = results.find_all(row['Element_name'])
        
        for job_elem in job_elems:
            description_elem = None
            content = None
            title_elem = find_all_by(row, job_elem, 'Title')
            company_elem = find_all_by(row, job_elem, 'Company')
            location_elem = find_all_by(row, job_elem, 'Location')
            URL = find_all_by(row, job_elem, 'URL')

            if not any((title_elem, company_elem, location_elem, URL)):
                continue

            URL = URL['href']

            if str.startswith(URL, '/'):
                URL = row['Prefix_URL'] + URL

            try:
                page_content = driver.get(URL)
                if row['Description_attribute'] == 'id':
                    content = driver.find_element_by_id(row['Description_tag'])
                elif row['Description_attribute'] == 'class':
                    content = driver.find_element_by_class_name(row['Description_tag'])
                elif row['Description_attribute'] == 'xpath':
                    content = driver.find_element_by_xpath(row['Description_tag'])
                if not content is None:
                    description_elem = content.text
            except KeyError:
                continue
            report_file.write("Job number: " + str(count) + "\n")
            count += 1
            
            report_file.write('Source: ' + row['Site'] + '\n')
            site_list.append('Source: ' + row['Site'] + '\n')
            title_list.append(print_format_unicode('Title: ', title_elem, report_file))
            company_list.append(print_format_unicode('Company: ', company_elem, report_file))
            location_list.append(print_format_unicode('Location: ', location_elem, report_file))
            
            if not description_elem is None:
                report_file.write('\n')
                report_file.write('Description\n')
                report_file.write('===========\n')
                report_file.write(description_elem)
                description_list.append(pre_process(description_elem))
                report_file.write('\n')
            else:
                report_file.write('Description: Not found\n')
            report_file.write("Link: " + unicodedata.normalize("NFKC", URL) + '\n')
            link_list.append("Link: " + unicodedata.normalize("NFKC", URL) + '\n')
            report_file.write('=======================================================================================================================================================\n')
            report_file.write('\n')

# Extract keywords of this job description
word_count_vector = count_vectorizer.fit_transform(description_list)
tfidf_transformer.fit(word_count_vector)

with open("JobKeyword_" + current_datetime.strftime("%Y%m%d_%H%M%S") + ".txt", "w", encoding = 'utf-8') as keyword_file:
    # TODO Change job role to variable
    keyword_file.write("Software Developer Job Report (Extract) on " + current_datetime.strftime("%d/%m/%Y %H:%M:%S") + '\n')
    keyword_file.write("=================================\n")
    
    for i in range(len(site_list)):
        keyword_file.write("Job number: " + str(i+1) + "\n")
        keyword_file.write(site_list[i])
        keyword_file.write(title_list[i])
        keyword_file.write(company_list[i])
        keyword_file.write(location_list[i])
        
        tf_idf_vector = tfidf_transformer.transform(count_vectorizer.transform([description_list[i]]))
        # Sort the TfIdf vector by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = extract_top_n_from_vector(count_vectorizer.get_feature_names(), sorted_items, 10)
        
        keyword_string = ""
        for k in keywords:
            keyword_string += k.title() + " "
        keyword_file.write(keyword_string + "\n")
        keyword_file.write(link_list[i])
        keyword_file.write('=======================================================================================================================================================\n')
        keyword_file.write('\n')
        
driver.quit()

print("Finished successfully")




Finished successfully
