# Importing Packages

In [1]:
from bs4 import BeautifulSoup

In [2]:
import requests

In [3]:
from selenium import webdriver

In [4]:
import statistics

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\The
[nltk_data]     Creator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import time

In [7]:
import math

# Defining functions

In [8]:
#scraping the website using Selenium web driver and BeautifulSoup for the total number of pages of job postings for the specified title
def getTotalpages(jobTitle):
    """Return how many result pages kariyer.net reports for a job title.

    The search page is opened in Firefox through Selenium, its HTML is parsed
    with BeautifulSoup and the 'aria-setsize' attribute of the first button
    found inside the pagination list is returned.

    Parameters
    ----------
    jobTitle : str
        Search keyword; appended unchanged to the 'kw' query parameter.

    Returns
    -------
    int
        The page count read from the pagination list; 1 when the page has no
        pagination list at all; 0 when the list exists but holds no button.
    """
    url = 'https://www.kariyer.net/is-ilanlari?kw=' + jobTitle #full URL of the search page
    #NOTE(review): 'executable_path' is deprecated in Selenium 4 in favour of a Service object - confirm the installed version
    driver = webdriver.Firefox(executable_path = r'./geckodriver.exe')
    try:
        driver.get(url)
        page = driver.page_source
    finally:
        #always end the session, even when loading the page fails, so no browser is left running
        driver.quit()
    soup = BeautifulSoup(page, 'html.parser')
    pagination = soup.find('ul', {'aria-label': 'Pagination'})
    if pagination is None:
        #presumably the site omits the list when everything fits on one page - TODO confirm;
        #falling back to a single page lets the caller still scan the unpaginated result page
        return 1
    for item in pagination.find_all('li', {'role': 'presentation'}):
        button = item.find('button')
        if button is not None:
            #the total number of pages is stored in the 'aria-setsize' attribute of the button
            return int(button['aria-setsize'])
    return 0

In [9]:
#extracting URLs of job postings for the specified position
def jobUrls_kariyer(jobTitle,totalPages):
    """Collect the URLs of the job postings listed on every result page.

    Parameters
    ----------
    jobTitle : str
        Search keyword; appended unchanged to the 'kw' query parameter.
    totalPages : int
        Number of result pages to visit. With exactly one page the search URL
        is requested without the '&cp=' page parameter.

    Returns
    -------
    list of str
        One URL per 'k-ad-card rounded' anchor, accumulated over all pages.
        Empty when totalPages is smaller than 1 or no anchor is found.
    """
    searchUrl = 'https://www.kariyer.net/is-ilanlari?kw=' + jobTitle
    if totalPages == 1:
        pageUrls = [searchUrl]
    else:
        pageUrls = [searchUrl + '&cp=' + str(pNmbr) for pNmbr in range(1,totalPages + 1)]

    jListingsRefs = []
    if not pageUrls:
        #nothing to visit, so do not launch a browser at all
        return jListingsRefs

    #one browser session is reused for all pages and is always shut down afterwards
    driver = webdriver.Firefox(executable_path = r'./geckodriver.exe')
    try:
        for url in pageUrls:
            #waiting 1 second before each request in order to avoid slowing down the website
            time.sleep(1)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            #anchors with the class 'k-ad-card rounded' hold the job listings
            jListings = soup.find_all('a', class_ = 'k-ad-card rounded')
            #extend instead of assign so that earlier pages are not overwritten by the last one
            jListingsRefs.extend('https://www.kariyer.net/' + ref['href'] for ref in jListings)
    finally:
        driver.quit()
    return jListingsRefs

In [10]:
#collecting all job descriptions from each URL of the job postings that were previously extracted
def jobDescriptions(listings):
    """Download every posting and return the text of its description blocks.

    Each URL is fetched with requests and parsed with BeautifulSoup. The text
    of every 'div' with class 'genel-nitelikler' is collected after the fixed
    section headings (Turkish and English variants) are removed from it.

    Parameters
    ----------
    listings : iterable of str
        URLs of the job postings.

    Returns
    -------
    list of str
        One entry per description block found, in visiting order.
    """
    sectionHeadings = ('GENEL NİTELİKLER VE İŞ TANIMI', 'QUALIFICATIONS AND JOB DESCRIPTION')
    descriptions = []
    for url in listings:
        jPage = requests.get(url)
        jSoup = BeautifulSoup(jPage.content, 'html.parser')
        for section in jSoup.find_all('div' , attrs={'class' : 'genel-nitelikler' }):
            #the headings are removed from the extracted text: the parsed element itself is not a string
            text = section.get_text()
            for heading in sectionHeadings:
                text = text.replace(heading, '')
            descriptions.append(text)
        time.sleep(1) #pause between postings to keep the load on the website low
    return descriptions

In [11]:
#removing digits and every other character that is not a lowercase Latin/Turkish letter
def removeDigits(word):
    """Return word reduced to its lowercase Latin/Turkish letters.

    Every character that is not part of the allowed alphabet below (digits,
    punctuation, uppercase letters, ...) is dropped; the order of the
    remaining characters is kept.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyzğıöşçüâ'
    kept = [char for char in word if char in allowed]
    return ''.join(kept)

In [12]:
#pre-processing and cleaning the data
def preprocess(jobDescriptions):
    """Turn the job descriptions into one flat list of cleaned tokens.

    All descriptions are joined into a single text, a fixed set of special
    characters is replaced by spaces, and the text is split on whitespace.
    Each token is lowercased and passed through removeDigits; empty results
    and stop words are dropped.

    Stop words come from four sources: nltk's Turkish and English lists, the
    local file './turkce-stop-words.txt' (one word per line) and a hand-made
    list of noise words.

    Parameters
    ----------
    jobDescriptions : iterable of str
        Raw description texts.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates kept).
    """
    nstopWords = ['veveya','vb','vs','yıl','e','bana', 'ar', 'ge', 'x','arıyoruz', '', 'ten', 'com', 'ş', 'ch', 'abl', 'mak', 'fon', 'hakkında', 'bilgi', 'sahibi', 'konusunda '] #noise words
    with open('./turkce-stop-words.txt','r', encoding='utf-8') as trstopwordsFile :
#importing turkish stopwords from Ahmet Aksoy's trstop project(link: https://github.com/ahmetax/trstop/blob/master/dosyalar/turkce-stop-words)
        trstopwords = trstopwordsFile.read().split('\n')

#grouping all stopwords in a set: duplicates disappear and each lookup is a hash lookup instead of a list scan
#entries are stripped because tokens never contain whitespace, so an entry such as 'konusunda ' could otherwise never match
    stopWords = set()
    for group in (stopwords.words('turkish'), nstopWords, stopwords.words('english'), trstopwords):
        stopWords.update(word.strip() for word in group)

#joining all job descriptions into one text in order to be tokenized later on
    oneTxt = ' '.join(jobDescriptions).strip()
#replacing special characters with white spaces because some suffixes might be joined with words
    oneTxt = oneTxt.translate(str.maketrans({char: ' ' for char in ',/:()[]{}#*-+.&;'}))

    njd = []
#tokenizing the text(nltk tokenize could have been used but wouldn't have made much difference in this case scenario)
    for token in oneTxt.split():
        token = removeDigits(token.lower()) #keeping only lowercase letters
#dropping stopwords/noise and the empty strings left when a token had no letters
        if token != '' and token not in stopWords:
            njd.append(token)
    return njd

In [13]:
#counting the occurrences of each word in the set of job descriptions
def occurrences(tokens):
    """Count how often each token appears.

    Parameters
    ----------
    tokens : iterable of hashable
        Tokens (or ngrams) to count.

    Returns
    -------
    dict
        Maps each distinct token to its number of occurrences; keys appear in
        order of first occurrence.
    """
    freq = {}
    for token in tokens:
        #single pass: counting with tokens.count(token) here would rescan the whole list for every token
        freq[token] = freq.get(token, 0) + 1
    return freq

In [14]:
#calculating the term frequency–inverse document frequency score for each term present in the job descriptions
def tfidf(term,jds):
    """Return the tf-idf score of term summed over all documents.

    For every document the term frequency is jd.count(term) / len(jd); the
    inverse document frequency is log(len(jds) / (1 + df)), where df is the
    number of documents containing the term. The result is the sum of
    tf * idf over the documents, so it is negative when the term appears in
    every document.

    Parameters
    ----------
    term : str
        Term (word or ngram) to score.
    jds : sequence
        Documents; each must support count(), len() and 'in'. The notebook
        passes the raw description strings, for which count() counts
        substrings and len() is the length in characters.
        NOTE(review): terms come from the cleaned, lowercased tokens while the
        documents are raw text, so capitalised words and ngrams that spanned
        a removed stop word score 0 - confirm this is intended.

    Returns
    -------
    float
        The summed score; 0.0 when jds is empty.
    """
    if not jds:
        #no documents: the idf would be log(0)
        return 0.0
    #an empty document contributes a term frequency of 0 instead of dividing by zero
    tfs = [jd.count(term) / len(jd) if len(jd) else 0.0 for jd in jds] #list of term frequency values in each job description
    df = sum(term in jd for jd in jds) #document frequency of the term
    idf = math.log(len(jds) / (1 + df)) #inverse document frequency of the term
    return sum([tf * idf for tf in tfs])

In [15]:
#calculating the threshold value and filtering out terms with a tf-idf score lower than the threshold (scores equal to it are kept)
def thresholdFilter(scores):
    """Remove, in place, every entry whose score is below the cut-off.

    The cut-off is the mean plus two sample standard deviations of the
    distinct score values (the multiplier was chosen by experimenting with
    different values). Entries equal to the cut-off are kept.

    Parameters
    ----------
    scores : dict
        Maps a term to its numeric score; modified by this call.

    Returns
    -------
    dict
        The same dict object that was passed in.

    Raises
    ------
    statistics.StatisticsError
        When scores holds fewer than two distinct values.
    """
    distinctScores = set(scores.values())
    cutoff = statistics.mean(distinctScores) + 2*statistics.stdev(distinctScores)
    #collect first, delete afterwards: a dict must not change size while it is iterated
    belowCutoff = [term for term, score in scores.items() if score < cutoff]
    for term in belowCutoff:
        del scores[term]
    return scores

In [16]:
#generating ngrams
def ngrams(tokonizedJdescs):
    """Build occurrence counts for 1-grams, 2-grams, ... of the token list.

    n starts at 1 and grows until no ngram of that length repeats. That last
    level, in which every ngram occurs exactly once, is not included.

    Parameters
    ----------
    tokonizedJdescs : list of str
        Cleaned tokens in text order.

    Returns
    -------
    dict
        Maps '<n>grams' (for example '1grams') to a dict of
        {ngram joined by single spaces: occurrence count}. Empty when the
        token list is empty or no token repeats.
    """
    totNgrams = {}
    if not tokonizedJdescs:
        #no tokens means no ngrams; the uniqueness check below would divide by zero
        return totNgrams
    n = 1
    while True:
        grams = [' '.join(tokonizedJdescs[i:i+n]) for i in range(len(tokonizedJdescs) - n + 1)]
        counts = occurrences(grams) #counted once per level and reused for the stop test
        if len(grams) == len(counts):
            #every ngram of this length is unique: stop without storing the level
            break
        totNgrams[str(n)+'grams'] = counts
        n += 1
    return totNgrams

In [89]:
#lemmatizer
def verbLemmatizer(tokonizedJdescs,JobDescriptions):
    """Score the verbs used in the job descriptions by their infinitive form.

    Steps:
    1. Read the TDK word list and keep the words ending in 'mek'/'mak' that
       are not in the list of known non-verbs; map each root (the word
       without its last three letters) to its infinitive.
    2. Keep the tokens that contain at least one of the verbal suffixes read
       from './TR_verbal_suffixes.txt'.
    3. For each such token take the longest dictionary root it starts with
       and add the token's tf-idf score to that root's infinitive.
    4. Sort by score, drop zero scores and apply thresholdFilter.

    Parameters
    ----------
    tokonizedJdescs : iterable of str
        Cleaned tokens of the job descriptions.
    JobDescriptions : sequence
        Documents handed to tfidf for scoring.

    Returns
    -------
    dict
        {infinitive: summed tf-idf score}, highest score first. When fewer
        than two distinct scores remain no threshold can be computed and the
        scores are returned unfiltered.
    """
#words that end with 'mak' or 'mek' without being verbs (exceptions collected by hand)
    nonVerbs = {'kaymak','ekmek','emek','yemek','ahmak', 'yamak', 'sümek', 'hamak', 'somak', 'temek', 'pomak', 'mamak', 'sumak', 'ramak', 'kuymak', 'yumak'}
#importing TDK's(Turkish Language Association) dictionary words from Necmettin Çarkacı's TDKDictionaryCrawler project(link: https://github.com/ncarkaci/TDKDictionaryCrawler)
    with open('./TDK_Sözlük_Kelime_Listesi.txt', 'r', encoding='utf-8') as tdkDic:
        tdkWords = tdkDic.read().split()
#mapping every verb root directly to its infinitive: two parallel lists built from sets do not share an order,
#so looking an infinitive up by the position of its root would return an unrelated verb
    rootToVerb = {}
    for word in tdkWords:
        verb = word.lower()
        if verb.endswith(('mek','mak')) and verb not in nonVerbs:
            root = verb[:-3]
            if root: #an empty root would be a prefix of every word
                rootToVerb.setdefault(root, verb)
#importing an AI generated alphabetically sorted list of Turkish verbal suffixes
    with open('./TR_verbal_suffixes.txt','r', encoding='utf-8') as suffixesFile:
        suffixes = set(suffixesFile.read().split())
#identifying potential verbs: distinct tokens (in first-seen order) that contain a verbal suffix
    potentialVerbs = [word for word in dict.fromkeys(tokonizedJdescs) if any(suffix in word for suffix in suffixes)]
#conjugated verbs in the Turkish language always have suffixes added after the verb root,
#so the root is a prefix of the word; the longest known root is taken as the most specific match
    verbs = {}
    for word in potentialVerbs:
        infinitive = None
        for end in range(len(word), 0, -1):
            infinitive = rootToVerb.get(word[:end])
            if infinitive is not None:
                break
        if infinitive is None:
            continue
        verbs[infinitive] = verbs.get(infinitive, 0) + tfidf(word,JobDescriptions)
#sorting verbs based on their tf-idf scores
    verbs = dict(sorted(verbs.items(), key = lambda x:x[1], reverse = True))
    verbs = {key:val for key,val in verbs.items() if val} #filtering out verbs with a score of 0.0
    if len(set(verbs.values())) < 2:
        #mean and standard deviation need at least two distinct scores
        return verbs
    return thresholdFilter(verbs) #filtering out verbs that are below the threshold

# Main program

In [50]:
#job listings urls 
title = 'grafik tasarımcı' #search keyword handed to both scraping helpers
pages = getTotalpages(title) #number of result pages reported for the keyword
jobListings = jobUrls_kariyer(title,pages) #URLs of the job postings
if not jobListings:
    #Turkish for "the job posting you searched for could not be found"; only a message, the following cells still run
    print('Aradiginiz is ilani bulunamadi')

In [51]:
jobDescs = jobDescriptions(jobListings) #raw description texts of the postings (fetches every URL with a 1 second pause in between)

In [52]:
tokonizedcleanJdescs = preprocess(jobDescs) #flat list of cleaned tokens taken from all job descriptions joined together

In [53]:
allNgrams = ngrams(tokonizedcleanJdescs) #{'<n>grams': {ngram: occurrence count}} for n = 1 up to the last n that still has a repeated ngram

In [54]:
ngramsASkeys = list(allNgrams.keys()) #copy of the level names so the following cells can modify allNgrams while looping over them

In [55]:
#replacing the occurrence count of every ngram with its tf-idf score over the job descriptions
for ngram in ngramsASkeys:
    allNgrams[ngram] = {key: tfidf(key,jobDescs) for key in allNgrams[ngram]}

In [56]:
#a threshold needs at least two distinct scores: levels with fewer are dropped, the others are filtered
for ngram in ngramsASkeys:
    distinctScores = set(allNgrams[ngram].values())
    if len(distinctScores) >= 2:
        allNgrams[ngram] = thresholdFilter(allNgrams[ngram])
    else:
        del allNgrams[ngram]
#keeping only the levels that still hold at least one ngram above the threshold score
allNgrams = {key:val for key,val in allNgrams.items() if val}

In [57]:
ngramsASkeys = list(allNgrams.keys()) #refreshed level names: only the levels that survived the threshold filtering are left

In [58]:
#levels are selected by their name ('1grams', '2grams', ...) and not by their position:
#a level can be removed by the threshold filtering, which would shift positions or leave too few levels
keywords = dict(sorted(allNgrams.get('1grams', {}).items(), key = lambda x:x[1], reverse = True)) #sorted unigrams
keyterms = dict(sorted(allNgrams.get('2grams', {}).items(), key = lambda x:x[1], reverse = True)) #sorted bigrams
multigrams = [dic for level, dic in allNgrams.items() if level not in ('1grams', '2grams')] #the rest of the ngrams
keyphrases = {}
for dic in multigrams:
    keyphrases.update(dic)
keyphrases = dict(sorted(keyphrases.items(), key = lambda x:x[1], reverse = True)) #sorted phrases

In [27]:
#verbs found in the job descriptions, reduced to their infinitive form and mapped to their summed tf-idf score
keyverbs = verbLemmatizer(tokonizedcleanJdescs,jobDescs) 

# Program output

In [None]:
#single words (unigrams) with their scores, highest first
keywords

In [None]:
#two-word terms (bigrams) with their scores, highest first
keyterms

In [None]:
#phrases of three or more words with their scores, highest first
keyphrases

In [None]:
#infinitive verbs with their scores, as returned by verbLemmatizer
keyverbs