In [None]:
import re
import csv
from jobspy import scrape_jobs
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import PhraseMatcher

from skillNer.general_params import SKILL_DB
# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor

from collections import Counter
import math

In [None]:

# Scrape recent "Data Engineer" postings around Cairo from Indeed and LinkedIn.
jobs = scrape_jobs(
    search_term="Data Engineer",
    location="Cairo, Egypt",
    site_name=["indeed", "linkedin"],
    country_indeed='Egypt',  # only consulted for indeed / glassdoor
    hours_old=72,  # hour-level filter on LinkedIn/Indeed only; other boards round up to whole days
    results_wanted=20,
    # Optional extras, currently disabled:
    # linkedin_fetch_description=True  # full description, direct job url, company industry and seniority level for LinkedIn (slower)
    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
print(f"Found {len(jobs)} jobs")

# Keep a CSV snapshot of the scrape (to_excel is the spreadsheet alternative).
jobs.to_csv(
    "jobs.csv",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
    escapechar="\\",
)

In [None]:
# Sanity check: print the description of the posting whose index label is 1.
print(jobs.loc[1, "description"])

In [None]:
def is_valid_description(description):
    """Return True when ``description`` is usable text for skill extraction.

    The value is later passed to ``skill_extractor.annotate``, so it must be
    an actual, non-blank string. Checking the type directly rejects every
    missing-value marker a DataFrame cell can hold (``None``, float NaN,
    ``pd.NA``) as well as any other non-text value, and stripping rejects
    empty or whitespace-only descriptions that carry nothing to annotate.

    Args:
        description: A cell value from the ``description`` column.

    Returns:
        bool: True for a string with at least one non-whitespace character,
        False otherwise.
    """
    return isinstance(description, str) and bool(description.strip())

In [None]:
# Load the spaCy model and hand it to SkillNer together with its skill
# database and spaCy's PhraseMatcher.
nlpp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlpp, SKILL_DB, PhraseMatcher)

# Running tally keyed by the lower-cased skill text.
skill_counter = Counter()

for description in jobs['description']:
    # Postings without a usable description contribute nothing.
    if is_valid_description(description):
        ngram_hits = skill_extractor.annotate(description)['results']['ngram_scored']
        for hit in ngram_hits:
            # Each hit adds its score rounded down to an integer, so a
            # fractional score below 1 adds 0 (the key is still created).
            skill_counter[hit['doc_node_value'].lower()] += math.floor(hit['score'])

skill_counter

In [None]:
# Show every skill with its accumulated count, highest count first.
skill_counter.most_common()

In [None]:




# Domain-specific filler terms (single words and short phrases) to ignore in
# addition to spaCy's built-in English stop words.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously merged "your" and
# "diagrams" into the single bogus entry "yourdiagrams".
custom_stopwords = {
    "knowledge", "comment", "degree", "experience", "good", "must", "solid",
    "understanding", "years", "working", "teams", "projects", "results",
    "responsibilities", "comments", "develop", "engineering", "systems", "tools",
    "development", "standards", "value", "cost", "generate", "needed", "documentation", "our", "your",
    "diagrams", "analyse", "business", "performance", "security", "methodology", "multidisciplinary areas", "all",
    "plus", "influence", "overall", "a must", "the standards", "all levels", "the", "why", "master", "customers",
    # NOTE(review): "a " carries a trailing space; kept as written - confirm it is intentional.
    "a ", "national", "veteran",
}

# Combined stop-word set: spaCy's defaults plus the custom terms above.
stopwords = STOP_WORDS.union(custom_stopwords)
