In [1]:
import pickle
import pandas as pd
from bs4 import BeautifulSoup as bs
import re

In [2]:
with open('jobs_linkedin_loads.pickle', 'rb') as handle:
    jobs = pickle.load(handle)

len(jobs)

12777

In [3]:
jobs['2912844894']

{'id': '2912844894',
 'description': '<div class="description__text description__text--rich">\n<section class="show-more-less-html" data-max-lines="5">\n<div class="show-more-less-html__markup show-more-less-html__markup--clamp-after-5">\n        Project Canary is a growth-stage, SaaS and certification company combatting climate-change from an incredible vantage point that can impact oil &amp; gas, utilities, landfills, and ag. We are a Public Benefit Corp (B-Corp rating score 107) that helps monitor and mitigate emissions in the ESG landscape favored by communities and investors alike through independent data tied to carbon and environmental footprints. With flexible work environments in Denver,<br/><br/>Project Canary’s mission is to make net-zero a reality by quantifying climate change and putting actionable insights into the hands of the energy sector. Our diverse and inclusive team of operators, scientists, engineers, and sales leaders know how to network, hustle, and are change-m

In [4]:
results = {}

for key, value in jobs.items():
    info = value['companyInfo'].split('|')
    elements_count = len(info)

    if not elements_count in results:
        results[elements_count] = 1
    else:
        results[elements_count] += 1

results

{2: 12762, 3: 12, 4: 2, 1: 1}

In [5]:
def experience_level(job):
    title = str.lower(job['title'])
    if any(substring in title for substring in ["senior", "lead", "principal"]):
        return "senior"
    if any(substring in title for substring in ["junior", "intern", "grad"]):
        return "junior"

    description = str.lower(job['description'])
    if bool(re.search('([5678]\+ years|[5678] years)', description)):
        return "senior"

    return "mid"

In [6]:
experience_level({"title": "Senior Data Scientist"})
experience_level({"title": "Data Scientist - Intern"})
experience_level({"title": "Data Scientist", "description": "blah blah 6+ years experience"})

'senior'

In [7]:
def is_relevant(job):
    title_pattern = "(?:data scientist|data science|machine learning)"
    return bool(re.search(title_pattern, job['title'], re.IGNORECASE))

In [8]:
is_relevant({"title": "Senior Data Scientist"})
is_relevant({"title": "Software - Intern"})
is_relevant({"title": "Engineer Machine Learning", "description": "blah blah 7 years experience"})

True

In [9]:
pattern_start = "[Rr]esponsibilities|[Ww]hat [Yy]ou['’]ll|[Ww]hat [Yy]ou [Ww]ill|[Dd]uties|[Tt]he [Rr]ole.{0,10}\||[Oo]verview|[Ww]ork.{0,10}\|"

pattern_end = "Requirements|[Qq]ualifications|Skills.{0,10}\||[Ll]ooking [Ff]or.{0,5}\||[Yy]ou [Hh]ave:"

pattern = f"(?:{pattern_start})(.*?)(?:{pattern_end})"

In [10]:
jobs_enhanced = {}

In [11]:
counter = 0

for key, value in jobs.items():
    description = (value['description'])
    soup = bs(description)

    text = soup.get_text("|", strip=True)
    match = re.findall(pattern, text)

    soup.button.decompose()
    soup.button.decompose()

    if is_relevant(value) and match and len(match[0]) > 130:
        company_info = value['companyInfo'].split('|')
        company_name = company_info[0] if len(company_info) > 0 else ""
        location = company_info[1] if len(company_info) > 1 else ""

        responsibilities = match[0].replace("|", " ").replace("\t", " ")

        jobs_enhanced[key] = {
            'responsibilities' : responsibilities,
            'title' : value['title'],
            'description': str(soup),
            'companyName': company_name,
            'location': location,
            'level': experience_level(value)
        }
        counter += 1


    # if (counter > 5):
    #     break

In [12]:
list(jobs_enhanced.items())[:2]

[('2908496770',
  {'responsibilities': ': • Design and implement ML methods on proprietary and open-access datasets; • Utilize large-scale datasets to generate statistically motivated research hypotheses; • Apply statistical methods to rigorously test and evaluate research hypotheses; • Develop and foster external collaborations; • Provide expert technical guidance and support customers in the design and analysis of experiments; • Work both independently and as part of a collaborative team to develop data analysis and machine learning solutions. ',
   'title': 'Data Scientist',
   'description': '<html><body><div class="description__text description__text--rich">\n<section class="show-more-less-html" data-max-lines="5">\n<div class="show-more-less-html__markup show-more-less-html__markup--clamp-after-5">\n        I am looking for a self-motivated data scientist with machine learning experience. You will be working on a ground-breaking cloud R&amp;D platform designed to integrate the vo

In [13]:
counter, print_counter = 0, 0

for key, value in jobs_enhanced.items():
    if counter%150 == 0 :
        print_counter +=1
        print(f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{key} (Count: {print_counter}. Index: {counter})")
        print(value['responsibilities'])
        print("\n")

    counter += 1


https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/2908496770 (Count: 1. Index: 0)
: • Design and implement ML methods on proprietary and open-access datasets; • Utilize large-scale datasets to generate statistically motivated research hypotheses; • Apply statistical methods to rigorously test and evaluate research hypotheses; • Develop and foster external collaborations; • Provide expert technical guidance and support customers in the design and analysis of experiments; • Work both independently and as part of a collaborative team to develop data analysis and machine learning solutions. 


https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/2890624722 (Count: 2. Index: 150)
 We’re looking for a smart, passionate, team-oriented Solutions Data Scientist to support our customers in translating climate forecasts into actionable insights that matter for their business.This person’s primary focus will be serving as the bridge between our customers and the data science team: underst

In [14]:
len(jobs_enhanced)

6344

In [15]:
# short_count = 0
#
# for key, value in jobs_enhanced.items():
#     if len(value) < 75:
#         short_count +=1
#
# short_count

In [16]:
df = pd.DataFrame.from_dict(jobs_enhanced, orient='index')
df.head()

Unnamed: 0,responsibilities,title,description,companyName,location,level
2908496770,: • Design and implement ML methods on proprie...,Data Scientist,"<html><body><div class=""description__text desc...",Data Revolution,San Francisco Bay Area,senior
2911267267,: • Provide consultative support as and when r...,Data Scientist,"<html><body><div class=""description__text desc...",TrueSkilla,United States,senior
2912844894,: Support Data and Analytics team through deve...,Data Scientist,"<html><body><div class=""description__text desc...",Project Canary,"Denver, CO",mid
2911205495,Build agent-based simulations of smart contra...,Data Scientist,"<html><body><div class=""description__text desc...",Gauntlet,United States,mid
2912480226,"Productionize, launch, and monitor predictive...",Data Scientist,"<html><body><div class=""description__text desc...",Miles,"Redwood City, CA",mid


In [17]:
df.shape

(6344, 6)

In [18]:
df2 = df.drop_duplicates()

In [19]:
df2.shape

(6193, 6)

In [20]:
df3 = df2.drop_duplicates(subset=['responsibilities'])

In [21]:
df3.shape

(3829, 6)

In [22]:
df3.to_pickle('big_job_df.pickle')

In [29]:
df3.describe()

Unnamed: 0,responsibilities,title,description,companyName,location,level
count,3829,3829,3829,3829,3829,3829
unique,3829,1877,3829,1804,573,3
top,: • Design and implement ML methods on proprie...,Data Scientist,"<html><body><div class=""description__text desc...",Amazon,"London, England, United Kingdom",mid
freq,1,843,1,185,241,1982


In [26]:
def cell_count_containing(series, term):
    count = series.str.contains(term, case=False).sum()
    percent = count / len(series) * 100
    return f"{term}: {count}  ({percent:.1f}%)"

In [27]:
terms = ['data', 'machine', 'machine learning', 'walmart', 'amazon', 'microsoft', 'jpmorgan', 'vaccine']

In [28]:
[cell_count_containing(df2.responsibilities, t) for t in terms]

['data: 5575  (90.0%)',
 'machine: 3396  (54.8%)',
 'machine learning: 3315  (53.5%)',
 'walmart: 109  (1.8%)',
 'amazon: 184  (3.0%)',
 'microsoft: 162  (2.6%)',
 'jpmorgan: 16  (0.3%)',
 'vaccine: 28  (0.5%)']

In [36]:
sum([len(d.split(' ')) for d in df3.responsibilities]) / len(df3.responsibilities)

197.9673544006268

In [34]:
len(df3.responsibilities)

3829

In [37]:
print(pattern)

(?:[Rr]esponsibilities|[Ww]hat [Yy]ou['’]ll|[Ww]hat [Yy]ou [Ww]ill|[Dd]uties|[Tt]he [Rr]ole.{0,10}\||[Oo]verview|[Ww]ork.{0,10}\|)(.*?)(?:Requirements|[Qq]ualifications|Skills.{0,10}\||[Ll]ooking [Ff]or.{0,5}\||[Yy]ou [Hh]ave:)
