In [90]:
import os
import subprocess
from typing import Dict, Any, Callable
# from typing import 

from dotenv import load_dotenv
import json
import requests

import pandas as pd
import datetime

# API Response Structure
 "id": Unique Job ID,
 
 "url": Job link,
 
 "jobTitle": Job title,
 
 "companyName": Company name,
 
 "companyLogo": Company logo link,
 
 "jobIndustry": Job function (industry),
 
 "jobType": Job type (full-time, contract, part-time or internship)
 
 "jobGeo": Geographic restriction for employment (or Anywhere if not applicable), 
 
 "jobLevel": Seniority level (or Any if not applicable), 
 
 "jobExcerpt": Excerpt job description (max 55 characters), 
 
 "jobDescription": Full job description (HTML), 
 
 "pubDate": Publication date and time (UTC+00:00),
 
 "annualSalaryMin": Annual min salary (if applicable),
 
 "annualSalaryMax": Annual max salary (if applicable),
 
 "salaryCurrency": ISO 4217 salary currency code (if applicable)


In [91]:
def dashed(s: str) -> str:
    return "-".join([word.lower() for word in s.split()])    

def _get_url( 
    count: int,
    industry: str,
    geo: str, 
    tag: str,
) -> str:
    
    url = "https://jobicy.com/api/v2/remote-jobs?"
    url = url + "&count=" + str(count) if count is not None else url
    url = url + "&industry=" + dashed(industry) if industry is not None else url
    url = url + "&geo=" + dashed(geo) if geo is not None else url
    url = url + "&tag=" + dashed(tag) if tag is not None else url

    return url

def get_job_postings(
    count = None,
    industry = None,
    geo = None,
    tag = None,
):
    url = _get_url(count=count, industry=industry, geo=geo, tag=tag)
    print(url)
    response = requests.get(url)
    print(type(response))
    response.raise_for_status()
    json_response = json.loads(response.content)
    return json_response

In [92]:
response = get_job_postings()

https://jobicy.com/api/v2/remote-jobs?
<class 'requests.models.Response'>


In [93]:
response.keys()

dict_keys(['apiVersion', 'documentationUrl', 'friendlyNotice', 'jobCount', 'xRayHash', 'clientKey', 'lastUpdate', 'jobs'])

In [94]:
response['jobs'][0]

{'id': 113804,
 'url': 'https://jobicy.com/jobs/113804-telephone-account-manager-b2b-czechia',
 'jobSlug': '113804-telephone-account-manager-b2b-czechia',
 'jobTitle': 'Telephone Account Manager B2B Czechia',
 'companyName': 'Foneday',
 'companyLogo': 'https://jobicy.com/data/server-nyc0409/galaxy/mercury/2024/12/584d093b-221.jpeg',
 'jobIndustry': ['Sales'],
 'jobType': ['full-time'],
 'jobGeo': 'Czechia',
 'jobLevel': 'Entry-Level',
 'jobExcerpt': 'Are you ready to join a dynamic, fast-growing company where your contribution truly matters? Foneday is on a mission to revolutionize the smartphone repair parts industry by building strong, lasting relationships with our customers. We&#8217;re looking for an energetic and results-driven Telephone Account Manager B2B &#8211; Czechia [Remote] to join our team, focusing on the&#8230;',
 'jobDescription': '<p>Are you ready to join a dynamic, fast-growing company where your contribution truly matters? Foneday is on a mission to revolutionize t

# 2. Queries
## 2.1 Industry = "Data Science"

In [95]:
response = get_job_postings(industry = "Data Science")
len(response['jobs'])

https://jobicy.com/api/v2/remote-jobs?&industry=data-science
<class 'requests.models.Response'>


13

In [96]:
response.keys()

dict_keys(['apiVersion', 'documentationUrl', 'friendlyNotice', 'jobCount', 'xRayHash', 'clientKey', 'lastUpdate', 'jobs'])

In [97]:
response['jobCount']

13

In [98]:
response['jobs'][0]

{'id': 111366,
 'url': 'https://jobicy.com/jobs/111366-research-scientist-3',
 'jobSlug': '111366-research-scientist-3',
 'jobTitle': 'Research Scientist',
 'companyName': 'Pearson',
 'companyLogo': 'https://jobicy.com/data/server-nyc0409/galaxy/mercury/2021/09/c80907b6d4b64a0bd9295e304bf1ac8a.jpg',
 'jobIndustry': ['Data Science'],
 'jobType': ['full-time'],
 'jobGeo': 'USA',
 'jobLevel': 'Any',
 'jobExcerpt': 'The Research Scientist position described here is part of the Pearson Psychometric and Research Services group, which is part of the US School Assessment group and larger Assessment and Qualifications team. Pearson research scientists provide the full range of psychometric support to state and national customers, develop innovative techniques and processes, and conduct cutting-edge research&#8230;',
 'jobDescription': '<p>The Research Scientist position described here is part of the Pearson Psychometric and Research Services group, which is part of the US School Assessment grou

# 3. JSON to DataFrame transformation

In [99]:
# from pandas.io.json import json_normalize

response = get_job_postings(industry = "Data Science")
df = pd.json_normalize(response['jobs'])

df.head()

https://jobicy.com/api/v2/remote-jobs?&industry=data-science
<class 'requests.models.Response'>


Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency
0,111366,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000.0,140000.0,USD
1,74847,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000.0,220000.0,USD
2,111272,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382.0,209296.0,USD
3,112126,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,,,
4,94818,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100.0,206000.0,USD


# 4. Data Cleaning

# 4.1 Removing Null Values

In [13]:
df.isna().sum(axis=0)

id                 0
url                0
jobSlug            0
jobTitle           0
companyName        0
companyLogo        0
jobIndustry        0
jobType            0
jobGeo             0
jobLevel           0
jobExcerpt         0
jobDescription     0
pubDate            0
annualSalaryMin    6
annualSalaryMax    6
salaryCurrency     6
dtype: int64

In [15]:
df['salaryAvailable'] == df['annualSalaryMin'].isna()

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
dtype: bool

In [25]:
df = df.drop('salaryAvailable', axis=1)
df.head(5)

KeyError: "['salaryAvailable'] not found in axis"

In [28]:
df['salaryAvailable'] = (df.loc[:, 'annualSalaryMin'].isna() + df.loc[:, 'annualSalaryMax'].isna()) == 0
df.head()

Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency,salaryAvailable
0,111366.0,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000.0,140000.0,USD,True
1,74847.0,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000.0,220000.0,USD,True
2,111272.0,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382.0,209296.0,USD,True
3,112126.0,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,,,,False
4,94818.0,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100.0,206000.0,USD,True


In [33]:
df.loc[:, ['annualSalaryMin','annualSalaryMax']] = df.loc[:, ['annualSalaryMin','annualSalaryMax']].fillna(value=0, axis=0)
df.head()

Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency,salaryAvailable
0,111366.0,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000,140000,USD,True
1,74847.0,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000,220000,USD,True
2,111272.0,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382,209296,USD,True
3,112126.0,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,0,0,,False
4,94818.0,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,[Data Science],[full-time],USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100,206000,USD,True


In [107]:
# Full remove null values code
df['salaryMissing'] = (df.loc[:, 'annualSalaryMin'].isna() + df.loc[:, 'annualSalaryMax'].isna()) > 0
df.loc[:, ['annualSalaryMin','annualSalaryMax']] = df.loc[:, ['annualSalaryMin','annualSalaryMax']].fillna(value=0, axis=0)

# Typing
numeric_cols = ['annualSalaryMin', 'annualSalaryMax', 'salaryMissing']
for col in df.columns:
    if col in numeric_cols: continue
    df[col] = df[col].copy().astype('string')        
    
df['annualSalaryMin'] = df['annualSalaryMin'].copy().astype(int)
df['annualSalaryMax'] = df['annualSalaryMax'].copy().astype(int)
df['salaryMissing'] = df['salaryMissing'].copy().astype(bool)

In [103]:
df["jobTitle"] = df["jobTitle"].copy().astype(str)

In [108]:
df.dtypes

id                 string[python]
url                string[python]
jobSlug            string[python]
jobTitle           string[python]
companyName        string[python]
companyLogo        string[python]
jobIndustry        string[python]
jobType            string[python]
jobGeo             string[python]
jobLevel           string[python]
jobExcerpt         string[python]
jobDescription     string[python]
pubDate            string[python]
annualSalaryMin             int64
annualSalaryMax             int64
salaryCurrency     string[python]
salaryMissing                bool
dtype: object

In [106]:
df["jobTitle"].astype(str)

0                              Research Scientist
1       Research Scientist (Voice AI Foundations)
2                                  Data Scientist
3                         Data Scientist - Growth
4                           Senior Data Scientist
5           Senior Data Scientist (Psychometrics)
6                           Data Platform Manager
7                       Sr. Data Scientist - Core
8                           Senior Data Scientist
9                                  Data Scientist
10           Associate Enterprise Data Specialist
11                           Senior Data Engineer
12    Applied Scientist Intern, PhD - Winter 2025
Name: jobTitle, dtype: object

In [61]:
df['annualSalaryMin']

0     115000
1     150000
2     101382
3          0
4     175100
5          0
6     175000
7     123470
8     158000
9          0
10         0
11         0
12         0
Name: annualSalaryMin, dtype: object

In [62]:
df['annualSalaryMin'].index

RangeIndex(start=0, stop=13, step=1)

In [64]:
df.dtypes

id                  int64
url                object
jobSlug            object
jobTitle           object
companyName        object
companyLogo        object
jobIndustry        object
jobType            object
jobGeo             object
jobLevel           object
jobExcerpt         object
jobDescription     object
pubDate            object
annualSalaryMin    object
annualSalaryMax    object
salaryCurrency     object
salaryMissing        bool
dtype: object

In [65]:
df['annualSalaryMin'] = df['annualSalaryMin'].copy().astype(int)
df.dtypes

id                  int64
url                object
jobSlug            object
jobTitle           object
companyName        object
companyLogo        object
jobIndustry        object
jobType            object
jobGeo             object
jobLevel           object
jobExcerpt         object
jobDescription     object
pubDate            object
annualSalaryMin     int64
annualSalaryMax    object
salaryCurrency     object
salaryMissing        bool
dtype: object

## 4.1 Single element lists to items

In [9]:
sum([len(l) > 1 for l in df["jobIndustry"]])

0

In [10]:
df["jobIndustry"] = [l[0] for l in df["jobIndustry"]]

In [11]:
df.head()

Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency
0,111366,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,[full-time],USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000.0,140000.0,USD
1,74847,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,[full-time],USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000.0,220000.0,USD
2,111272,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,[full-time],USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382.0,209296.0,USD
3,112126,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,[full-time],"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,,,
4,94818,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,[full-time],USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100.0,206000.0,USD


In [12]:
sum([len(l) > 1 for l in df["jobType"]])

0

In [13]:
df["jobType"] = [l[0] for l in df["jobType"]]

In [14]:
df.head()

Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency
0,111366,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000.0,140000.0,USD
1,74847,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000.0,220000.0,USD
2,111272,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382.0,209296.0,USD
3,112126,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,,,
4,94818,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100.0,206000.0,USD


## 4.2 Categorical to dummy variables

In [16]:
df["loadDate"] = datetime.date.today()
df.head()

Unnamed: 0,id,url,jobSlug,jobTitle,companyName,companyLogo,jobIndustry,jobType,jobGeo,jobLevel,jobExcerpt,jobDescription,pubDate,annualSalaryMin,annualSalaryMax,salaryCurrency,loadDate
0,111366,https://jobicy.com/jobs/111366-research-scient...,111366-research-scientist-3,Research Scientist,Pearson,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,The Research Scientist position described here...,<p>The Research Scientist position described h...,2024-12-08 04:38:17,115000.0,140000.0,USD,2024-12-10
1,74847,https://jobicy.com/jobs/74847-research-scienti...,74847-research-scientist-2,Research Scientist (Voice AI Foundations),Deepgram,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,Despite the proliferation of text-based commun...,<p>Despite the proliferation of text-based com...,2024-12-07 04:03:02,150000.0,220000.0,USD,2024-12-10
2,111272,https://jobicy.com/jobs/111272-data-scientist-4,111272-data-scientist-4,Data Scientist,Pinterest,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Any,As a Data Scientist you will shape the future ...,<p>As a Data Scientist you will shape the futu...,2024-12-04 04:06:22,101382.0,209296.0,USD,2024-12-10
3,112126,https://jobicy.com/jobs/112126-data-scientist-...,112126-data-scientist-growth,Data Scientist - Growth,Eight Sleep,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,"LATAM, Canada, Europe, UK, USA",Any,Eight Sleep is the world’s first sleep fitness...,<p>Eight Sleep is the world’s first sleep fitn...,2024-12-04 03:48:24,,,,2024-12-10
4,94818,https://jobicy.com/jobs/94818-senior-data-scie...,94818-senior-data-scientist-10,Senior Data Scientist,Coinbase,https://jobicy.com/data/server-nyc0409/galaxy/...,Data Science,full-time,USA,Senior,Ready to be pushed beyond what you think you’r...,<p>Ready to be pushed beyond what you think yo...,2024-12-02 04:47:39,175100.0,206000.0,USD,2024-12-10
