<img src="img/logo.png">

# Chargement des librairies

In [1]:
import pandas as pd
import requests

# Récupération des données via l'API de Justjoin

In [2]:
http_address = 'https://justjoin.it/api/offers'

response = requests.get(http_address)
content = response.json()

In [3]:
print(f'Downloaded {len(content)} offers')

Downloaded 15389 offers


In [4]:
content[0]

{'title': '.NET Developer',
 'street': 'Centrum',
 'city': 'Bydgoszcz',
 'country_code': 'PL',
 'address_text': 'Centrum, Bydgoszcz',
 'marker_icon': 'net',
 'workplace_type': 'partly_remote',
 'company_name': 'Oponeo.pl',
 'company_url': 'https://www.oponeo.pl/',
 'company_size': '>200',
 'experience_level': 'mid',
 'latitude': '53.1234804',
 'longitude': '18.0084378',
 'published_at': '2023-03-28T23:00:10.846Z',
 'remote_interview': False,
 'open_to_hire_ukrainians': False,
 'id': 'oponeo-pl-net-developer-bydgoszcz',
 'display_offer': True,
 'employment_types': [{'type': 'b2b',
   'salary': {'from': 10000, 'to': 17000, 'currency': 'pln'}},
  {'type': 'permanent',
   'salary': {'from': 8000, 'to': 14000, 'currency': 'pln'}}],
 'company_logo_url': 'https://bucket.justjoin.it/offers/company_logos/thumb/b220d7875436b8b178a07eb2446062ad1486047c.jpg?1677623327',
 'skills': [{'name': 'ASP.NET', 'level': 3},
  {'name': '.Net', 'level': 3},
  {'name': 'C#', 'level': 3}],
 'remote': False,
 'm

# Transformation des données

In [5]:
# Conversion des monnaies en euros
pln = 0.213
usd = 0.923

# Préparation des données pour la création du DataFrame
records = []
offer_types = ['data']

for offer in content:
    if offer['marker_icon'] in offer_types:
        for employment_type in offer['employment_types']:
            if employment_type['type'] == 'b2b' and employment_type['salary'] is not None:
                if employment_type['salary']['currency'] == 'pln':
                    b2b_from = employment_type['salary']['from'] * pln
                    b2b_to = employment_type['salary']['to'] * pln
                elif employment_type['salary']['currency'] == 'usd':
                    b2b_from = employment_type['salary']['from'] * usd
                    b2b_to = employment_type['salary']['to'] * usd
                else:
                    b2b_from = employment_type['salary']['from']
                    b2b_to = employment_type['salary']['to']
            else:
                b2b_from = None
                b2b_to = None
                
            if employment_type['type'] == 'permanent' and employment_type['salary'] is not None:
                if employment_type['salary']['currency'] == 'pln':
                    permanent_from = employment_type['salary']['from'] * pln
                    permanent_to = employment_type['salary']['to'] * pln
                elif employment_type['salary']['currency'] == 'usd':
                    permanent_from = employment_type['salary']['from'] * usd
                    permanent_to = employment_type['salary']['to'] * usd
                else:
                    permanent_from = employment_type['salary']['from']
                    permanent_to = employment_type['salary']['to']
            else:
                permanent_from = None
                permanent_to = None
                
        for skill in offer['skills']:
            #print(offer['title'])
            records.append({'offer_type': offer['marker_icon'],
                           'title': offer['title'],
                           'id': offer['id'],
                           'b2b_from': b2b_from,
                           'b2b_to': b2b_to,
                           'skill_name': skill['name'],
                           'skill_level': skill['level'],
                           'permanent_from': permanent_from,
                           'permanent_to': permanent_to,
                           'experience_level': offer['experience_level']})
    

# Création du DataFrame

In [6]:
df = pd.DataFrame(records)
df.sample(5)

Unnamed: 0,offer_type,title,id,b2b_from,b2b_to,skill_name,skill_level,permanent_from,permanent_to,experience_level
749,data,Specjalista Event Stream Processing,devire-specjalista-event-stream-processing,5009.76,6441.12,SQL,3,,,mid
1550,data,(Senior) Data Engineer,limango-polska-data-engineer-katowice,3621.0,5538.0,ETL,4,,,senior
473,data,Data Consultant,netcompany-poland-data-consultant,,,Microsoft SQL,3,1917.0,2982.0,mid
144,data,Integration & Data Engineer,reply-integration-data-engineer-gliwice,,,REST API,3,1917.0,3408.0,mid
725,data,Python Data Scientist,transition-technologies-science-python-data-sc...,,,Data Science,3,1810.5,2662.5,mid


In [7]:
df['skill_name'].value_counts()

SQL              313
Python           241
ETL              101
AWS              101
Big Data          75
                ... 
Azure/AWS          1
TypeScript         1
SAS 4 GL           1
MDM                1
DWH/Data Mart      1
Name: skill_name, Length: 233, dtype: int64

In [8]:
# Affichage des 20 skills les plus demandés
top_skills = df.groupby(['skill_name'])['id'].nunique().sort_values(ascending=False).head(20).index
top_skills

Index(['SQL', 'Python', 'AWS', 'ETL', 'Big Data', 'T-SQL', 'Oracle',
       'Power BI', 'Data Science', 'Hadoop', 'PL/SQL', 'English', 'Azure',
       'Snowflake', 'Databases', 'Cloud', 'Spark', 'Machine Learning', 'Data',
       'Scala'],
      dtype='object', name='skill_name')

In [9]:
# Récupération des annonces contenant uniquement les 20 skills les plus demandés
df = df[df['skill_name'].isin(top_skills)]
df.sample(5)

Unnamed: 0,offer_type,title,id,b2b_from,b2b_to,skill_name,skill_level,permanent_from,permanent_to,experience_level
443,data,Pl/SQL Developer (MID),7n-pl-sql-developer-mid,4114.734,4651.707,PL/SQL,3,,,mid
631,data,Data Engineer,addepto-data-engineer-84907540-d2ee-4c5f-9a14-...,4294.08,6441.12,Python,4,,,senior
1260,data,BI Developer (OBIEE),onwelo-sp-z-o-o-bi-developer-obiee-gdansk,,,SQL,3,2023.5,2982.0,mid
1125,data,DATA System Engineer,payback-data-system-engineer-pruszkow,,,Hadoop,4,,,mid
2591,data,Data Quality Tester,idego-data-quality-tester-torun,4260.0,5325.0,Power BI,3,,,senior


# Pivot

In [10]:
# Pivotage des 'skill_name' en colonnes
df = df.pivot(index = df.columns.drop(['skill_name', 'skill_level']),
         columns = 'skill_name',
         values = 'skill_level').fillna(0).reset_index()
df.sample(5)

skill_name,offer_type,title,id,b2b_from,b2b_to,permanent_from,permanent_to,experience_level,AWS,Azure,...,Machine Learning,Oracle,PL/SQL,Power BI,Python,SQL,Scala,Snowflake,Spark,T-SQL
544,data,Mid MySQL Database Administration,cred-mid-mysql-database-administration-poznan,4899.0,7029.0,,,mid,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
499,data,Jr/Mid Data Engineer,awareson-sp-z-o-o-data-engineer-warszawa,,,1917.0,3834.0,mid,0.0,4.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
385,data,DataBase Application (Developer),aspire-global-database-application-developer,,,,,mid,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
250,data,Data Engineer (Cloud Engineer),peakdata-data-enginner-cloud-engineer-katowice,5544.0,7224.0,,,senior,4.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
647,data,SPECJALISTA DS. BUSINESS INTELLIGENCE,drosed-holding-specjalista-ds-business-intelli...,,,,,mid,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
print(f'\nLe DataFrame final contient {df.shape[0]} observations et {df.shape[1]} colonnes')


Le DataFrame final contient 859 observations et 28 colonnes


# Exportation du fichier

In [12]:
df.to_csv('justjoin.csv', index=False)