<img src="logo2.png">

# Chargement des librairies

In [1]:
import pandas as pd
import requests

# Récupération des données via l'API de Justjoin

In [2]:
# Offers endpoint of the justjoin.it API
http_address = 'https://justjoin.it/api/offers'

# A timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(http_address, timeout=30)
# Stop here on an HTTP 4xx/5xx answer instead of trying to parse an error page as JSON
response.raise_for_status()
content = response.json()

In [3]:
# Sanity check: report how many offers were downloaded
print(f'Downloaded {len(content)} offers')

Downloaded 15183 offers


In [4]:
# Inspect the first offer to see which fields an offer provides
content[0]

{'title': 'Architekt systemów informatycznych',
 'street': 'Puławska 474',
 'city': 'Warszawa',
 'country_code': 'PL',
 'address_text': 'Puławska 474, Warszawa',
 'marker_icon': 'architecture',
 'workplace_type': 'partly_remote',
 'company_name': 'Comtegra',
 'company_url': 'https://www.comtegra.pl/kariera/',
 'company_size': '100',
 'experience_level': 'mid',
 'latitude': '52.1185376',
 'longitude': '21.0175693',
 'published_at': '2023-03-28T20:00:10.781Z',
 'remote_interview': True,
 'open_to_hire_ukrainians': True,
 'id': 'comtegra-architekt-systemow-informatycznych-b1c43113-6ba7-4dd6-b1a7-24ed52f73539',
 'display_offer': True,
 'employment_types': [{'type': 'permanent', 'salary': None},
  {'type': 'b2b', 'salary': None}],
 'company_logo_url': 'https://bucket.justjoin.it/offers/company_logos/thumb/362a5a8b3941b1d082803829ad08e8591043aec9.jpg?1676974614',
 'skills': [{'name': 'BPMN', 'level': 3},
  {'name': 'UML', 'level': 3},
  {'name': 'Architecture', 'level': 4}],
 'remote': False}

# Transformation des données

In [5]:
# Conversion rates to euros (hard-coded snapshot values)
pln = 0.213
usd = 0.923
rates_to_eur = {'pln': pln, 'usd': usd}

# Offer categories (values of `marker_icon`) kept in the dataset
offer_types = ['data']
# Contract types whose salary range is exported as <type>_from / <type>_to
contract_types = ('b2b', 'permanent')


def _salary_range_eur(salary):
    """Return the ``(from, to)`` bounds of a salary dict converted to euros.

    ``salary`` must provide the keys ``'currency'``, ``'from'`` and ``'to'``.
    Amounts in a currency without an entry in ``rates_to_eur`` are returned
    unchanged, i.e. they are treated as if they were already in euros.
    """
    rate = rates_to_eur.get(salary['currency'])
    if rate is None:
        return salary['from'], salary['to']
    return salary['from'] * rate, salary['to'] * rate


# Flat records used to build the DataFrame: one record per (offer, skill) pair
records = []

for offer in content:
    if offer['marker_icon'] not in offer_types:
        continue

    # Start every offer from "no salary" and fill in each contract type
    # independently. Resetting both ranges on every employment type (as the
    # previous version did) kept only the salary of the last entry and let
    # values leak from the previous offer when `employment_types` was empty.
    salary_ranges = {contract: (None, None) for contract in contract_types}
    for employment_type in offer['employment_types']:
        contract = employment_type['type']
        if contract in salary_ranges and employment_type['salary'] is not None:
            salary_ranges[contract] = _salary_range_eur(employment_type['salary'])
    b2b_from, b2b_to = salary_ranges['b2b']
    permanent_from, permanent_to = salary_ranges['permanent']

    # The offer-level fields are repeated on each of the offer's skill rows
    for skill in offer['skills']:
        records.append({'offer_type': offer['marker_icon'],
                        'title': offer['title'],
                        'id': offer['id'],
                        'b2b_from': b2b_from,
                        'b2b_to': b2b_to,
                        'skill_name': skill['name'],
                        'skill_level': skill['level'],
                        'permanent_from': permanent_from,
                        'permanent_to': permanent_to,
                        'experience_level': offer['experience_level']})
    

# Création du DataFrame

In [6]:
# Build the DataFrame from the flat records (one row per offer/skill pair)
df = pd.DataFrame(records)
# Display 5 randomly drawn rows (no seed is set, so they differ between runs)
df.sample(5)

Unnamed: 0,offer_type,title,id,b2b_from,b2b_to,skill_name,skill_level,permanent_from,permanent_to,experience_level
1962,data,Data Architect,crodu-data-architect-gdansk,6262.2,7693.56,ETL,5,,,senior
472,data,SPECJALISTA DS. POWER BI,adamed-specjalista-ds-power-bi-6612475a-94d3-4...,,,Business Intelligence,3,,,mid
671,data,Data Engineer,datumo-big-data-engineer-271dab9e-0719-42b1-b1...,2982.0,5325.0,Cloud,3,,,mid
212,data,Software Data Engineer (Python/Java),infogain-software-data-engineer-python-java-ka...,4686.0,5964.0,Cloud,4,,,mid
397,data,Data Scientist - CVM&Analytics Dep.,t-mobile-polska-s-a-data-scientist-cvm-analyti...,,,Python,3,,,mid


In [7]:
# Count the rows per skill name, most frequent first
df['skill_name'].value_counts()

SQL              313
Python           241
ETL              101
AWS              101
Big Data          75
                ... 
Azure/AWS          1
TypeScript         1
SAS 4 GL           1
MDM                1
DWH/Data Mart      1
Name: skill_name, Length: 233, dtype: int64

In [8]:
# Rank the skills by the number of distinct offers mentioning them
# and keep the names of the 20 most requested ones
offers_per_skill = df.groupby('skill_name')['id'].nunique()
top_skills = offers_per_skill.sort_values(ascending=False).head(20).index
top_skills

Index(['SQL', 'Python', 'AWS', 'ETL', 'Big Data', 'T-SQL', 'Oracle',
       'Power BI', 'Data Science', 'Hadoop', 'PL/SQL', 'English', 'Azure',
       'Snowflake', 'Databases', 'Cloud', 'Spark', 'Machine Learning', 'Data',
       'Scala'],
      dtype='object', name='skill_name')

In [9]:
# Keep only the rows whose skill is one of the 20 most requested skills
is_top_skill = df['skill_name'].isin(top_skills)
df = df.loc[is_top_skill]
df.sample(5)

Unnamed: 0,offer_type,title,id,b2b_from,b2b_to,skill_name,skill_level,permanent_from,permanent_to,experience_level
1903,data,Data Scientist / Analyst,capgemini-polska-data-scientist-analyst-rzeszow,,,Data Science,3,,,mid
2579,data,Data Quality Tester,idego-data-quality-tester-wroclaw,4260.0,5325.0,Power BI,3,,,senior
245,data,Data/IT Analyst,nordea-data-it-analyst,,,SQL,3,1917.0,2769.0,mid
1901,data,Data Scientist / Analyst,capgemini-polska-data-scientist-analyst-rzeszow,,,Data,3,,,mid
2340,data,Software Consultant,andea-software-consultant-7e8013c3-0cd6-4a1b-9...,1597.5,2130.0,English,4,,,mid


# Pivot

In [10]:
# Spread the skill names into columns: one row per distinct combination of
# the remaining columns, one column per skill holding its level
offer_keys = list(df.columns.drop(['skill_name', 'skill_level']))
skill_levels = df.pivot(index=offer_keys, columns='skill_name', values='skill_level')
# fillna(0) runs while the offer fields are still index levels, so it only
# replaces missing skill levels; missing salaries stay NaN after reset_index
df = skill_levels.fillna(0).reset_index()
df.sample(5)

skill_name,offer_type,title,id,b2b_from,b2b_to,permanent_from,permanent_to,experience_level,AWS,Azure,...,Machine Learning,Oracle,PL/SQL,Power BI,Python,SQL,Scala,Snowflake,Spark,T-SQL
281,data,Data Engineer (REMOTE),devsdata-llc-data-engineer-remote-gdansk,,,,,mid,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
42,data,Analityk/Analityczka Danych,ringier-axel-springer-polska-analityk-analityc...,,,,,mid,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0
414,data,Database Administrator,provectus-database-administrator-67ab2850-71b2...,,,,,mid,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,data,Senior Technical Consultant (BA),infogain-senior-technical-consultant-ba,,,4899.0,6177.0,senior,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
208,data,Data Engineer,ipf-digital-data-engineer-krakow,,,,,mid,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0


In [11]:
# Report the shape of the pivoted DataFrame (rows, columns)
print(f'\nLe DataFrame final contient {df.shape[0]} observations et {df.shape[1]} colonnes')


Le DataFrame final contient 859 observations et 28 colonnes


# Exportation du fichier

In [12]:
# Write the result to justjoin.csv (relative path), without the row index
df.to_csv('justjoin.csv', index=False)