In [7]:
import pandas as pd
import re
import plotly.express as px

In [8]:
df = pd.read_csv('dataset/job-64bd250cb0899621267166.csv', encoding='latin-1')
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


In [3]:
# Renommer les colonnes avec des noms en minuscules et underscores
df.rename(columns=lambda x: x.lower().replace(' ', '_'), inplace=True)
# Renommer la colonne "requirment_of_the_company_" en "requirement_of_the_company"
df.rename(columns={'requirment_of_the_company_': 'requirements'}, inplace=True)
df.columns

Index(['company', 'job_title', 'location', 'job_type', 'experience_level',
       'salary', 'requirements', 'facilities'],
      dtype='object')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3198 entries, 0 to 3197
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Company                     3197 non-null   object
 1   Job Title                   3197 non-null   object
 2   Location                    3197 non-null   object
 3   Job Type                    3197 non-null   object
 4   Experience level            2962 non-null   object
 5   Salary                      3009 non-null   object
 6   Requirment of the company   3198 non-null   object
 7   Facilities                  3198 non-null   object
dtypes: object(8)
memory usage: 200.0+ KB


In [36]:
#affichage du nombre des doublons
df.duplicated().sum()

202

In [37]:
# Supprimer les doublons du DataFrame
df = df.drop_duplicates()
df.duplicated().sum()

0

**Data preprocessing**

In [38]:
## Vérifier le nombre de valeurs manquantes par colonne
missing_values = df.isnull().sum()
missing_values

company               1
job_title             1
location              1
job_type              1
experience_level    228
salary              172
requirements          0
facilities            0
dtype: int64

In [39]:
# Créer la heatmap des valeurs manquantes
fig = px.imshow(df.isnull(), color_continuous_scale='Viridis')
fig.update_layout(title_text='Heatmap des valeurs manquantes')
fig.show()

In [40]:
df.loc[df['job_title'].isna()]

Unnamed: 0,company,job_title,location,job_type,experience_level,salary,requirements,facilities
797,,,,,,,",,,,,",",,,,"


In [41]:
# Supprimer la ligne avec l'index 797
df.drop(797, inplace=True)

In [42]:
#df.loc[df['Experience level'].isna()]
df['job_type'].unique().tolist()

['Full Time', 'Internship', 'Part Time']

In [43]:
# Remplir les valeurs manquantes de la colonne "Experience level" par 'no-specific'
df['experience_level'].fillna('no-specific', inplace=True)

In [44]:
df['job_type'].unique().tolist()

['Full Time', 'Internship', 'Part Time']

In [45]:
# Remplir toutes les valeurs manquantes de la colonne "Salary" par 0
df['salary'].fillna(0, inplace=True)

In [46]:
# Fonction pour vérifier si un salaire a une étoile (*)
def has_star(salary):
    return True if '*' in str(salary) else False    
# Ajouter la colonne "salary_conditions" pour indiquer si un salaire a une étoile (*)
df['salary_conditions'] = df['salary'].apply(has_star)

In [47]:
# Fonction pour extraire la devise à partir de la valeur de salaire en utilisant des regex
def extract_currency(salary):
    if isinstance(salary, str):
        match = re.search(r'(EUR|GBP)', salary)
        if match:
            return match.group()
    return 'USD'

# Ajouter une colonne "Devise" pour stocker les informations sur la devise
df['currency'] = df['salary'].apply(extract_currency)

In [48]:
# Fonction pour nettoyer la valeur de salaire
def clean_salary(salary):
    # Vérifier si la valeur de salaire est une chaîne de caractères
    if isinstance(salary, str):
        # Extraire les parties numériques à l'aide d'expressions régulières
        matches = re.findall(r'\d+', salary)
        
        if len(matches) > 0:
            # Joindre les parties numériques pour former un nombre entier
            cleaned_salary = int(''.join(matches))
            
            # Vérifier si la valeur contient un "K" pour multiplier par 1000
            if 'K' in salary:
                cleaned_salary *= 1000
        else:
            # Si aucune partie numérique n'est trouvée, utiliser la valeur "None"
            cleaned_salary = None
    else:
        # Si la valeur n'est pas une chaîne de caractères, utiliser la valeur "None"
        cleaned_salary = None
    
    return cleaned_salary

# Appliquer la fonction de nettoyage sur la colonne "Salaire"
df['salary'] = df['salary'].apply(clean_salary)

In [49]:
# Fonction pour multiplier le salaire en fonction de la devise
def multiply_salary(row):
    salary = row['salary']
    currency = row['currency']
    
    if currency == 'GBP':
        return salary * 1.30
    elif currency == 'EUR':
        return salary * 1.11
    else:
        return salary

# Appliquer la fonction de multiplication sur la colonne "Salaire"
df['salary'] = df.apply(multiply_salary, axis=1)

In [50]:
df.loc[:, 'facilities'] = df['facilities'].str.replace(',+', ',')
df.loc[:, 'facilities'] = df['facilities'].str.split(',').apply(lambda x: [item for item in x if item])
df['facilities'].replace('[]', "['Not found']", inplace = True)

In [51]:
df.loc[:, 'requirements'] = df['requirements'].str.replace(',+', ',')
df.loc[:, 'requirements'] = df['requirements'].str.split(',').apply(lambda x: [item for item in x if item])

In [52]:
# #Convert the "Location" column to lowercase
# df['Location'] = df['location'].str.lower()
# df['job_title'] = df['job_title'].str.lower()

In [53]:
# Rechercher le mot "remote" dans les colonnes "Location" et "Job Title"
df['Is Remote'] = df['location'].str.contains(r'\bremote\b', case=False) | df['job_title'].str.contains(r'\bremote\b', case=False)
# Remplacer les valeurs booléennes par 'remote' ou 'local' dans une nouvelle colonne
df['work_mode'] = df['Is Remote'].replace({True: 'remote', False: 'onsite'})

In [54]:
from geotext import GeoText

# Fonction pour extraire le nom du pays à partir de la colonne "Location"
def extract_country(location):
    places = GeoText(location)
    if places.countries:
        return places.countries[0]
    else:
        return None

# Fonction pour extraire le nom de la ville à partir de la colonne "Location"
def extract_city(location):
    places = GeoText(location)
    if places.cities:
        return places.cities[0]
    else:
        return None

# Appliquer les fonctions sur la colonne "Location" pour extraire les noms de pays et de villes
df['country'] = df['location'].apply(extract_country)
df['city'] = df['location'].apply(extract_city)

In [23]:
df.loc[df['company'] == 'DeepL']

Unnamed: 0,company,job_title,location,job_type,experience_level,salary,requirements,facilities,salary_conditions,currency,Is Remote,work_mode,country,city
19,DeepL,"Data Scientist | Insights (f/m/d) - GER, UK, N...",Remote job,Full Time,Senior-level,129000.0,"[Agile, Data visualization, Economics, Machine...",[Career development],True,USD,True,remote,,
47,DeepL,Data Analyst | Marketing or Sales (f/m/d) - GE...,Remote job,Full Time,Senior-level,92000.0,"[Agile, Data analysis, Data visualization, Eco...","[Career development, Team events]",True,USD,True,remote,,


In [24]:
from geopy.geocoders import Nominatim

# Créez une instance du géocodeur Nominatim
geolocator = Nominatim(user_agent="city_country_fill")

# Fonction pour récupérer le pays en fonction de la ville
def get_country(row):
    city = row['city']
    country = row['country']
    if country is None and city:
        try:
            location = geolocator.geocode(city)
            if location:
                return location.address.split(",")[-1].strip()
            else:
                return None
        except:
            return None
    else:
        return country


# Remplir les valeurs manquantes dans la colonne "country" à partir des villes
df['country'] = df.apply(get_country, axis=1)

In [25]:
problematic_rows = df.loc[(df['work_mode'] == 'onsite') & (df['country'].isna())]
problematic_rows.shape

(87, 14)

In [55]:
# Remplir les valeurs manquantes dans la colonne "city" et "country" avec "Not found"
df['city'] = df['city'].fillna("Not found")
df['country'] = df['country'].fillna("Not found")

In [56]:
df.to_csv('job_opportunities_clean.csv')

In [58]:
df

Unnamed: 0,company,job_title,location,job_type,experience_level,salary,requirements,facilities,salary_conditions,currency,Is Remote,work_mode,country,city
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48000.0,"[Computer Science, Data quality, Genetics, Mat...",[],True,USD,False,onsite,United States,Richardson
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48000.0,"[Agile, Data management, Finance, Security]",[],True,USD,False,onsite,Mauritius,Not found
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,no-specific,90000.0,"[Agile, Architecture, AWS, Computer Science, C...",[Career development],True,USD,False,onsite,United States,South Jordan
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48000.0,"[Engineering, Industrial, Oracle, Power BI, R,...",[],True,USD,False,onsite,Italy,Not found
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108000.0,"[AWS, Azure, Computer Science, Consulting, Dat...","[Flex hours, Flex vacation, Parental leave, Un...",False,USD,False,onsite,United States,Arlington
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,CCRi,"Application Integration Engineer, Computer Vis...","Chantilly, Virginia, United States",Full Time,Mid-level,113000.0,"[Agile, Angular, APIs, Architecture, AWS, Azure]","[401(k) matching, Career development, Flex hou...",False,USD,False,onsite,United States,Chantilly
3191,Publicis Groupe,"Associate Director, Data Science","New York City, United States",Full Time,Mid-level,106000.0,"[Bayesian, Classification, Clustering, Data an...","[Career development, Health care]",False,USD,False,onsite,United States,New York
3192,DoorDash,"Senior Software Engineer, Machine Learning - A...","Sunnyvale, CA; San Francisco, CA; New York",Full Time,Senior-level,176000.0,"[Computer Science, Data analysis, Engineering,...","[401(k) matching, Career development, Equity, ...",False,USD,False,onsite,Not found,Sunnyvale
3193,Western Digital,Data Scientist - New College Graduate,"Biñan, Philippines",Full Time,Entry-level,39000.0,"[APIs, Clustering, Computer Science, Data visu...",[Career development],True,USD,False,onsite,Philippines,Not found
