https://github.com/tej-prash/Job-Recommendation-System


https://github.com/611noorsaeed/Job-Recommendation-System-Machine-Learning/tree/main

https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b#:~:text=How%20does%20the%20BERT%20model,the%20meaning%20of%20the%20text.

In [4]:
import pandas as pd
import numpy as np

# Cleaning
from datetime import datetime, timedelta
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Similiraty 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from difflib import SequenceMatcher

# Save models
import pickle 

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Please download it once
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\axeld\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\axeld\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\axeld\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [45]:
data_job = pd.read_csv('data\dice_com-job_us_sample.csv')
len(data_job)

22000

# **Recommendation by search**

When conducting a job search, it is crucial to provide job recommendations that precisely match the user's specific criteria. For this purpose, the recommendation function takes into consideration various aspects:

- **Same Title**: Similarity is assessed using vectorization and frequency analysis of the job title.

- **Same Description**: Comparison is performed using vectorization and frequency analysis of the job description.

- *(**Same Skills**: Relevance of required skills is evaluated through vectorization and frequency analysis of mentioned skills.)*

- **Same Company**: A bonus is awarded to jobs from the same company as the initial search.

- **Same Employment Status**: An additional bonus is granted to jobs sharing the same employment status as specified in the search.

- **Same Location**: A filtering function may be applied to return only jobs located in the same geographical region as the search.

- **Recently Posted**: The function can also filter results based on the recency of job postings, providing suggestions that align with freshness criteria.

This multi-criteria approach ensures that recommendations align comprehensively with the user's preferences, considering various factors such as semantic similarity, company affiliation, employment status, location, and posting recency.


In [46]:
data_job = data_job[['company', 'employmenttype_jobstatus', 'jobdescription', 
                     'joblocation_address', 'jobtitle', 'postdate', 'shift', 'skills']]

In [47]:
data_job.dropna(inplace=True)
data_job.drop_duplicates(inplace=True)

In [48]:
# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

def txt_cleaning(text):
    """
    Clean a given text, typically used for job descriptions and titles.

    Parameters:
    - text (str): The input text to be cleaned.

    Returns:
    str: The cleaned and processed text.

    Steps:
    1. Keep only alpha-numeric characters.
    2. Tokenize the text for better processing.
    3. Apply lemmatization to reduce words to their base form.
    4. Remove common English stop words.
    """
    # Keep only alpha-numeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = text.replace('\xa0', ' ')

    # Tokenize for better processing
    tokens = word_tokenize(text.lower())

    # Apply lemmatization and remove stop words using list comprehension
    lemmatization = [lemmatizer.lemmatize(w) for w in tokens if w not in stopwords.words('english')]

    return " ".join(lemmatization)

In [49]:
# 20 min to run !!!
print('Cleaning jobtitle')
data_job['jobtitle'] = data_job['jobtitle'].apply(lambda x: txt_cleaning(x))

print('Cleaning jobdescription')
data_job['jobdescription'] = data_job['jobdescription'].apply(lambda x: txt_cleaning(x))

def convert_date(date):
    # Remove ' ago' from the date
    date = date.replace(' ago', '')

    # If date is like 'moments ago'
    if len(date.split()) == 1:
        return datetime.now()

    # If date is like '7 minutes ago'
    if date.split(' ')[1] in ['minute', 'minutes']:
        minutes_ago = int(date.split(' ')[0])
        return datetime.now() - timedelta(minutes=minutes_ago)

    # If date is like '2 hours ago'
    if date.split(' ')[1] in ['hour', 'hours']:
        hours_ago = int(date.split(' ')[0])
        return datetime.now() - timedelta(hours=hours_ago)

    # If date is like '2 weeks ago'
    if date.split(' ')[1] in ['week', 'weeks']:
        weeks_ago = int(date.split(' ')[0])
        return datetime.now() - timedelta(weeks=weeks_ago)

    # If date is like '1 month ago'
    if date.split(' ')[1] in ['month', 'months']:
        months_ago = int(date.split(' ')[0])
        return datetime.now() - timedelta(days=30 * months_ago)

data_job['postdate'] = data_job['postdate'].apply(lambda x: convert_date(x))

data_job['skills'].fillna('', inplace=True)
data_job['skills'] = data_job['skills'].apply(lambda x: txt_cleaning(x))
data_job['requirements'] = data_job['jobtitle'].apply(lambda x: x.lower()) + ' ' + data_job['jobdescription'].apply(lambda x: x.lower()) + ' ' + data_job['skills'].apply(lambda x: x.lower())

data_job.head()
data_job.to_csv('data\data_job_clean.csv', header=True, index=False)

Cleaning jobtitle
Cleaning jobdescription


In [51]:
data_job.head()

Unnamed: 0,company,employmenttype_jobstatus,jobdescription,joblocation_address,jobtitle,postdate,shift,skills,requirements
0,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",looking selenium engineer must solid java codi...,"Atlanta, GA",automation test engineer,2024-01-15 11:16:01.533834,Telecommuting not available|Travel not required,see,automation test engineer looking selenium engi...
1,University of Chicago/IT Services,Full Time,university chicago rapidly growing security pr...,"Chicago, IL",information security engineer,2024-01-08 12:16:01.533834,Telecommuting not available|Travel not required,linux unix network monitoring incident respons...,information security engineer university chica...
2,"Galaxy Systems, Inc.",Full Time,galaxe solutionsevery day solution affect peop...,"Schaumburg, IL",business solution architect,2024-01-01 12:16:01.533834,Telecommuting not available|Travel not required,enterprise solution architecture business inte...,business solution architect galaxe solutionsev...
3,TransTech LLC,Full Time,java developerfull time direct hirebolingbrook...,"Bolingbrook, IL",java developer mid level ft great culture mode...,2024-01-01 12:16:01.533834,Telecommuting not available|Travel not required,please see job description,java developer mid level ft great culture mode...
4,Matrix Resources,Full Time,midtown based high tech firm immediate need in...,"Atlanta, GA",devops engineer,2024-01-15 11:28:01.533834,Telecommuting not available|Travel not required,configuration management developer linux manag...,devops engineer midtown based high tech firm i...


In [15]:
# Vectorization TF-IDF
data_job = pd.read_csv('data/data_job_clean.csv')
data_job = data_job[:len(data_job) // 8]
data_job['postdate'] = pd.to_datetime(data_job['postdate'])

vectorizer = TfidfVectorizer()
matrix_title = vectorizer.fit_transform(data_job['jobtitle'])
matrix_description = vectorizer.fit_transform(data_job['jobdescription'])

# Similarity calculus
similarities_title = cosine_similarity(matrix_title)
similarities_description = cosine_similarity(matrix_description)

In [8]:
# # Save the models in a file 
# with open('models\similarties_title.pkl', 'wb') as file:
#     pickle.dump(similarities_title, file)

# with open('models\similarities_description.pkl', 'wb') as file:
#     pickle.dump(similarities_description, file)

In [20]:
def detect_keywords(main_string, keywords):
    main_list = main_string.split(' ')
    keywords_list = keywords.split(' ')
    return all(keyword in main_list for keyword in keywords_list)

def find_best_search_indx(expression, data_job, top_n=50):
    expression = expression.lower()
    meilleurs_scores = [0] * top_n
    meilleurs_indices = [None] * top_n

    for index, job_title in enumerate(data_job['jobtitle']):
        job_title_lower = job_title.lower()
        score = SequenceMatcher(None, expression, job_title_lower).ratio()

        for i, top_score in enumerate(meilleurs_scores):
            if score > top_score:
                meilleurs_scores[i] = score
                meilleurs_indices[i] = index
                break

    return [index for index in meilleurs_indices if index is not None]

def search(input_text, **kwargs):
    """
    Search for a job in the dataset based on the input text and optional filters.

    Parameters:
    - input_text (str): Text entered by the user.
    - **kwargs (dict): Filters for company, employmenttype_jobstatus, and joblocation.

    Returns:
    pd.Series: The line in the dataset that matches the criteria, or None if no match is found.
    """
    # Security None in kwargs
    keys_to_remove = [col for col, value in kwargs.items() if value is None]
    for key in keys_to_remove:
        del kwargs[key]

    if input_text is None:
        return [value for _, value in data_job.head(10).T.to_dict().items()]

    # Initialize a mask to filter the dataset
    mask = (data_job['jobtitle'].apply(lambda x: detect_keywords(x, input_text)) | data_job['jobtitle'].str.contains(input_text, case=False, na=False))

    # If the mask is not empty = if the research is exactly found in the dataset
    if not mask[mask == True].empty:
        # Apply additional filters if provided
        if kwargs:
            for key, value in kwargs.items():
                mask &= (data_job[key] == value)

        # Get the matching row from the dataset
        result = data_job[mask]

        # Return the result (a DataFrame if there are matches, None otherwise)
        return [value for _,value in result.T.to_dict().items()] if not result.empty else [value for _, value in data_job.head(10).T.to_dict().items()]
    
    # else we look through similarity
    else:
        # Use the function to find the most similar job title
        result = find_best_search_indx(input_text, data_job)
        temp = data_job.loc[result]

        if result:
            # Apply additional filters if provided
            if kwargs:
                for key, value in kwargs.items():
                    mask = (data_job[key].apply(lambda x: detect_keywords(x, value)) | data_job[key].str.contains(value, case=False, na=False))
                    temp = temp[mask]

            # Return the result (a DataFrame if there are matches, None otherwise)
            return [value for _, value in temp.T.to_dict().items()] if not temp.empty else [value for _, value in data_job.head(10).T.to_dict().items()]
        else:
            return [value for _, value in data_job.head(10).T.to_dict().items()]

def calculate_recency_bonus(date_published):
    current_date = datetime.now()
    delta = current_date - date_published
    days_ago = delta.days
    return max(0, 0.5 - days_ago * 0.02)  # Bonus decreases linearly over time


def recommendation_search(searchs, **kwargs):
    """
    Search for job recommendations based on a given search result and optional filters.

    Parameters:
    - searchs (list): historical searchs from the user
    - **kwargs (dict): Optional filters for company, employmenttype_jobstatus, and joblocation.

    Returns:
    - pd.Series: Job titles that match the search criteria and filters, sorted by relevance. Top 11 as the first one is tu current search

    Notes:
    - The function calculates a relevance score based on the similarity of the job title and description.
    - Top 10 jobs with the highest relevance scores are returned.
    - Additional filters can be applied using **kwargs to refine the search.
    - A recency bonus is applied to prioritize more recent job postings.

    Formula used : score = 0.3 * title_similarity + 0.7 * description_similitary + sum(1 for each filter macthed) + max(0, 0.5 - days_ago * 0.02)

    """
    if searchs is None:
        return None
    
    result = {}
    for s in searchs:

        search_result = search(s)
        
        # Find index of the search
        search_result = pd.DataFrame(search_result)
        search_result = data_job.reset_index().merge(search_result, on=['company', 'jobdescription_old', 
                                                                    'joblocation_address', 'jobtitle_old', 'postdate',
                                                                    'jobtitle', 'jobdescription', 'requirements'], 
                                                                how='inner')

        for indx in search_result['index']:
            # Get similarities with this search
            score_title = list(enumerate(similarities_title[indx]))
            score_description = list(enumerate(similarities_description[indx]))

            # Calculate relevance scores for each job
            scores = {score_title[i][0]: score_title[i][1] * 0.3 + score_description[i][1] * 0.7 for i in range(len(score_title))}
            scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:11])

            # Apply additional filters if provided
            if len(kwargs) > 0:
                for idx, score in scores.items():
                    temp = data_job.loc[idx]
                    for col, value in kwargs.items():
                        if value.lower() in temp[col].lower():
                            scores[idx] += 1

            # Apply recency bonus
            for idx in scores.keys():
                recency_bonus = calculate_recency_bonus(data_job.loc[idx]['postdate'])
                scores[idx] += recency_bonus
            

        # Get top 10 jobs based on the combined relevance score and filters
        scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10])
        for k,v in scores.items():
            if k in result.keys():
                result[k] += v
            else:
                result[k] = v


    # Create a DataFrame with job titles, filter values, and scores
    res = data_job.loc[result.keys()][['jobtitle', 'jobdescription_old', 'company'] + list(kwargs.keys())]
    res['score'] = [result[idx] for idx in result]
    res.sort_values('score', ascending=False, inplace=True)
    
    return [value for _, value in res.head(10).T.to_dict().items()]

In [21]:
recommendation_search(['machine learning'])

[{'jobtitle': 'machine learning data scientist qpid boston',
  'jobdescription_old': 'Would you like to contribute creatively to a large, meaningful mission? We are in the processing of transforming healthcare. In particular, we are transforming the way that care givers across the country integrate information and make decisions.With the recent combination of QPID Health and eviCore healthcare, we now have an unprecedented combination of scale and industry leading technology and talent to change the way we experience healthcare.Specifically designed with the size and scale to address the complexity of today’s and tomorrow’s healthcare system, we are a company committed to advancing medical benefits management – and enabling better outcomes for patients, providers, and plans.Ours is an evidence-based approach that leverages our exceptional capabilities, powerful analytics, and an acute sensitivity to the challenges and needs of everyone involved across the healthcare spectrum. Applying 

# **Recommendation by profile**

Predict DevType:
- ~~BERT: Consider more columns than skills + groupby(DevType)~~ : to many type have the same skills

Work on job:
- Extract skills
- KNN on job
- ~~BERT on skills~~ : to many jobs 

In [3]:
# data_job = pd.read_csv('data\dice_com-job_us_sample.csv')
data_job = pd.read_csv('data\data_job_clean.csv')
data_job.head()

Unnamed: 0,company,employmenttype_jobstatus,jobdescription,joblocation_address,jobtitle,postdate,shift,skills
0,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,SEE BELOW
1,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,"linux/unix, network monitoring, incident respo..."
2,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,"Enterprise Solutions Architecture, business in..."
3,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,Please see job description
4,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,"Configuration Management, Developer, Linux, Ma..."


In [110]:
data_job['skills'].fillna('', inplace=True)
data_job['skills'] = data_job['skills'].apply(lambda x: txt_cleaning(x))
data_job['requirements'] = data_job['jobtitle'].apply(lambda x: x.lower()) + ' ' + data_job['jobdescription'].apply(lambda x: x.lower()) + ' ' + data_job['skills'].apply(lambda x: x.lower())

data_job.head()
data_job.to_csv('data\data_job_clean.csv', header=True, index=False)

In [58]:
SKILLS_USER = [
    "Python",
    "Java",
    "C++",
    "JavaScript",
    "HTML/CSS",
    "SQL",
    "Git",
]

SKILLS_USER = [' '.join(SKILLS_USER)]

SKILLS_JOBS = data_job['requirements']

In [59]:
vectorizer = TfidfVectorizer()
skills_tfidf = vectorizer.fit_transform(SKILLS_USER)
requirements_tfidf = vectorizer.transform(SKILLS_JOBS)


In [61]:
# Définir le nombre de voisins à considérer
k = 10
nn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
nn_model.fit(requirements_tfidf)

# Trouver les postes les plus pertinents pour chaque compétence
for skill in SKILLS_USER:
    skill_tfidf = vectorizer.transform([skill])
    _, indices = nn_model.kneighbors(skill_tfidf)

    recommended_jobs = data_job.loc[indices[0], 'jobtitle'].values
    print(f"Pour la compétence '{skill}', les postes recommandés sont : {', '.join(recommended_jobs)}")


Pour la compétence 'Python Java C++ JavaScript HTML/CSS SQL Git', les postes recommandés sont : Full Stack PHP Engineer, Full Stack Developer, Full Stack Developer, Sr. iOS Developer, Software Engineer, Full Stack Developer, Angular Mobile IOS Developer, Senior Full Stack Developer, M.E.A.N. Stack Developer, Senior Software Developer


In [None]:
def recommendation_job(skill, requirements_tfidf):
    k = 10
    nn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
    nn_model.fit(requirements_tfidf)

    # Trouver les postes les plus pertinents pour chaque compétence
    for skill in SKILLS_USER:
        skill_tfidf = vectorizer.transform([skill])
        _, indices = nn_model.kneighbors(skill_tfidf)

        recommended_jobs = data_job.loc[indices[0], 'jobtitle'].values



# **Research**

In [150]:
data_job = pd.read_csv('data/data_job_clean.csv')
data_job.columns

Index(['company', 'employmenttype_jobstatus', 'jobdescription_old',
       'joblocation_address', 'jobtitle_old', 'postdate', 'shift', 'skills',
       'jobtitle', 'jobdescription', 'requirements'],
      dtype='object')

In [158]:
data_job = data_job[['company', 'jobdescription_old',
       'joblocation_address', 'jobtitle_old', 'postdate',
       'jobtitle', 'jobdescription', 'requirements']]

data_job = data_job[:len(data_job)-1000]

data_job.to_csv('data/data_job_clean.csv')

### **BERT on skills**

Let's try to build our label column, grouping as much as possible jobs together by their title.

In [126]:
temp = data_job.copy()
temp['jobtitle'] = temp['jobtitle'].apply(lambda x: x.lower().strip())
nb_job = len(temp['jobtitle'].unique())

In [127]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Prétraitement des données
stop_words = stopwords.words('english')  # Utilisez la liste directement, pas un ensemble
vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(temp['jobtitle'].fillna(''))

# Calculer la similarité du cosinus
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Définir un seuil de similarité
threshold = 1  # Vous pouvez ajuster ce seuil en fonction de vos besoins

threshold = 0.8
similar_indices = cosine_similarities[0].argsort()[::-1][cosine_similarities[0][cosine_similarities[0].argsort()[::-1]] > threshold][1:]

similar_indices

array([15236,  8484,  5381, 21497,  2483,  3912,  9879, 11127, 11447,
        1771, 20440, 16035, 13393, 12905, 15059, 19279, 17690,  6444,
       17546,   299,  8849, 17022, 15650, 10192], dtype=int64)

In [128]:
# Regrouper les jobtitles similaires
met = []
for i, row in tqdm(temp.iterrows()):
    # # If we already met this title
    # if i in met:
    #     continue

    # Keep the index with similarity > treshold except the first one that is the current index
    similar_indices = cosine_similarities[i].argsort()[::-1][cosine_similarities[i][cosine_similarities[i].argsort()[::-1]] > threshold][1:] 
    similar_indices = [idx for idx in similar_indices if idx not in met]

    # # Store index already met
    # met.append(i)
    # met = met + list(similar_indices)

    # Update title 
    for idx in similar_indices:
        temp.loc[idx, 'jobtitle'] = temp.loc[i, 'jobtitle']




22000it [00:55, 397.58it/s]


In [129]:
temp2 = temp.copy()
temp2['jobtitle'] = temp2['jobtitle'].apply(lambda x: x.lower().strip())
nb_job_after_clean = len(temp2['jobtitle'].unique())

print(f'Nb jobs before cleaning: {nb_job}\nNb jobs after cleaning: {nb_job_after_clean}')

Nb jobs before cleaning: 14928
Nb jobs after cleaning: 9985


### **BERT on profile**

Let's try to groupby the DevType to see if we can have specific skills

In [130]:
data_user = pd.read_csv('data\survey_results_public.csv')
data_user.head()

Unnamed: 0,Respondent,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,...,Exercise,Gender,SexualOrientation,EducationParents,RaceEthnicity,Age,Dependents,MilitaryUS,SurveyTooLong,SurveyEasy
0,1,Yes,No,Kenya,No,Employed part-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,20 to 99 employees,Full-stack developer,...,3 - 4 times per week,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Black or of African descent,25 - 34 years old,Yes,,The survey was an appropriate length,Very easy
1,3,Yes,Yes,United Kingdom,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)","A natural science (ex. biology, chemistry, phy...","10,000 or more employees",Database administrator;DevOps specialist;Full-...,...,Daily or almost every day,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",White or of European descent,35 - 44 years old,Yes,,The survey was an appropriate length,Somewhat easy
2,4,Yes,Yes,United States,No,Employed full-time,Associate degree,"Computer science, computer engineering, or sof...",20 to 99 employees,Engineering manager;Full-stack developer,...,,,,,,,,,,
3,5,No,No,United States,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",100 to 499 employees,Full-stack developer,...,I don't typically exercise,Male,Straight or heterosexual,Some college/university study without earning ...,White or of European descent,35 - 44 years old,No,No,The survey was an appropriate length,Somewhat easy
4,7,Yes,No,South Africa,"Yes, part-time",Employed full-time,Some college/university study without earning ...,"Computer science, computer engineering, or sof...","10,000 or more employees",Data or business analyst;Desktop or enterprise...,...,3 - 4 times per week,Male,Straight or heterosexual,Some college/university study without earning ...,White or of European descent,18 - 24 years old,Yes,,The survey was an appropriate length,Somewhat easy


In [133]:
# Columns that are about skills
skills_col = ['LanguageWorkedWith','DatabaseWorkedWith',
              'PlatformWorkedWith','FrameworkWorkedWith',
              'IDE','OperatingSystem']

# Keep relevant columns
data_user_skills = data_user[['Respondent', 'DevType'] + skills_col]

# Fill nan to '' to not disturb the union
data_user_skills.fillna('', inplace=True)

# Skills union
data_user_skills['skills'] = (data_user_skills['LanguageWorkedWith'].apply(lambda x: ' '.join(x.split(';')) + ' ') + 
                              data_user_skills['DatabaseWorkedWith'].apply(lambda x: ' '.join(x.split(';')) + ' ') + 
                              data_user_skills['PlatformWorkedWith'].apply(lambda x: ' '.join(x.split(';')) + ' ') + 
                              data_user_skills['FrameworkWorkedWith'].apply(lambda x: ' '.join(x.split(';')) + ' ') + 
                              data_user_skills['IDE'].apply(lambda x: ' '.join(x.split(';')) + ' ') + 
                              data_user_skills['OperatingSystem'].apply(lambda x: ' '.join(x.split(';')))).str.strip()

# Drop additionnal skills between parenthesis
data_user_skills['skills'] = data_user_skills['skills'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))

# Drop empty skills row
data_user_skills = data_user_skills[~(data_user_skills['skills'] == '')]

# Drop duplicates in skills
data_user_skills['skills'] = data_user_skills['skills'].apply(lambda x: set(x.split()))

# Keep relevant columns
data_user_skills = data_user_skills[['Respondent', 'DevType', 'skills']]

# Explode DevType to groupby after
data_user_skills['DevType'] = data_user_skills['DevType'].str.split(';')
data_user_skills['DevType'] = data_user_skills['DevType'].explode('DevType').reset_index(drop=True)

grouped_data = data_user_skills.groupby('DevType').agg({
    'Respondent': list,
    'skills': lambda x: set.union(*x)
    }).reset_index()

grouped_data


Unnamed: 0,DevType,Respondent,skills
0,,"[350, 357, 898, 1280, 1350, 1581, 1939, 2193, ...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
1,Back-end developer,"[34, 46, 50, 53, 63, 71, 79, 91, 95, 105, 111,...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
2,"C-suite executive (CEO, CTO, etc.)","[37, 245, 536, 662, 667, 928, 1095, 1239, 1796...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
3,Data or business analyst,"[9, 38, 54, 419, 448, 484, 565, 569, 596, 730,...","{Cobol, TextMate, Go, Atom, Amazon, Server, Ma..."
4,Data scientist or machine learning specialist,"[55, 142, 236, 323, 403, 420, 438, 445, 513, 5...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
5,Database administrator,"[3, 39, 56, 59, 72, 124, 135, 219, 247, 263, 3...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
6,Designer,"[29, 75, 112, 122, 146, 163, 264, 315, 468, 50...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
7,Desktop or enterprise applications developer,"[10, 66, 76, 168, 187, 196, 213, 279, 371, 387...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
8,DevOps specialist,"[41, 81, 125, 214, 237, 304, 361, 400, 469, 57...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."
9,Educator or academic researcher,"[164, 240, 708, 1079, 1146, 1227, 1309, 1344, ...","{Cobol, TextMate, Go, Atom, Server, Amazon, Ma..."


In [132]:
def get_job_from_user():
    return data_user['DevType'].dropna().str.split(';').explode('DevType').reset_index(drop=True).unique()

print(get_job_from_user())


['Full-stack developer' 'Database administrator' 'DevOps specialist'
 'System administrator' 'Engineering manager' 'Data or business analyst'
 'Desktop or enterprise applications developer'
 'Game or graphics developer' 'QA or test developer' 'Student'
 'Back-end developer' 'Front-end developer' 'Designer'
 'C-suite executive (CEO, CTO, etc.)' 'Mobile developer'
 'Data scientist or machine learning specialist'
 'Marketing or sales professional' 'Product manager'
 'Embedded applications or devices developer'
 'Educator or academic researcher']
