In [9]:
import pandas as pd
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np
from scrape_data import run_scrapper
from iteration_utilities import flatten
from collections import Counter
import plotly.express as px


In [2]:
# Starting page for the scraper: a LinkedIn job-search results URL whose query
# string asks for "machine learning engineer" postings located in Germany.
first_page = 'https://www.linkedin.com/jobs/search/?currentJobId=3433766873&f_JT=F&f_T=25206&geoId=101282230&keywords=machine%20learning%20engineer&location=Germany&refresh=true&sortBy=R'
# run_scrapper is imported from the local scrape_data module; its internals are
# not visible in this notebook. Judging by the preview below it yields one row
# per posting with job_title, job_post_date, job_page_link and job_description.
df = run_scrapper(first_page)
df.head()

Unnamed: 0,job_title,job_post_date,job_page_link,job_description
0,Machine Learning Engineer (m/w/d),2023-02-21,https://de.linkedin.com/jobs/view/machine-lear...,Wir suchen eine/n Machine Learning Engineer/in...
1,AI / Machine Learning Engineer,2023-03-03,https://de.linkedin.com/jobs/view/ai-machine-l...,Key ResponsibilitiesDevelop and implement AI m...
2,Senior Machine Learning Engineer (Berlin),2023-02-12,https://de.linkedin.com/jobs/view/senior-machi...,Hey there! 👋Are you an ambitious Senior Machin...
3,Machine Learning and AI Engineer,2023-03-13,https://de.linkedin.com/jobs/view/machine-lear...,Machine Learning and AI Engineer www.cyberfame...
4,Machine Learning Engineer (m/f/d),2023-03-08,https://de.linkedin.com/jobs/view/machine-lear...,Who We Aredelphai is in one of the most exciti...


In [3]:
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The model and tokenizer are both loaded from the same identifier, and the
    token-level predictions are merged into phrases before being returned.
    """

    def __init__(self, model, *args, **kwargs):
        """Load the model and tokenizer for ``model`` and build the pipeline.

        Args:
            model: Identifier or path accepted by ``from_pretrained``.
            *args: Extra positional arguments forwarded to the parent pipeline.
            **kwargs: Extra keyword arguments forwarded to the parent pipeline.
        """
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, model_outputs):
        """Turn raw model outputs into a sorted array of unique keyphrases.

        Args:
            model_outputs: Outputs handed over by the pipeline's forward step.

        Returns:
            numpy array of the distinct, whitespace-stripped ``"word"`` values
            of the aggregated entities (``np.unique`` also sorts them).
        """
        # Pass the outputs positionally: the parent's first parameter is named
        # ``model_outputs`` in older transformers releases and ``all_outputs``
        # in newer ones, so a keyword argument only works with one of them.
        entities = super().postprocess(
            model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([entity.get("word").strip() for entity in entities])
# Load pipeline: build a single extractor that every later cell reuses. The
# model id is handed to from_pretrained for both the model and the tokenizer
# (see KeyphraseExtractionPipeline.__init__).
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

In [4]:
data_keyword = []  # not read by any visible cell; kept in case other cells use it


def get_keywords(text):
    """Return the keyphrases that the global ``extractor`` finds in ``text``.

    Args:
        text: Job-description text. Anything that is not a non-blank string
            (for example NaN from a posting whose description was not
            scraped) is treated as "no keyphrases".

    Returns:
        The result of ``extractor(text)`` for non-blank strings, otherwise an
        empty string array.
    """
    if not isinstance(text, str) or not text.strip():
        return np.array([], dtype=str)
    return extractor(text)


# Only job_description is needed, so map over that one column instead of
# building a full row object for every posting.
df['key_words'] = df['job_description'].map(get_keywords)
df.head()

Unnamed: 0,job_title,job_post_date,job_page_link,job_description,key_words
0,Machine Learning Engineer (m/w/d),2023-02-21,https://de.linkedin.com/jobs/view/machine-lear...,Wir suchen eine/n Machine Learning Engineer/in...,"[Cloud Computing, GCloud, Lambda, Paramax, PyT..."
1,AI / Machine Learning Engineer,2023-03-03,https://de.linkedin.com/jobs/view/ai-machine-l...,Key ResponsibilitiesDevelop and implement AI m...,"[Keras, Machine Learning, Pytorch, Tensorflow,..."
2,Senior Machine Learning Engineer (Berlin),2023-02-12,https://de.linkedin.com/jobs/view/senior-machi...,Hey there! 👋Are you an ambitious Senior Machin...,"[Enjins, Kafka, Kubernetes, Machine Learning, ..."
3,Machine Learning and AI Engineer,2023-03-13,https://de.linkedin.com/jobs/view/machine-lear...,Machine Learning and AI Engineer www.cyberfame...,"[Data Engineering, Ethereum Network, Machine L..."
4,Machine Learning Engineer (m/f/d),2023-03-08,https://de.linkedin.com/jobs/view/machine-lear...,Who We Aredelphai is in one of the most exciti...,"[Ops, communication skills, company data integ..."


In [6]:
# df.to_csv('job_data_16_03.csv', index=False)
# Split the postings at 1 March 2023. The boundary date goes to March only:
# the strict `<` keeps the two subsets disjoint, so a posting dated
# 2023-03-01 is not counted in both months.
# NOTE(review): comparing against a date string assumes job_post_date holds
# ISO 'YYYY-MM-DD' strings (or datetimes) — confirm against run_scrapper.
# NOTE(review): february_data has no lower bound, so it holds every posting
# before the cutoff, not only those dated in February.
march_data = df[df['job_post_date']>= '2023-03-01']
february_data = df[df['job_post_date']< '2023-03-01']

In [7]:


def get_top_skills_df(data,column_name, number, indicator):
    """Count the most frequent keyphrases found in one column of ``data``.

    Args:
        data: DataFrame holding the keyphrase column.
        column_name: Name of the column whose cells are iterables of keyphrase
            strings. Cells that are None/NaN are ignored.
        number: How many of the most frequent phrases to keep (handed to
            ``Counter.most_common``).
        indicator: Label written to every row of the ``group`` column.

    Returns:
        DataFrame with the columns ``freq_count``, ``skills`` (lower-cased
        phrase) and ``group``, ordered from most to least frequent. It is
        empty, with the same columns, when there is nothing to count.
    """
    # A missing cell cannot be iterated, so leave it out of the count.
    keyword_cells = data[column_name].dropna()
    # Lower-casing happens before counting, so case variants of a phrase are
    # merged; variants that occur within a single cell each add to the count.
    phrases = [phrase.lower() for cell in keyword_cells for phrase in cell]
    top_phrases = Counter(phrases).most_common(number)
    skills_df = pd.DataFrame(top_phrases, columns=['skills', 'freq_count'])
    skills_df['group'] = indicator
    return skills_df[['freq_count', 'skills', 'group']]

In [10]:
# Ten most frequent keyphrases per period, tagged 'F' and 'M' for later grouping.
feb_skills = get_top_skills_df(
    data=february_data, column_name='key_words', number=10, indicator='F'
)
mar_skills = get_top_skills_df(
    data=march_data, column_name='key_words', number=10, indicator='M'
)

In [25]:
# Remove phrases that are excluded from the March table. This runs after the
# top-10 cut, so fewer than ten rows remain.
march_excluded = ["machine learning", "machine", "ik", "data science"]
is_excluded_march = mar_skills['skills'].isin(march_excluded)
mar_skills = mar_skills[~is_excluded_march]
# mar_skills

In [44]:
# Show the March table after filtering; the index keeps its gaps because the
# rows were removed without resetting it.
mar_skills

Unnamed: 0,freq_count,skills,group
1,39,software engineering,M
2,33,tensorflow,M
3,29,communication skills,M
5,26,cloud,M
6,25,pytorch,M
7,24,kubernetes,M
9,23,python,M


In [26]:
# Remove phrases that are excluded from the February table.
# NOTE(review): this list has no "ik" entry, unlike the March filter — confirm
# whether the two exclusion lists are meant to differ.
february_excluded = ["data science", "machine learning", "machine"]
is_excluded_february = feb_skills['skills'].isin(february_excluded)
feb_skills = feb_skills[~is_excluded_february]
# feb_skills

In [53]:
# Stack the February rows on top of the March rows and renumber the index.
period_tables = [feb_skills, mar_skills]
merged_data = pd.concat(period_tables, ignore_index=True)

In [30]:
# Save the combined per-month skill counts; index=False keeps the row index
# out of the file.
# NOTE(review): merged_data is narrowed to four skills in a later cell, so the
# file content depends on whether this cell runs before or after that one.
merged_data.to_csv("skills_by_month.csv", index=False)


In [54]:
# Keep only the skills that are compared in the chart.
skills_to_plot = ["communication skills","pytorch", "tensorflow", "kubernetes"]
is_plotted_skill = merged_data['skills'].isin(skills_to_plot)
merged_data = merged_data[is_plotted_skill]

In [55]:
# Show the rows that go into the chart (one row per skill and group).
merged_data

Unnamed: 0,freq_count,skills,group
0,30,pytorch,F
1,30,tensorflow,F
4,26,kubernetes,F
6,18,communication skills,F
8,33,tensorflow,M
9,29,communication skills,M
11,25,pytorch,M
12,24,kubernetes,M


In [57]:
# A polar line joins its points in row order, and the two groups list the
# skills in different orders. Sorting by skill within each group makes both
# traces visit the skills in the same sequence, so their shapes are comparable.
polar_data = merged_data.sort_values(['group', 'skills'])
fig = px.line_polar(
    polar_data,
    r='freq_count',
    theta='skills',
    color='group',
    line_close=False,
    title='Skill mentions by posting month (F = February, M = March)',
)

fig.show()

In [None]:
# from collections import Counter
# word_skills = list(map(str.lower, skill_words))
# freq = Counter(word_skills)
# data_for_chart = freq.most_common(10)


: 

In [None]:
# import plotly.express as px

: 

In [None]:
# import plotly.express as px
# # data_dict = dict(data_for_chart)
# # df = pd.DataFrame(dict(
# #     r=data_dict.values(),
# #     theta= data_dict.keys()))
# fig = px.line_polar(df, r='r', theta='theta', line_close=True)
# fig.update_traces(fill='toself')
# fig.show()

: 

In [None]:
# dict(data_for_chart)

: 

In [None]:
# df.to_csv('job_data_15_03.csv', index=False)

: 

: 