In [104]:
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from iteration_utilities import flatten
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy

from scrape_data import run_scrapper


In [63]:
# LinkedIn job-search results URL used as the scraper's entry point; its query
# string asks for "machine learning engineer" postings with location=Germany.
first_page = 'https://www.linkedin.com/jobs/search/?currentJobId=3433766873&f_JT=F&f_T=25206&geoId=101282230&keywords=machine%20learning%20engineer&location=Germany&refresh=true&sortBy=R'
# run_scrapper lives in the local scrape_data module (not shown here); judging by the
# displayed output it returns one DataFrame row per posting — TODO confirm how it paginates.
df = run_scrapper(first_page)
df.head()

Unnamed: 0,job_title,job_post_date,job_page_link,job_description
0,Machine Learning Engineer (m/w/d),2023-02-21,https://de.linkedin.com/jobs/view/machine-lear...,Wir suchen eine/n Machine Learning Engineer/in...
1,AI / Machine Learning Engineer,2023-03-03,https://de.linkedin.com/jobs/view/ai-machine-l...,Key ResponsibilitiesDevelop and implement AI m...
2,Senior Machine Learning Engineer (Berlin),2023-02-12,https://de.linkedin.com/jobs/view/senior-machi...,Hey there! 👋Are you an ambitious Senior Machin...
3,Machine Learning and AI Engineer,2023-03-13,https://de.linkedin.com/jobs/view/machine-lear...,Machine Learning and AI Engineer www.cyberfame...
4,Machine Learning Engineer (m/f/d),2023-03-08,https://de.linkedin.com/jobs/view/machine-lear...,Who We Aredelphai is in one of the most exciti...


In [64]:
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct keyphrases of a text.

    The model and tokenizer are both loaded from the same checkpoint name, and
    the raw token predictions are collapsed into a sorted array of unique,
    whitespace-stripped phrases.
    """

    def __init__(self, model, *args, **kwargs):
        """Build the pipeline from a checkpoint name.

        Args:
            model: Checkpoint identifier passed to ``from_pretrained`` for both
                the token-classification model and its tokenizer.
            *args: Extra positional arguments forwarded to the parent pipeline.
            **kwargs: Extra keyword arguments forwarded to the parent pipeline.
        """
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, model_outputs):
        """Aggregate token predictions into unique keyphrases.

        Args:
            model_outputs: Raw outputs handed over by the pipeline machinery.

        Returns:
            numpy.ndarray: The unique phrases (``np.unique`` also sorts them).
        """
        # The outputs are passed positionally on purpose: the name of the parent's
        # first parameter has differed between transformers releases
        # (``model_outputs`` vs ``all_outputs``), so a keyword would tie this class
        # to one of them.
        results = super().postprocess(
            model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])
# Load pipeline: instantiate the keyphrase extractor from the checkpoint named below.
# The same name is used for both the model and the tokenizer (see the class __init__).
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

In [65]:
# NOTE(review): `data_keyword` is never read in the visible notebook — candidate for removal.
data_keyword = []


def get_keywords(text):
    """Return the keyphrases that the module-level `extractor` finds in `text`."""
    return extractor(text)


# Only the description column is needed, so apply over that Series directly
# instead of materialising every row with DataFrame.apply(axis=1).
df['key_words'] = df['job_description'].apply(get_keywords)
df.head()

Unnamed: 0,job_title,job_post_date,job_page_link,job_description,key_words
0,Machine Learning Engineer (m/w/d),2023-02-21,https://de.linkedin.com/jobs/view/machine-lear...,Wir suchen eine/n Machine Learning Engineer/in...,"[Cloud Computing, GCloud, Lambda, Paramax, PyT..."
1,AI / Machine Learning Engineer,2023-03-03,https://de.linkedin.com/jobs/view/ai-machine-l...,Key ResponsibilitiesDevelop and implement AI m...,"[Keras, Machine Learning, Pytorch, Tensorflow,..."
2,Senior Machine Learning Engineer (Berlin),2023-02-12,https://de.linkedin.com/jobs/view/senior-machi...,Hey there! 👋Are you an ambitious Senior Machin...,"[Enjins, Kafka, Kubernetes, Machine Learning, ..."
3,Machine Learning and AI Engineer,2023-03-13,https://de.linkedin.com/jobs/view/machine-lear...,Machine Learning and AI Engineer www.cyberfame...,"[Data Engineering, Ethereum Network, Machine L..."
4,Machine Learning Engineer (m/f/d),2023-03-08,https://de.linkedin.com/jobs/view/machine-lear...,Who We Aredelphai is in one of the most exciti...,"[Ops, communication skills, company data integ..."


In [66]:
# Sanity check: column dtypes and non-null counts after adding `key_words`.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        500 non-null    object
 1   job_post_date    500 non-null    object
 2   job_page_link    500 non-null    object
 3   job_description  500 non-null    object
 4   key_words        500 non-null    object
dtypes: object(5)
memory usage: 23.4+ KB


In [67]:
# Number of non-null values per column for each posting date, i.e. postings per day.
df.groupby('job_post_date').count()

Unnamed: 0_level_0,job_title,job_page_link,job_description,key_words
job_post_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-09-13,1,1,1,1
2023-01-15,17,17,17,17
2023-01-19,2,2,2,2
2023-01-24,1,1,1,1
2023-02-01,1,1,1,1
2023-02-04,8,8,8,8
2023-02-05,19,19,19,19
2023-02-09,17,17,17,17
2023-02-12,34,34,34,34
2023-02-15,1,1,1,1


In [69]:
# Bucket postings by date: 'M' for 2023-03-01 onwards, 'O' for everything older.
# The dates are compared as strings, which orders correctly only because they are
# zero-padded ISO dates (YYYY-MM-DD), as shown in the outputs above.
# Vectorised comparison replaces the per-element lambda; null dates fall into 'O'.
df['group_date'] = np.where(df['job_post_date'] >= '2023-03-01', 'M', 'O')
df.head()

Unnamed: 0,job_title,job_post_date,job_page_link,job_description,key_words,group_date
0,Machine Learning Engineer (m/w/d),2023-02-21,https://de.linkedin.com/jobs/view/machine-lear...,Wir suchen eine/n Machine Learning Engineer/in...,"[Cloud Computing, GCloud, Lambda, Paramax, PyT...",O
1,AI / Machine Learning Engineer,2023-03-03,https://de.linkedin.com/jobs/view/ai-machine-l...,Key ResponsibilitiesDevelop and implement AI m...,"[Keras, Machine Learning, Pytorch, Tensorflow,...",M
2,Senior Machine Learning Engineer (Berlin),2023-02-12,https://de.linkedin.com/jobs/view/senior-machi...,Hey there! 👋Are you an ambitious Senior Machin...,"[Enjins, Kafka, Kubernetes, Machine Learning, ...",O
3,Machine Learning and AI Engineer,2023-03-13,https://de.linkedin.com/jobs/view/machine-lear...,Machine Learning and AI Engineer www.cyberfame...,"[Data Engineering, Ethereum Network, Machine L...",M
4,Machine Learning Engineer (m/f/d),2023-03-08,https://de.linkedin.com/jobs/view/machine-lear...,Who We Aredelphai is in one of the most exciti...,"[Ops, communication skills, company data integ...",M


In [71]:
# Split the postings on the `group_date` flag. Despite its name, `february_data`
# receives every row that is not flagged 'M', i.e. all postings before 2023-03-01.
is_march = df['group_date'] == 'M'
march_data = df[is_march]
february_data = df[~is_march]

In [135]:
def get_skills_count(data, column_name, number, indicator,
                     excluded=('machine learning', 'machine', 'data science')):
    """Count how often each skill phrase occurs in a column of phrase collections.

    Args:
        data: DataFrame whose `column_name` column holds an iterable of skill
            strings per row.
        column_name: Name of the column to read.
        number: How many of the most frequent skills to return in the first result.
        indicator: Label written to the `group` column of the returned frame.
        excluded: Lower-cased phrases dropped before ranking. The default is the
            set of generic terms that used to be hard-coded here.

    Returns:
        tuple: ``(values, table)`` where ``values`` is a tuple of the `number`
        most frequent skills (empty when no skills are left) and ``table`` is a
        DataFrame with columns ``freq_count``, ``skills`` and ``group`` holding
        every remaining skill, in order of first appearance.
    """
    # Flatten one level and normalise case so "PyTorch" and "pytorch" count together.
    lowered = [skill.lower() for skills in data[column_name] for skill in skills]
    freq = Counter(lowered)
    for term in excluded:
        freq.pop(term, None)

    # Building the tuple explicitly (instead of unpacking zip(*...)) keeps an
    # empty input from raising ValueError.
    values = tuple(skill for skill, _ in freq.most_common(number))

    skill_table = pd.DataFrame(
        {'freq_count': list(freq.values()), 'skills': list(freq.keys())}
    ).assign(group=indicator)
    return values, skill_table

# Per-group results: the top-5 skill tuple and the full frequency table.
# NOTE(review): the group labels passed here are lowercase ('o'/'m'), whereas the
# `group_date` column on df uses uppercase 'O'/'M'.
f_values, skill_score_f = get_skills_count(february_data,'key_words', 5,'o')
m_values, skill_score_m = get_skills_count(march_data,'key_words', 5, 'm')

In [136]:
# Stack both groups' frequency tables into one long frame; ignore_index renumbers the rows.
merged_data = pd.concat([skill_score_f, skill_score_m], ignore_index= True)
merged_data.head()

Unnamed: 0,freq_count,skills,group
0,15,cloud computing,o
1,15,gcloud,o
2,15,lambda,o
3,15,paramax,o
4,46,pytorch,o


In [152]:
# Five most frequent skills within the 'm' group, as a plain list.
(
    merged_data
    .loc[merged_data['group'] == 'm']
    .nlargest(5, 'freq_count')['skills']
    .tolist()
)

['tensorflow',
 'pytorch',
 'software engineering',
 'deep learning',
 'communication skills']

In [133]:
# Top skills returned for the 'm' group by get_skills_count, shown for comparison
# with the nlargest query on merged_data.
m_values

('tensorflow',
 'pytorch',
 'software engineering',
 'deep learning',
 'communication skills')

In [137]:
# Persist the long-format skill counts; index=False leaves the row index out of the file.
merged_data.to_csv("skills_by_month.csv", index=False)

In [138]:
# Keep only the skills that rank in the top list of at least one group.
top_skills = set(f_values).union(m_values)
focus_skills = merged_data[merged_data['skills'].isin(top_skills)]

In [143]:
# Wide table for the radar chart: one row per skill, one column per group.
# It is rebuilt from merged_data rather than from focus_skills itself, so the
# cell can be re-run safely (pivoting an already-pivoted frame raises KeyError).
# fill_value=0: a skill that never occurs in a group has a count of zero, not NaN.
focus_skills = (
    merged_data[merged_data['skills'].isin(set(f_values + m_values))]
    .pivot_table(values='freq_count', index='skills', columns='group', fill_value=0)
    .reset_index()
)

In [140]:
# fig = px.line_polar(focus_skills, r = 'freq_count', color = 'group',theta = 'skills', line_close = True)
# fig.update_traces(fill = 'toself')

# fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [147]:
# Radar chart comparing how often the focus skills are mentioned in each group.
# `go` (plotly.graph_objects) is imported in the notebook's top import cell.

# NOTE(review): `categories` is not referenced by the chart below — candidate for removal.
categories = ["communication skills","pytorch", "tensorflow", "kubernetes"]

skill_axis = focus_skills['skills'].tolist()

# The 'o' column holds every posting dated before 2023-03-01 (not only February)
# and 'm' holds postings from that date on, so the trace names say exactly that.
fig = go.Figure()
for group_column, trace_label in (('o', 'Before March 2023'), ('m', 'March 2023 onwards')):
    fig.add_trace(go.Scatterpolar(
        r=focus_skills[group_column].tolist(),
        theta=skill_axis,
        fill='toself',
        name=trace_label,
    ))

fig.update_layout(
    title='Most frequent skills in job postings by posting period',
    polar=dict(radialaxis=dict(visible=True)),
    # The legend is the only way to tell the two overlapping traces apart.
    showlegend=True,
)

fig.show()


In [None]:
def get_top_skills(data, column_name, number, indicator):
    """Return the `number` most frequent skills and their counts.

    Args:
        data: DataFrame whose `column_name` column holds an iterable of skill
            strings per row.
        column_name: Name of the column to read.
        number: How many of the most frequent skills to keep.
        indicator: Accepted for signature parity with get_skills_count; not used.

    Returns:
        dict: Lower-cased skill -> occurrence count, most frequent first.
    """
    # Flatten one level and normalise case before counting.
    lowered = [skill.lower() for skills in data[column_name] for skill in skills]
    # The original ended in a bare `return`, discarding this mapping.
    return dict(Counter(lowered).most_common(number))

: 

In [None]:
# import plotly.express as px
# # data_dict = dict(data_for_chart)
# # df = pd.DataFrame(dict(
# #     r=data_dict.values(),
# #     theta= data_dict.keys()))
# fig = px.line_polar(df, r='r', theta='theta', line_close=True)
# fig.update_traces(fill='toself')
# fig.show()

: 

In [None]:
# dict(data_for_chart)

: 

In [None]:
# df.to_csv('job_data_15_03.csv', index=False)

: 

: 