In [1]:
import random
from urllib.parse import quote

import pandas as pd
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup


# Web Scraper to collect data about tech job postings on LinkedIn

## 1. Use BeautifulSoup to fetch HTML components from LinkedIn

In [2]:
# Guest search endpoint queried for "software engineer" postings in the United Kingdom,
# starting at result offset 0.
scrapping_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=software%2Bengineer&location=United%2BKingdom&geoId=101165590&trk=public_jobs_jobs-search-bar_search-submit&start=0"

# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# stops the cell on an HTTP error instead of silently parsing an error page.
response = requests.get(scrapping_url, timeout=10)
response.raise_for_status()

# Every <li> in the returned HTML is treated as one candidate job card.
soup = BeautifulSoup(response.text, 'html.parser')
job_listing = soup.find_all('li')


In [3]:
# Collect the job id of every card found in the listing.
id_list = []
for job in job_listing:
    base_card_div = job.find('div', {"class": "base-card"})
    if base_card_div is None:
        # This <li> does not contain a job card; nothing to extract.
        continue

    entity_urn = base_card_div.get("data-entity-urn")
    urn_parts = entity_urn.split(":") if entity_urn else []
    if len(urn_parts) < 4:
        # Attribute missing or not in the expected colon-separated form.
        continue

    # The id is the fourth colon-separated field of the URN.
    job_id = urn_parts[3]
    print(job_id)
    id_list.append(job_id)
 

4120822691
4137036696
4117827042
4138979847
4120823541
4122419074
4120142314
4133976072
4136862223
4138308800


In [4]:
# Sanity check: how many job ids were collected from the listing page.
print(len(id_list))

10


In [5]:
def _field_text(soup, tag, css_class, separator=""):
    """Return the stripped text of the first `tag` whose class attribute is `css_class`.

    Returns None when no such element exists, so one missing field does not
    abort the whole scrape.
    """
    node = soup.find(tag, {"class": css_class})
    if node is None:
        return None
    return node.get_text(separator=separator).strip()


# One dict per job posting: company_name, location, post_date, job_description.
job_list = []

for job_id in id_list:
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    # Timeout so a stalled request cannot hang the loop.
    job_response = requests.get(job_url, timeout=10)
    if not job_response.ok:
        # Skip postings that could not be fetched rather than parsing an error page.
        print(f"Skipping job {job_id}: HTTP {job_response.status_code}")
        continue

    job_soup = BeautifulSoup(job_response.text, "html.parser")
    job_post = {
        "company_name": _field_text(
            job_soup, "a", "topcard__org-name-link topcard__flavor--black-link"
        ),
        "location": _field_text(
            job_soup, "span", "topcard__flavor topcard__flavor--bullet"
        ),
        "post_date": _field_text(
            job_soup, "span", "posted-time-ago__text topcard__flavor--metadata"
        ),
        # A space separator keeps text from adjacent child elements from running together.
        "job_description": _field_text(
            job_soup,
            "div",
            "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden",
            separator=' ',
        ),
    }

    job_list.append(job_post)


In [7]:
# Tabulate the scraped postings (one row per job) and preview the first three.
df = pd.DataFrame(data=job_list)
df.head(n=3)

Unnamed: 0,company_name,location,post_date,job_description
0,Meta,"London, England, United Kingdom",1 month ago,Want to build new features and improve existin...
1,Sinara,"London, England, United Kingdom",1 week ago,Job Description Sinara is currently looking fo...
2,Two Sigma,"London, England, United Kingdom",3 weeks ago,Position Summary Two Sigma is a financial scie...


## 2. Mapping the number of job postings by role, language and technology

In [12]:
# Search keywords for the job roles whose posting counts are compared.
divisions = [
    "cyber security",
    "cloud",
    "data analyst",
    "full stack",
    "frontend",
    "backend",
    "QA/Test",
    "DevOps",
    "Web",
    "Mobile",
    "System Administrator",
    "Quant",
    "AI",
]
def numbers_scrapper(context):
    """Return the job count shown on the LinkedIn UK search page for `context`.

    Args:
        context: Search keyword(s), e.g. "data analyst" or "C#".

    Returns:
        The count as displayed on the page with any '+' removed. Thousands
        separators (",") are left in place, so callers strip them before
        converting to int.

    Raises:
        requests.HTTPError: If the search page responds with an error status.
        ValueError: If the page contains no job-count element.
    """
    # Percent-encode every reserved character, not only spaces: a raw '#'
    # (as in "C#") would start a URL fragment and cut off the rest of the
    # query, and a raw '+' (as in "C++") would be read as a space.
    keywords = quote(context, safe="")
    jobs_url = f"https://www.linkedin.com/jobs/search?keywords={keywords}&location=United%20Kingdom&geoId=101165590&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

    # Timeout so a stalled request cannot hang the notebook; fail loudly on HTTP errors.
    response = requests.get(jobs_url, timeout=10)
    response.raise_for_status()

    job_soup = BeautifulSoup(response.text, 'html.parser')
    count_span = job_soup.find('span', {'class': "results-context-header__job-count"})
    if count_span is None:
        raise ValueError(f"No job-count element found in search results for {context!r}")

    job_number = count_span.text.strip()
    return job_number.replace('+', "")


In [22]:
# Posting count per role, keyed by the role keyword and stored as an int.
role_number = {}
for div in divisions:
    count_text = numbers_scrapper(div)
    # Drop thousands separators before converting to an integer.
    role_number[div] = int(count_text.replace(",", ""))

In [23]:
# Rank the roles by posting count, smallest first, then split into parallel tuples.
sorted_items = sorted(role_number.items(), key=lambda pair: pair[1])
labels, values = zip(*sorted_items)

# Horizontal bars: role names on the Y-axis, counts as bar lengths along X.
fig = go.Figure(
    go.Bar(
        x=values,
        y=labels,
        orientation='h',
        marker={"color": "steelblue"},
    )
)

# Transparent backdrop, no grid lines, and 10% headroom beyond the longest bar.
fig.update_layout(
    title="List of jobs available for such role",
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis={"showgrid": False, "range": [0, max(values) * 1.1]},
    yaxis={"showgrid": False},
)

fig.show()

In [13]:
# Search keywords for the programming languages whose posting counts are compared.
languages = [
    "JavaScript",
    "TypeScript",
    "HTML/CSS",
    "SQL",
    "Python",
    "Java",
    "C++",
    "C#",
    "Ruby",
    "Kotlin",
    "Go",
    "Objective-C",
    "Flutter",
    "Swift",
    "Scala",
    "PHP",
    "R",
]
# Posting count per language, keyed by the language keyword and stored as an int.
lang_number = {}
for lang in languages:
    count_text = numbers_scrapper(lang)
    # Drop thousands separators before converting to an integer.
    lang_number[lang] = int(count_text.replace(",", ""))

In [20]:
# Arrange the languages from the fewest postings to the most.
sorted_lang = sorted(lang_number.items(), key=lambda entry: entry[1])
labels, values = zip(*sorted_lang)

# Horizontal bars: language names on the Y-axis, counts as bar lengths along X.
fig = go.Figure(
    go.Bar(
        x=values,
        y=labels,
        orientation='h',
        marker={"color": "steelblue"},
    )
)

# Transparent backdrop, no grid lines, and 10% headroom beyond the longest bar.
fig.update_layout(
    title="List of jobs related to the Language",
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis={"showgrid": False, "range": [0, max(values) * 1.1]},
    yaxis={"showgrid": False},
)

fig.show()

In [17]:
# Search keywords for the frameworks and technologies whose posting counts are compared.
techs = [
    "React",
    "Next.js",
    "LLM",
    "AI",
    "CMS",
    "Wordpress",
    "AWS",
    "Node.js",
    "Angular",
    "Redux",
    "Django",
    "Flask",
    ".NET",
    "Spring",
    "Vue",
    "Express.js",
]
# Posting count per technology, keyed by the technology keyword and stored as an int.
tech_number = {}
for tech in techs:
    count_text = numbers_scrapper(tech)
    # Drop thousands separators before converting to an integer.
    tech_number[tech] = int(count_text.replace(",", ""))

In [19]:
# Order the technologies by posting count, lowest first.
sorted_tech = sorted(tech_number.items(), key=lambda kv: kv[1])
labels, values = zip(*sorted_tech)

# Horizontal bars: technology names on the Y-axis, counts as bar lengths along X.
fig = go.Figure(
    go.Bar(
        x=values,
        y=labels,
        orientation='h',
        marker={"color": "steelblue"},
    )
)

# Transparent backdrop, no grid lines, and 10% headroom beyond the longest bar.
fig.update_layout(
    title="List of jobs related to the technology",
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis={"showgrid": False, "range": [0, max(values) * 1.1]},
    yaxis={"showgrid": False},
)

fig.show()