In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_country(location):
    return location.split(',')[-1].strip()

def extract_city(location):
    parts = location.split(',')
    return ', '.join(parts[:-1]).strip() if len(parts) > 1 else parts[0].strip()

def get_job_description(job_url):
    response = requests.get(job_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        description_tag = soup.find('div', class_='wpjb-job_content')
        if description_tag:
            return description_tag.text.strip()
    return ""

def scrape_jobs(url):
    jobs_list = []
    while url:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            jobs = soup.find_all('div', class_='wpjb-grid-row')
            for job in jobs:
                title_tag = job.find('a', class_='wpjb-job_title wpjb-title')
                company_tag = job.find('span', class_='wpjb-sub wpjb-company_name')
                location_tag = job.find('span', class_='wpjb-glyphs wpjb-icon-location')
                type_tag = job.find('span', class_='wpjb-job_type wpjb-sub-title')

                if title_tag and company_tag and location_tag and type_tag:
                    title = title_tag.text.strip()
                    company = company_tag.text.strip()
                    locations = location_tag.text.strip().replace('/', ',').split(',')
                    job_type = type_tag.text.strip()
                    job_url = title_tag['href']
                    description = get_job_description(job_url)

                    country = extract_country(location_tag.text.strip())
                    for location in locations:
                        city = location.strip()
                        job_details = {
                            "title": title,
                            "company": company,
                            "city": city,
                            "country": country,
                            "type": job_type,
                            "description": description
                        }
                        jobs_list.append(job_details)

            next_page_tag = soup.find('a', class_='next page-numbers')
            url = next_page_tag['href'] if next_page_tag else None
        else:
            break
    return jobs_list

start_url = 'https://womenindata.co.uk/diverse-data-jobs/'
jobs = scrape_jobs(start_url)

# Save to CSV
df = pd.DataFrame(jobs)
df.to_csv('jobs.csv', index=False)

In [15]:
print(df['description'].loc[1])

Location: London, Manchester, Edinburgh
Closing Date: 30/05/2024
Group: Strategy and Research Group
Management Level: Principal
Job Type: Permanent
 
Job Description:
Please note that this role will close at 00:01 on Thursday 30th May, and therefore we advise getting your application in no later than midnight on Wednesday 29th May. 
 
How to Apply 
When applying, please submit a covering letter of no more than two pages in length outlining your suitability and experience for the role. You are able to upload the covering letter at the same point as your CV.
 
Team overview
Ofcom’s mission is to make communications work for everyone. We regulate broadband and mobile services, postal services, TV and radio, and the airwaves over which wireless devices operate. Under the 2023 Online Safety Act we took on duties as the UK’s online safety regulator.
The Research and Intelligence (R&I) team is responsible for providing a deep understanding of market developments and the outcomes for consumers

In [18]:
df.head(20)

Unnamed: 0,title,company,city,country,type,description
0,Principal - Online Safety Market Intelligence,Ofcom,London,United Kingdom,Full-time,
1,Principal - Online Safety Market Intelligence,Ofcom,Manchester,United Kingdom,Full-time,
2,Principal - Online Safety Market Intelligence,Ofcom,Edinburgh,United Kingdom,Full-time,
3,Principal - Online Safety Market Intelligence,Ofcom,United Kingdom,United Kingdom,Full-time,
4,Data Innovation Analyst,BP,London,United Kingdom,Full-time,
5,Data Innovation Analyst,BP,United Kingdom,United Kingdom,Full-time,
6,Data Scientist II,LexisNexis,London,United Kingdom,Full-time,
7,Data Scientist II,LexisNexis,United Kingdom,United Kingdom,Full-time,
8,Data Enabled Change Advisor,Bae Systems,Frimley,United Kingdom,Full-time,
9,Data Enabled Change Advisor,Bae Systems,United Kingdom,United Kingdom,Full-time,


In [12]:
df = df.drop_duplicates()

In [8]:
df.head(20)

Unnamed: 0,title,company,city,country,type,description
0,Principal - Online Safety Market Intelligence,Ofcom,London,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
1,Principal - Online Safety Market Intelligence,Ofcom,Manchester,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
2,Principal - Online Safety Market Intelligence,Ofcom,Edinburgh,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
3,Principal - Online Safety Market Intelligence,Ofcom,United Kingdom,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
4,Data Innovation Analyst,BP,London,United Kingdom,Full-time,Locations: United Kingdom - London\nTime type:...
5,Data Innovation Analyst,BP,United Kingdom,United Kingdom,Full-time,Locations: United Kingdom - London\nTime type:...
6,Data Scientist II,LexisNexis,London,United Kingdom,Full-time,Would you like to work with data?\nWould you l...
7,Data Scientist II,LexisNexis,United Kingdom,United Kingdom,Full-time,Would you like to work with data?\nWould you l...
8,Data Enabled Change Advisor,Bae Systems,Frimley,United Kingdom,Full-time,Location: Frimley. We offer a range of hybrid ...
9,Data Enabled Change Advisor,Bae Systems,United Kingdom,United Kingdom,Full-time,Location: Frimley. We offer a range of hybrid ...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        125 non-null    object
 1   company      125 non-null    object
 2   city         125 non-null    object
 3   country      125 non-null    object
 4   type         125 non-null    object
 5   description  125 non-null    object
dtypes: object(6)
memory usage: 6.0+ KB


In [7]:
df.describe

<bound method NDFrame.describe of                                              title      company  \
0    Principal - Online Safety Market Intelligence        Ofcom   
1    Principal - Online Safety Market Intelligence        Ofcom   
2    Principal - Online Safety Market Intelligence        Ofcom   
3    Principal - Online Safety Market Intelligence        Ofcom   
4                          Data Innovation Analyst           BP   
..                                             ...          ...   
120     Chief Data Office -Data Governance Analyst    JP Morgan   
121                          Senior Data Scientist      Admiral   
122                          Senior Data Scientist      Admiral   
123                                 Data Architect  Sainsbury's   
124                                 Data Architect  Sainsbury's   

               city         country       type  \
0            London  United Kingdom  Full-time   
1        Manchester  United Kingdom  Full-time   
2         E