In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_country(location):
    """Return the trimmed text that follows the final comma in ``location``.

    The country is assumed to be the last comma-separated component. When the
    string contains no comma at all, the whole string (trimmed) is returned.
    """
    _, _, tail = location.rpartition(',')
    return tail.strip()

def extract_city(location):
    """Return the city portion of a comma-separated location string.

    The last comma-separated component is assumed to be the country, so the
    city is everything before it. A string without any comma is treated as
    being the city itself.

    Args:
        location: Location text such as ``"London, United Kingdom"``.

    Returns:
        The leading components joined with ``", "``, or the whole trimmed
        string when there is no comma.
    """
    parts = [part.strip() for part in location.split(',')]
    if len(parts) == 1:
        return parts[0]
    # Strip each component before joining so the result never carries the
    # source's own spacing (previously "A, B, C" produced "A,  B"), and skip
    # empty components left behind by doubled or leading commas.
    return ', '.join(part for part in parts[:-1] if part)

def get_job_description(job_url, timeout=10):
    """Fetch a job posting page and return the text of its description block.

    Args:
        job_url: URL of the individual job posting.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        The stripped text of the first ``div`` with class ``wpjb-text``, or an
        empty string when the request fails, the response status is not 200,
        or the page has no such element.
    """
    try:
        response = requests.get(job_url, timeout=timeout)
    except requests.RequestException:
        # Best effort: a single unreachable posting is treated the same way as
        # a non-200 response instead of aborting the caller.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    description_tag = soup.find('div', class_='wpjb-text')
    return description_tag.text.strip() if description_tag else ""

def scrape_jobs(url, timeout=10):
    """Scrape job listings starting at ``url`` and following "next" links.

    Every ``div.wpjb-grid-row`` that has a title, company, location and job
    type contributes rows to the result. The location text is split on ``,``
    (after turning ``/`` into ``,``) and one row is emitted per fragment,
    including the trailing fragment that is also used as the country.

    Args:
        url: URL of the first listing page.
        timeout: Seconds to wait for each listing-page request.

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``city``,
        ``country``, ``type`` and ``description``. Scraping stops, returning
        what has been collected so far, when a listing page cannot be fetched,
        answers with a non-200 status, has no "next" link, or links back to a
        page that was already visited.
    """
    jobs_list = []
    visited = set()
    # Track visited pages so a "next" link that points backwards cannot make
    # the loop run forever.
    while url and url not in visited:
        visited.add(url)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Keep the rows gathered so far instead of losing them to a
            # transient network error.
            break
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        for job in soup.find_all('div', class_='wpjb-grid-row'):
            title_tag = job.find('a', class_='wpjb-job_title wpjb-title')
            company_tag = job.find('span', class_='wpjb-sub wpjb-company_name')
            location_tag = job.find('span', class_='wpjb-glyphs wpjb-icon-location')
            type_tag = job.find('span', class_='wpjb-job_type wpjb-sub-title')

            # Rows missing any of the four fields are skipped.
            if not (title_tag and company_tag and location_tag and type_tag):
                continue

            title = title_tag.text.strip()
            company = company_tag.text.strip()
            job_type = type_tag.text.strip()

            # Normalise "/" separators once and use the same text for both the
            # country and the city fragments, so "London / United Kingdom"
            # yields the country "United Kingdom" rather than the whole string.
            normalized_location = location_tag.text.strip().replace('/', ',')
            country = extract_country(normalized_location)

            # A title anchor without an href simply gets no description.
            job_url = title_tag.get('href')
            description = get_job_description(job_url) if job_url else ""

            for location in normalized_location.split(','):
                jobs_list.append({
                    "title": title,
                    "company": company,
                    "city": extract_city(location.strip()),
                    "country": country,
                    "type": job_type,
                    "description": description
                })

        next_page_tag = soup.find('a', class_='next page-numbers')
        url = next_page_tag.get('href') if next_page_tag else None
    return jobs_list

# First page of the job board listing; scrape_jobs follows pagination from here.
start_url = 'https://womenindata.co.uk/diverse-data-jobs/'
jobs = scrape_jobs(start_url)

# Build a DataFrame with one row per scraped record (writing to CSV happens in a later cell).
df = pd.DataFrame(jobs)



In [9]:
# Preview the first 20 scraped rows.
df.head(20)

Unnamed: 0,title,company,city,country,type,description
0,Principal - Online Safety Market Intelligence,Ofcom,London,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
1,Principal - Online Safety Market Intelligence,Ofcom,Manchester,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
2,Principal - Online Safety Market Intelligence,Ofcom,Edinburgh,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
3,Principal - Online Safety Market Intelligence,Ofcom,United Kingdom,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi..."
4,Data Innovation Analyst,BP,London,United Kingdom,Full-time,Locations: United Kingdom - London\nTime type:...
5,Data Innovation Analyst,BP,United Kingdom,United Kingdom,Full-time,Locations: United Kingdom - London\nTime type:...
6,Data Scientist II,LexisNexis,London,United Kingdom,Full-time,Would you like to work with data?\nWould you l...
7,Data Scientist II,LexisNexis,United Kingdom,United Kingdom,Full-time,Would you like to work with data?\nWould you l...
8,Data Enabled Change Advisor,Bae Systems,Frimley,United Kingdom,Full-time,Location: Frimley. We offer a range of hybrid ...
9,Data Enabled Change Advisor,Bae Systems,United Kingdom,United Kingdom,Full-time,Location: Frimley. We offer a range of hybrid ...


In [44]:
#ex_df= pd.read_csv('jobs.csv')

In [10]:
# Keep only rows whose title occurs again further down the frame, i.e. drop the
# final row of every title group. A title that appears on a single row is
# removed entirely. NOTE: re-running this cell trims one more row per title.
df = df.loc[df.duplicated(subset='title', keep='last')]

In [12]:
# Write the current frame to jobs.csv; index=False leaves the row index out of the file.
df.to_csv('jobs.csv', index=False)

In [13]:
# Skill phrases searched for (case-insensitively, as substrings) in each job
# description by find_keywords.
keywords = [
    'Data Analysis',
    'Data Visualization',
    'SQL',
    'Python',
    'Machine Learning',
    'Statistical Analysis',
    'Data Mining',
    'Big Data',
    'R Programming',
    'Data Warehousing',
    'ETL (Extract, Transform, Load)',
    'Data Cleaning',
    'Predictive Analytics',
    'Business Intelligence',
    'Data Governance',
    'Data Modeling',
    'Hadoop',
    'Tableau',
    'Power BI',
    'Data Architecture',
]

In [14]:
def find_keywords(description, keyword_list=None):
    """Return the keywords that occur in ``description`` as a comma-separated string.

    Matching is a case-insensitive substring test, and the result keeps the
    order of the keyword list.

    Args:
        description: Job description text. Non-string values (for example the
            NaN pandas uses for a missing cell, or ``None``) are treated as
            having no keywords instead of raising ``AttributeError``.
        keyword_list: Optional iterable of keywords to look for. Defaults to
            the module-level ``keywords`` list.

    Returns:
        The matching keywords joined with ``", "``; an empty string when
        nothing matches.
    """
    if not isinstance(description, str):
        return ''
    if keyword_list is None:
        keyword_list = keywords
    # Lower-case the description once rather than once per keyword.
    haystack = description.lower()
    return ', '.join(keyword for keyword in keyword_list if keyword.lower() in haystack)

# Attach the matched keywords as a new column. assign() builds a new frame
# rather than writing into a frame that came from a filtered selection, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(found_keywords=df['description'].apply(find_keywords))

In [15]:
# Preview the first 20 rows, now including the found_keywords column.
df.head(20)

Unnamed: 0,title,company,city,country,type,description,found_keywords
0,Principal - Online Safety Market Intelligence,Ofcom,London,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...",Data Analysis
1,Principal - Online Safety Market Intelligence,Ofcom,Manchester,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...",Data Analysis
2,Principal - Online Safety Market Intelligence,Ofcom,Edinburgh,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...",Data Analysis
4,Data Innovation Analyst,BP,London,United Kingdom,Full-time,Locations: United Kingdom - London\nTime type:...,"SQL, Machine Learning, Data Governance, Power BI"
6,Data Scientist II,LexisNexis,London,United Kingdom,Full-time,Would you like to work with data?\nWould you l...,"Data Analysis, SQL, Python"
8,Data Enabled Change Advisor,Bae Systems,Frimley,United Kingdom,Full-time,Location: Frimley. We offer a range of hybrid ...,Machine Learning
10,Market Intelligence Analyst,Ofcom,London,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...","Data Analysis, Python, Power BI"
11,Market Intelligence Analyst,Ofcom,Manchester,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...","Data Analysis, Python, Power BI"
12,Market Intelligence Analyst,Ofcom,Edinburgh,United Kingdom,Full-time,"Location: London, Manchester, Edinburgh\nClosi...","Data Analysis, Python, Power BI"
14,Research Analyst,Ofcom,London,United Kingdom,Full-time,"Location: London, Manchester, Cardiff\nClosing...",
