# **Web Scraping and Analysis of Job Postings**

**Loading trhe html page**

In [2]:
from bs4 import BeautifulSoup
import pandas as pd

# Load the HTML file
with open('job_page.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')
print("Loaded successfully.")

Loaded successfully.


**Extracting the data**

In [30]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Load HTML file
html_path = "job_page.html"
with open(html_path, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Find job cards
job_cards = soup.find_all('div', class_='job_seen_beacon')
print("✅ Job cards found:", len(job_cards))

# Extract fields
refined_jobs = []

for card in job_cards:
    text_blocks = list(card.stripped_strings)

    job_title = text_blocks[0] if len(text_blocks) > 0 else None
    company = text_blocks[1] if len(text_blocks) > 1 else None
    location = text_blocks[2] if len(text_blocks) > 2 else None

    posted = None
    salary = None

    for text in text_blocks:
        if re.search(r'\brs\b', text.lower()):
            salary = text
        elif re.search(r'(just posted|\d+\+?\s?(day|hour|minute)s?)', text.lower()):
            posted = text

    refined_jobs.append({
        'Job Title': job_title,
        'Company': company,
        'Location': location,
        'Posted': posted,
        'Salary': salary
    })

# Create DataFrame
refined_df = pd.DataFrame(refined_jobs)
print("📦 Extracted shape:", refined_df.shape)
refined_df.head()


✅ Job cards found: 15
📦 Extracted shape: (15, 5)


Unnamed: 0,Job Title,Company,Location,Posted,Salary
0,Data Analyst,HR Ways,Lahore,Visited 2 minutes ago,"Rs 170,000 - Rs 230,000 a month"
1,Junior Data Analyst,King Revolution Inc,Lahore Ismail Nagar,,"Rs 50,000 - Rs 60,000 a month"
2,Junior Data Analyst,Raaziq International PVT LTD,Lahore,,"Rs 50,000 - Rs 55,000 a month"
3,DATA ANALYST,Dignosco,Lahore,,
4,Data Analyst,Mashkraft,Lahore,,"From Rs 35,000 a month"


**Analyzing the html file**

In [25]:
import matplotlib.pyplot as plt
import pandas as pd
import re

# Top job titles
print("🔹 Top Job Titles:")
print(refined_df['Job Title'].value_counts().head(10), '\n')

# Top companies
print("🔹 Top Hiring Companies:")
print(refined_df['Company'].value_counts().head(10), '\n')

# Top locations
print("🔹 Top Locations:")
print(refined_df['Location'].value_counts().head(10), '\n')

# Clean salary column
salary_data = refined_df[['Salary']].dropna()
salary_data = salary_data[salary_data['Salary'].str.contains('Rs')]

def extract_salary_range(s):
    match = re.findall(r'Rs\s*([\d,]+)', s)
    if match:
        nums = [int(x.replace(',', '')) for x in match]
        return nums[0], nums[1] if len(nums) > 1 else nums[0]
    return None, None

# Create new columns
salary_data[['Min Salary', 'Max Salary']] = salary_data['Salary'].apply(lambda x: pd.Series(extract_salary_range(x)))
salary_data['Average Salary'] = salary_data[['Min Salary', 'Max Salary']].mean(axis=1)

# View sample salaries
print("🔹 Sample Average Salaries:")
print(salary_data[['Salary', 'Min Salary', 'Max Salary', 'Average Salary']].head())


🔹 Top Job Titles:
Job Title
Data Analyst                                  3
Junior Data Analyst                           2
DATA ANALYST                                  1
Marketing Data Analyst ( Fresh Graduates )    1
Financial Analyst - Remote                    1
Junior Marketing Data Analyst & Hygienist     1
BI Quantitative Analyst                       1
Book Keeping/ Data Entry Operator             1
Junior Financial Analyst                      1
Product Experience Analyst - Lahore           1
Name: count, dtype: int64 

🔹 Top Hiring Companies:
Company
Contour Software                3
HR Ways                         2
Raaziq International PVT LTD    1
King Revolution Inc             1
Dignosco                        1
Mashkraft                       1
Fulcrum Pvt Ltd                 1
CureMD Healthcare               1
Caramel Tech Studios            1
One Machine Software            1
Name: count, dtype: int64 

🔹 Top Locations:
Location
Lahore                 14
Lahore Ismai