In [None]:
import random
import time
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
# What to search for, how many result pages to walk, and where the CSV goes.
search_term, pages_to_scrape = "Data Analyst", 5
output_file = 'wuzzuf_data_analyst_jobs_v4.csv'

# Request headers sent with every page fetch: a desktop Chrome User-Agent
# plus an English Accept-Language preference.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'),
    ('Accept-Language', 'en-US,en;q=0.9'),
])

# Accumulator for the scraped rows (one dict per job posting).
data_list = list()
print(f"üöÄ Starting SMART Scraper V4 for '{search_term}'...")

def _parse_job_card(h2):
    """Build one job record from a search-result ``<h2>`` heading.

    The heading's first link supplies the title and URL; the remaining fields
    are read heuristically from the ``<div>`` two levels above the heading.

    Args:
        h2: a BeautifulSoup tag for an ``<h2>`` element on the results page.

    Returns:
        A dict with the keys 'Job Title', 'Company Name', 'Location',
        'Skills/Description' and 'Job Link', or ``None`` when the heading has
        no usable link or no enclosing card.
    """
    # 1. Title & Link
    link_tag = h2.find('a')
    if link_tag is None:
        return None
    href = link_tag.get('href')
    if not href:
        return None
    title = link_tag.get_text(strip=True)
    # urljoin leaves absolute URLs untouched and resolves relative and
    # protocol-relative ones against the site root.
    link = urljoin("https://wuzzuf.net", href)

    # 2. Container (the card): h2 -> div -> div
    card = h2.find_parent('div')
    if card is not None:
        card = card.find_parent('div')
    if card is None:
        return None

    # 3. Company: taken from the 2nd link in the card (the 1st is the title).
    #    Only the separator hyphen at the edges is stripped, so hyphens that
    #    belong to the company name itself survive.
    all_links = card.find_all('a')
    company = "Confidential"
    if len(all_links) > 1:
        company = all_links[1].get_text(strip=True).strip(' -') or "Confidential"

    # 4. Location: first <span> in the card
    spans = card.find_all('span')
    location = spans[0].get_text(strip=True) if spans else "Unknown"

    # 5. Skills: rough heuristic - the last four text fragments of the card,
    #    with the "Apply" button label removed.
    fragments = card.get_text(separator='|', strip=True).split('|')
    skills = ", ".join(fragments[-4:]).replace('Apply', '').strip()

    return {
        'Job Title': title,
        'Company Name': company,
        'Location': location,
        'Skills/Description': skills,
        'Job Link': link,
    }


request_timeout = 15  # seconds; without a timeout requests.get can block forever
encoded_query = quote(search_term, safe='')  # loop-invariant, escapes every reserved character

for page in range(pages_to_scrape):
    url = f"https://wuzzuf.net/search/jobs/?a=hpb&q={encoded_query}&start={page}"

    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        # Surface 4xx/5xx responses as errors instead of parsing an error page as "0 jobs".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Every job title on the results page is rendered inside an <h2>.
        titles = soup.find_all('h2')

        print(f"üìÑ Page {page}: Processing {len(titles)} jobs...")

        skipped = 0
        last_error = None
        for h2 in titles:
            try:
                record = _parse_job_card(h2)
            except Exception as e:
                # Best effort: one malformed card must not abort the page,
                # but it is counted and reported below rather than hidden.
                skipped += 1
                last_error = e
                continue
            if record is not None:
                data_list.append(record)

        if skipped:
            print(f"   Skipped {skipped} card(s) on page {page} (last error: {last_error!r})")

    except Exception as e:
        print(f"‚ùå Error on page {page}: {e}")

    # Be nice to the server - pause after failed pages too, so retries are not fired back-to-back.
    time.sleep(1)

# --- SAVE & CHECK ---
# Passing the column list explicitly keeps the frame (and the CSV header)
# well-formed even when nothing was scraped; otherwise an empty data_list
# yields a column-less frame and the sample print below raises KeyError.
csv_columns = ['Job Title', 'Company Name', 'Location', 'Skills/Description', 'Job Link']
df = pd.DataFrame(data_list, columns=csv_columns)
print("-" * 30)
print(f"‚úÖ DONE! Total Jobs: {len(df)}")
# Show a sample to check if columns are filled
print(df[['Job Title', 'Company Name', 'Location']].head())

df.to_csv(output_file, index=False)

In [None]:
import pandas as pd
import plotly.express as px
from collections import Counter

# 1. Load the Data
try:
    df = pd.read_csv('wuzzuf_data_analyst_jobs_v4.csv')
except FileNotFoundError:
    print("‚ùå Error: File not found. Please upload 'wuzzuf_data_analyst_jobs_v4.csv'.")
    # Re-raise so the cell stops here. Continuing would either hit a NameError
    # on `df` or silently analyse a stale `df` left in the kernel.
    raise
else:
    print(f"‚úÖ Data Loaded Successfully: {len(df)} jobs.")

# --- DATA PROCESSING ---

# A. Experience Level (Derived from Job Title)
def get_experience(title):
    """Classify a job title into a seniority bucket by keyword lookup.

    The title is stringified and lower-cased, then checked against the
    keyword groups in priority order; the first group with a substring hit
    wins. Titles matching nothing are labelled 'Mid-Level'.
    """
    lowered = str(title).lower()
    # Order matters: 'senior' outranks the junior and manager keywords.
    rules = (
        ('Senior', ('senior',)),
        ('Junior/Entry', ('junior', 'entry')),
        ('Manager/Lead', ('manager', 'lead', 'head')),
    )
    for label, needles in rules:
        if any(needle in lowered for needle in needles):
            return label
    return 'Mid-Level'

# Tag every posting with the seniority bucket derived from its title.
df['Experience_Level'] = [get_experience(job_title) for job_title in df['Job Title']]

# B. Location Cleaning
# Keep only the text before the first comma (e.g., "New Cairo, Cairo" -> "New Cairo")
df['Clean_Location'] = [
    raw_location.split(',')[0].strip()
    for raw_location in df['Location'].astype(str)
]

# C. Skills Extraction
# Count, per job, which of these keywords appear in the 'Skills/Description' column.
target_skills = ['Python', 'SQL', 'Excel', 'Power BI', 'Tableau',
                 'Machine Learning', 'R', 'Big Data', 'Spark', 'AWS', 'Azure',
                 'Data Modeling', 'Statistics', 'Visualization']


def compile_skill_patterns(skills):
    """Map each skill name to a case-insensitive whole-word regex.

    A plain substring test would count 'R' for any text containing the letter
    "r" and 'Excel' for "excellent". The lookarounds require that the match is
    not glued to another word character on either side.

    Args:
        skills: iterable of skill names, matched literally.

    Returns:
        dict of skill name -> compiled pattern, in the order given.
    """
    import re  # function-scope import: `re` is not in this cell's import list

    return {
        skill: re.compile(rf'(?<!\w){re.escape(skill)}(?!\w)', re.IGNORECASE)
        for skill in skills
    }


skill_patterns = compile_skill_patterns(target_skills)

# One entry per (job, skill) hit; a skill is counted at most once per job.
found_skills = []
for text in df['Skills/Description'].fillna('').astype(str):
    for skill, pattern in skill_patterns.items():
        if pattern.search(text):
            found_skills.append(skill)

# Create a DataFrame for Skills (ten most frequent)
skills_counts = pd.DataFrame(Counter(found_skills).most_common(10), columns=['Skill', 'Count'])

# --- VISUALIZATION (Dark Theme for Professional Look) ---

# Chart 1: horizontal bar chart of the most frequent skills, sorted by count.
skills_sorted = skills_counts.sort_values('Count', ascending=True)
fig1 = px.bar(
    skills_sorted,
    x='Count',
    y='Skill',
    orientation='h',
    title='üî• Top 10 In-Demand Data Analyst Skills in Egypt',
    text='Count',
    color='Count',
    template='plotly_dark',
)
fig1.update_layout(xaxis_title="Number of Jobs", yaxis_title="Skill")
fig1.show()

# Chart 2: donut chart of postings per experience bucket.
level_tally = df['Experience_Level'].value_counts()
exp_counts = level_tally.reset_index()
exp_counts.columns = ['Level', 'Count']
fig2 = px.pie(
    exp_counts,
    values='Count',
    names='Level',
    hole=0.5,
    title='üéì Job Market by Experience Level',
    color_discrete_sequence=px.colors.sequential.RdBu,
    template='plotly_dark',
)
fig2.show()

# Chart 3: bar chart of the seven most common cleaned locations.
location_tally = df['Clean_Location'].value_counts().head(7)
loc_counts = location_tally.reset_index()
loc_counts.columns = ['Location', 'Count']
fig3 = px.bar(
    loc_counts,
    x='Location',
    y='Count',
    title='üìç Where are the Jobs Located?',
    color='Count',
    template='plotly_dark',
)
fig3.show()

In [None]:
import pandas as pd
import plotly.express as px
from collections import Counter

# 1. Load the V4 Data
try:
    df = pd.read_csv('wuzzuf_data_analyst_jobs_v4.csv')
except FileNotFoundError:
    print("‚ùå Error: File not found. Run the V4 scraper first.")
    # Re-raise so the cell stops here. Continuing would either hit a NameError
    # on `df` or silently analyse a stale `df` left in the kernel.
    raise
else:
    print(f"‚úÖ Data Loaded: {len(df)} jobs.")

# --- DATA CLEANING ---
# Keep only the text before the first comma of each location.
df['Clean_Location'] = df['Location'].astype(str).apply(lambda x: x.split(',')[0].strip())

# Experience Logic
def get_exp(title):
    """Classify a job title into a seniority bucket by keyword lookup.

    The title is stringified and lower-cased, then tested for substrings in
    priority order: 'senior', then 'junior'/'entry', then
    'manager'/'lead'/'head'. Anything else is 'Mid-Level'.

    Args:
        title: job title; any value is accepted because it is passed through ``str``.

    Returns:
        One of 'Senior', 'Junior/Entry', 'Manager/Lead', 'Mid-Level'.
    """
    t = str(title).lower()
    if 'senior' in t:
        return 'Senior'
    if 'junior' in t or 'entry' in t:
        return 'Junior/Entry'
    # 'head' (e.g. "Head of Analytics") is included so this classifier agrees
    # with get_experience, which already treats it as a management title.
    if 'manager' in t or 'lead' in t or 'head' in t:
        return 'Manager/Lead'
    return 'Mid-Level'
df['Experience'] = df['Job Title'].apply(get_exp)

# --- SKILLS EXTRACTION ---
# Keywords we care about
keywords = ['Python', 'SQL', 'Excel', 'Power BI', 'Tableau', 'Machine Learning',
            'R', 'Big Data', 'Spark', 'AWS', 'Azure', 'NoSQL']


def compile_keyword_patterns(words):
    """Map each keyword to a case-insensitive whole-word regex.

    A plain substring test would count 'R' for any text containing the letter
    "r" and 'SQL' for every mention of 'NoSQL'. The lookarounds require that
    the match is not glued to another word character on either side.

    Args:
        words: iterable of keywords, matched literally.

    Returns:
        dict of keyword -> compiled pattern, in the order given.
    """
    import re  # function-scope import: `re` is not in this cell's import list

    return {
        word: re.compile(rf'(?<!\w){re.escape(word)}(?!\w)', re.IGNORECASE)
        for word in words
    }


keyword_patterns = compile_keyword_patterns(keywords)

# One entry per (job, keyword) hit; a keyword is counted at most once per job.
all_skills = []
for text in df['Skills/Description'].fillna('').astype(str):
    for word, pattern in keyword_patterns.items():
        if pattern.search(text):
            all_skills.append(word)

# --- VISUALIZATION ---
if not all_skills:
    # Nothing matched any keyword, so there is nothing to chart.
    print("‚ö†Ô∏è No skills found. The scraper might need adjustment.")
else:
    # 1. Top Skills: the ten most frequent keywords as a horizontal bar chart
    top_skills = Counter(all_skills).most_common(10)
    skills_df = pd.DataFrame(top_skills, columns=['Skill', 'Count'])
    fig1 = px.bar(
        skills_df,
        x='Count',
        y='Skill',
        orientation='h',
        title='üî• Top Data Analyst Skills in Egypt',
        text='Count',
        color='Count',
        template='plotly_dark',
    )
    fig1.show()

    # 2. Experience Level: donut chart drawn straight from the per-job column
    fig2 = px.pie(
        df,
        names='Experience',
        title='üéì Experience Level Required',
        hole=0.4,
        template='plotly_dark',
    )
    fig2.show()