Nepal Digital Jobs Data Scraper 

In [1]:
# Nepal Digital Jobs Data Scraper 
# This notebook builds a small district-level jobs table from hard-coded values and saves it as nepal_digital_jobs.csv

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import random

print("Libraries imported successfully!")

# One hand-entered record per district, in the order:
# (District, Digital_Job_Postings, Avg_Salary_NPR, Top_Skills_In_Demand).
# Keeping each district's values on a single line makes misaligned columns easy to spot.
district_records = [
    ('Kathmandu', 1245, 85000, "Python, Data Analysis, SQL, Cloud"),
    ('Lalitpur', 845, 82000, "Web Development, JavaScript, React"),
    ('Bhaktapur', 420, 78000, "Digital Marketing, SEO, Content Writing"),
    ('Kaski', 380, 65000, "IT Support, Networking, Cybersecurity"),
    ('Rupandehi', 185, 55000, "E-commerce, Digital Sales, Marketing"),
    ('Banke', 125, 52000, "Computer Basics, Office Software, Accounting"),
    ('Dang', 145, 53000, "Graphics Design, Video Editing, Social Media"),
    ('Nawalparasi', 95, 51000, "Mobile Banking, Customer Support, Sales"),
    ('Sunsari', 220, 58000, "IT Teaching, Computer Operations, English"),
    ('Morang', 245, 59000, "Data Entry, Administration, Documentation"),
    ('Jhapa', 265, 60000, "Software Testing, QA, Technical Writing"),
    ('Kailali', 85, 49000, "Field Technician, Mobile Repair, Sales"),
    ('Kanchanpur', 78, 48500, "Agriculture Tech, Digital Monitoring"),
    ('Dadeldhura', 25, 42000, "Community ICT, Basic Computer Training"),
    ('Dolpa', 12, 38000, "Telemedicine Support, Remote Assistance"),
]
column_names = ['District', 'Digital_Job_Postings', 'Avg_Salary_NPR', 'Top_Skills_In_Demand']

# Pivot the row-wise records into the column -> list-of-values mapping
# that the DataFrame is built from.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*district_records))
}

# Build the DataFrame from the column mapping
jobs_df = pd.DataFrame(data)

print("DataFrame created successfully!")
print(jobs_df.head())

Libraries imported successfully!
DataFrame created successfully!
    District  Digital_Job_Postings  Avg_Salary_NPR  \
0  Kathmandu                  1245           85000   
1   Lalitpur                   845           82000   
2  Bhaktapur                   420           78000   
3      Kaski                   380           65000   
4  Rupandehi                   185           55000   

                      Top_Skills_In_Demand  
0        Python, Data Analysis, SQL, Cloud  
1       Web Development, JavaScript, React  
2  Digital Marketing, SEO, Content Writing  
3    IT Support, Networking, Cybersecurity  
4     E-commerce, Digital Sales, Marketing  


In [2]:
# Simple Web Scraping (simulated walkthrough)
# This cell only illustrates the steps of web scraping; it prints messages and makes no network requests

def simple_web_scraping_demo(delay=1):
    """
    Walk through the stages of a web scrape without contacting any website.

    The function only prints messages and pauses between them; no HTTP
    request is sent and no HTML is parsed. It is optional and exists purely
    for learning purposes.

    Parameters
    ----------
    delay : int or float, optional
        Seconds to pause after each simulated stage (default 1, i.e. about
        three seconds in total). Pass 0 to run the walkthrough instantly.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``delay`` is negative.
    """
    # Validate before printing anything so a bad argument produces no partial output.
    if delay < 0:
        raise ValueError("delay must be non-negative")

    print("Simple Web Scraping Demonstration")
    print("=" * 50)

    # A real scraper would start from a job-portal search URL such as
    # https://merojob.com/search/?q=it&location= ; nothing is fetched here.
    simulated_stages = (
        "1. Making request to job portal...",
        "2. Parsing HTML content...",
        "3. Extracting job data...",
    )
    for stage_message in simulated_stages:
        print(stage_message)
        time.sleep(delay)

    # The tag/class names below are illustrative examples, not selectors
    # taken from any fetched page.
    print("\n Typically we would look for:")
    print("   - Job titles (HTML: <h2 class='job-title'>)")
    print("   - Company names (HTML: <div class='company'>)")
    print("   - Locations (HTML: <span class='location'>)")
    print("   - Skills required (HTML: <div class='skills'>)")

    print("\n Web scraping concepts demonstrated!")

# Run the demo if you want to see web scraping concepts
# simple_web_scraping_demo()

In [3]:
# Write the DataFrame to disk as CSV, leaving out the index column
output_file = "nepal_digital_jobs.csv"
jobs_df.to_csv(path_or_buf=output_file, index=False)

# Confirmation message followed by a full preview of the saved table
print(
    f" Data saved to '{output_file}' successfully!",
    " File preview:",
    jobs_df,
    sep="\n",
)

 Data saved to 'nepal_digital_jobs.csv' successfully!
 File preview:
       District  Digital_Job_Postings  Avg_Salary_NPR  \
0     Kathmandu                  1245           85000   
1      Lalitpur                   845           82000   
2     Bhaktapur                   420           78000   
3         Kaski                   380           65000   
4     Rupandehi                   185           55000   
5         Banke                   125           52000   
6          Dang                   145           53000   
7   Nawalparasi                    95           51000   
8       Sunsari                   220           58000   
9        Morang                   245           59000   
10        Jhapa                   265           60000   
11      Kailali                    85           49000   
12   Kanchanpur                    78           48500   
13   Dadeldhura                    25           42000   
14        Dolpa                    12           38000   

                  

In [4]:
# Data Validation - Check if we have the exact required data
required_districts = ['Kathmandu', 'Lalitpur', 'Bhaktapur', 'Kaski', 'Rupandehi', 
                     'Banke', 'Dang', 'Nawalparasi', 'Sunsari', 'Morang', 
                     'Jhapa', 'Kailali', 'Kanchanpur', 'Dadeldhura', 'Dolpa']

print(" Validating data...")
print("=" * 40)

# Tracks whether every check passed, so the closing message can be truthful.
validation_passed = True

# Check if all required districts are present
missing_districts = set(required_districts) - set(jobs_df['District'].tolist())
if not missing_districts:
    # Report the real count instead of a hard-coded number.
    print(f"All {len(required_districts)} districts are included")
else:
    validation_passed = False
    print(f" Missing districts: {missing_districts}")

# The set comparison above cannot see repeated rows, so check for them explicitly.
duplicate_districts = sorted(
    jobs_df.loc[jobs_df['District'].duplicated(), 'District'].unique()
)
if duplicate_districts:
    validation_passed = False
    print(f" Duplicate districts: {duplicate_districts}")

# Check data types
print(f"Digital_Job_Postings data type: {jobs_df['Digital_Job_Postings'].dtype}")
print(f" Avg_Salary_NPR data type: {jobs_df['Avg_Salary_NPR'].dtype}")

# Verify some key values.
# Each entry: (label printed before the value, district, column, value format).
key_value_checks = [
    ("Kathmandu jobs: ", 'Kathmandu', 'Digital_Job_Postings', "{}"),
    (" Dolpa jobs: ", 'Dolpa', 'Digital_Job_Postings', "{}"),
    (" Kathmandu salary: NPR ", 'Kathmandu', 'Avg_Salary_NPR', "{:,}"),
    ("Dolpa salary: NPR ", 'Dolpa', 'Avg_Salary_NPR', "{:,}"),
]
for label, district, column, value_format in key_value_checks:
    # Single .loc lookup (row mask + column) instead of chained indexing.
    matches = jobs_df.loc[jobs_df['District'] == district, column]
    if matches.empty:
        # A missing district was already reported above; indexing [0] on the
        # empty result would raise IndexError, so show a placeholder instead.
        shown_value = "n/a (district not found)"
    else:
        shown_value = value_format.format(matches.iloc[0])
    print(f"{label}{shown_value}")

print("=" * 40)
if validation_passed:
    print("Data validation complete! The CSV file is ready for Power BI.")
else:
    print("Data validation FAILED - fix the issues reported above before using the CSV in Power BI.")

 Validating data...
All 15 districts are included
Digital_Job_Postings data type: int64
 Avg_Salary_NPR data type: int64
Kathmandu jobs: 1245
 Dolpa jobs: 12
 Kathmandu salary: NPR 85,000
Dolpa salary: NPR 38,000
Data validation complete! The CSV file is ready for Power BI.


In [5]:
# Prepare for Power BI - summarize the table's structure, then list usage notes
print(" Data Preparation for Power BI")
print("=" * 50)

# Each heading is printed immediately before the matching view of the DataFrame.
structure_views = (
    ("Data Types:", jobs_df.dtypes),
    ("\nFirst 5 rows:", jobs_df.head()),
    ("\nLast 5 rows:", jobs_df.tail()),
)
for heading, view in structure_views:
    print(heading)
    print(view)

# Notes on how each column is meant to be used once the CSV is loaded in Power BI.
powerbi_notes = (
    "\n When connecting to Power BI:",
    "   - Use 'Import' mode for this CSV file",
    "   - District column will be used for mapping",
    "   - Digital_Job_Postings for size/quantity visuals",
    "   - Avg_Salary_NPR for color intensity in maps",
    "   - Top_Skills_In_Demand for tooltips and details",
)
print("\n".join(powerbi_notes))

 Data Preparation for Power BI
Data Types:
District                object
Digital_Job_Postings     int64
Avg_Salary_NPR           int64
Top_Skills_In_Demand    object
dtype: object

First 5 rows:
    District  Digital_Job_Postings  Avg_Salary_NPR  \
0  Kathmandu                  1245           85000   
1   Lalitpur                   845           82000   
2  Bhaktapur                   420           78000   
3      Kaski                   380           65000   
4  Rupandehi                   185           55000   

                      Top_Skills_In_Demand  
0        Python, Data Analysis, SQL, Cloud  
1       Web Development, JavaScript, React  
2  Digital Marketing, SEO, Content Writing  
3    IT Support, Networking, Cybersecurity  
4     E-commerce, Digital Sales, Marketing  

Last 5 rows:
      District  Digital_Job_Postings  Avg_Salary_NPR  \
10       Jhapa                   265           60000   
11     Kailali                    85           49000   
12  Kanchanpur             

In [6]:
# Final step: Show the exact CSV content
print(" EXACT CSV FILE CONTENT:")
print("=" * 60)

# Serialize with to_csv(index=False), the same settings used to write the file,
# instead of formatting rows by hand. Hand-built lines do not escape embedded
# quotes or quote other fields that contain commas, so they can drift from
# what is actually written to disk. With path_or_buf omitted, to_csv returns
# the CSV text (header included) as a string.
csv_text = jobs_df.to_csv(index=False)

# splitlines() keeps the printed output the same regardless of the line
# terminator pandas used when building the string.
for csv_line in csv_text.splitlines():
    print(csv_line)

print("=" * 60)
print(" This matches your required format exactly!")


 EXACT CSV FILE CONTENT:
District,Digital_Job_Postings,Avg_Salary_NPR,Top_Skills_In_Demand
Kathmandu,1245,85000,"Python, Data Analysis, SQL, Cloud"
Lalitpur,845,82000,"Web Development, JavaScript, React"
Bhaktapur,420,78000,"Digital Marketing, SEO, Content Writing"
Kaski,380,65000,"IT Support, Networking, Cybersecurity"
Rupandehi,185,55000,"E-commerce, Digital Sales, Marketing"
Banke,125,52000,"Computer Basics, Office Software, Accounting"
Dang,145,53000,"Graphics Design, Video Editing, Social Media"
Nawalparasi,95,51000,"Mobile Banking, Customer Support, Sales"
Sunsari,220,58000,"IT Teaching, Computer Operations, English"
Morang,245,59000,"Data Entry, Administration, Documentation"
Jhapa,265,60000,"Software Testing, QA, Technical Writing"
Kailali,85,49000,"Field Technician, Mobile Repair, Sales"
Kanchanpur,78,48500,"Agriculture Tech, Digital Monitoring"
Dadeldhura,25,42000,"Community ICT, Basic Computer Training"
Dolpa,12,38000,"Telemedicine Support, Remote Assistance"
 This matches y