**ProConnect360** is a web scraping solution designed to gather detailed profiles of professors from universities and professionals from job platforms like Indeed. It provides insights into their roles, expertise, and locations, storing the data in structured formats like CSV and Excel for analysis, fostering professional connections and academic research.

In [1]:
"""
Project: ProConnect360 - University Faculty Scraper

Description:
This script is designed to scrape faculty data from university websites, providing detailed insights
into professors' profiles. It gathers information such as names, contact details, job descriptions,
emails, and profile images. The data is processed and displayed for further analysis.

Features:
1. Extracts faculty profile data from the UMT website.
2. Scrapes details like Name, Contact, Job Description, Email, and Profile Image.
3. Identifies whether the faculty is HEC approved based on the job description.
4. Handles dynamic image URL extraction from individual faculty pages.
5. Outputs all scraped data in a structured format for easy review.

Modules Used:
- requests: For sending HTTP requests to fetch web pages.
- BeautifulSoup (bs4): For parsing HTML and extracting required information.

"""

import requests
from bs4 import BeautifulSoup

def extract_image_url(faculty_url):
    """Return the profile image URL found on a faculty member's page.

    Two lookups are attempted in order: the ``<img>`` whose id is
    ``ctl00_cphContent_imgProfile``, then the first ``<img>`` inside the
    first ``<div class="col-md-3">``.

    Args:
        faculty_url (str): URL of the individual faculty page, or the
            placeholder "N/A" used when a listing row has no profile link.

    Returns:
        str: The image URL (a relative ``src`` is resolved against
        ``faculty_url``), or "N/A" when no usable image is found or the
        page cannot be fetched.
    """
    # Function-scope import keeps this cell runnable on its own (stdlib only).
    from urllib.parse import urljoin

    # Nothing to download for an empty URL or the "N/A" placeholder, so do
    # not issue a request that is guaranteed to fail.
    if not faculty_url or faculty_url == "N/A":
        return "N/A"

    try:
        # The timeout stops one unresponsive profile page from stalling the run.
        response = requests.get(faculty_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Candidate <img> tags in priority order.
        candidates = [soup.find("img", id="ctl00_cphContent_imgProfile")]
        container = soup.find("div", class_="col-md-3")
        if container is not None:
            candidates.append(container.find("img"))

        for img_tag in candidates:
            if img_tag is not None and img_tag.get("src"):
                return urljoin(faculty_url, img_tag["src"])

        return "N/A"
    except Exception as e:
        # Best-effort lookup: a failure on one profile must not stop the scrape.
        print(f"Error fetching image for {faculty_url}: {e}")
        return "N/A"

# URL of the UMT faculty listing page
url = "https://www.umt.edu.pk/faculty.aspx"

# Browser-like request headers (to prevent blocking of requests by the server).
# extract_image_url() reads this module-level name as well.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

# Fetch the listing page. The timeout keeps an unresponsive server from
# hanging the cell, and raise_for_status() stops on an HTTP error instead of
# parsing an error page that contains no faculty rows.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Every <tr> is inspected; rows without the expected cells are skipped below
rows = soup.find_all("tr")

for row in rows:
    name_cell = row.find("td", class_="person-name")
    contact_cell = row.find("td", class_="person-contact")
    job_description_cell = row.find("td", class_="job-description")
    email_cell = row.find("td", class_="person-email")

    # A row is only processed when it has name, job description and email cells
    if name_cell is None or job_description_cell is None or email_cell is None:
        continue

    name = name_cell.get_text(strip=True)
    job_description = job_description_cell.get_text(strip=True)

    # The contact cell is optional and may be empty; both cases become "N/A"
    contact = contact_cell.get_text(strip=True) if contact_cell is not None else ""
    contact = contact or "N/A"

    # The email address is stored in the title attribute of the <a> tag
    email_tag = email_cell.find("a")
    email = email_tag.get("title", "N/A") if email_tag is not None else "N/A"

    # Flag the record when the job description mentions "HEC" (case-insensitive)
    hec_approved = "TRUE" if "hec" in job_description.lower() else "FALSE"

    # Link to the individual faculty page; site-relative hrefs are made absolute
    faculty_link_tag = name_cell.find("a")
    faculty_page_url = faculty_link_tag.get("href", "N/A") if faculty_link_tag is not None else "N/A"
    if faculty_page_url.startswith('/'):
        faculty_page_url = "https://www.umt.edu.pk" + faculty_page_url

    # Only visit the profile page when the row actually links to one; passing
    # the "N/A" placeholder would just trigger a failing request.
    image_url = extract_image_url(faculty_page_url) if faculty_page_url != "N/A" else "N/A"

    # Print the extracted data
    print(f"Name: {name}, Contact: {contact}, Job-Description: {job_description}, Email: {email}, HEC Approved: {hec_approved}, Photo Link: {image_url}")


Name: Aamir Shahzada Khan, Contact: 3835, Job-Description: Lecturer, Email: aamir.shahzada@umt.edu.pk, HEC Approved: FALSE, Photo Link: https://admin.umt.edu.pk/Media/UserProfile/636753931581232459123.JPG
Name: Aasma Nijabat, Contact: N/A, Job-Description: Lecturer, Email: aasma.nijabat@umt.edu.pk, HEC Approved: FALSE, Photo Link: https://admin.umt.edu.pk/Media/UserProfile/638494724332760590276.jpg
Name: Abbas Raza, Contact: N/A, Job-Description: Study leave, Lecturer cum Lab Engineer, Email: abbas.raza@umt.edu.pk, HEC Approved: FALSE, Photo Link: https://admin.umt.edu.pk/Media/UserProfile/637915951748091997809.jpg
Name: Abdul Basit, Contact: N/A, Job-Description: Principal Lecturer, Email: a.basit@umt.edu.pk, HEC Approved: FALSE, Photo Link: https://admin.umt.edu.pk/Media/UserProfile/638706417063055230305.jpg
Name: Abdul Ghafar, Contact: 3346, Job-Description: Lecturer, Email: abdul.ghafar@umt.edu.pk, HEC Approved: FALSE, Photo Link: https://admin.umt.edu.pk/Media/UserProfile/63820003

KeyboardInterrupt: 

In [2]:
"""
Project: JobScrape360 - Save Job Listings to CSV and Excel

Description:
This script automates the process of scraping Python job listings from Indeed Pakistan for Lahore.
It extracts details such as job titles, company names, locations, and job descriptions,
and saves the data into both a CSV file and an Excel workbook for analysis.

Features:
1. Automates job search for "Python Developer" in Lahore on Indeed.
2. Extracts comprehensive details for each job posting.
3. Saves the scraped data to a structured CSV file and an Excel workbook.
4. Implements pagination to scrape job listings across multiple pages.
5. Simulates human-like interaction with random delays to reduce detection.

Modules Used:
- Selenium: For browser automation and interaction with dynamic elements.
- random: To add randomized delays for human-like interactions.
- csv: To save scraped data in CSV format.
- openpyxl: To save data into an Excel workbook.
- time: For managing delays during scraping.

"""

import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook

# Helper that pauses for a random interval so actions look less scripted
def random_sleep(min_time=2, max_time=5):
    """
    Pause execution for a randomly chosen duration.

    The duration is drawn with random.uniform between the two bounds.

    Args:
        min_time (int | float): Lower bound of the pause, in seconds.
        max_time (int | float): Upper bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(min_time, max_time)
    time.sleep(pause_seconds)

# Step 1: Start Chrome through a driver binary resolved by ChromeDriverManager
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.implicitly_wait(10)  # Implicit wait applied when locating elements

# Column headers shared by both output files
column_headers = ['Job Title', 'Company Name', 'Location', 'Job Description']

# Step 2: Open the CSV output and write the header row
csv_file = open('lahore_python_jobs.csv', mode='w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(column_headers)

# Step 3: Prepare the Excel workbook and write the same header row
wb = Workbook()
ws = wb.active
ws.title = "Python Jobs"
ws.append(column_headers)

# Step 4: Open the Indeed Pakistan landing page
driver.get("https://pk.indeed.com/")
random_sleep(3, 6)

# Step 5: Type the job title one character at a time with short pauses
search_term = "Python Developer"
job_search_box = driver.find_element(By.ID, "text-input-what")
for keystroke in search_term:
    job_search_box.send_keys(keystroke)
    random_sleep(0.1, 0.3)

# Step 6: Replace the pre-filled location with Lahore
location_search_box = driver.find_element(By.ID, "text-input-where")
location_search_box.clear()
random_sleep(1, 2)
location_search_box.send_keys("Lahore")
random_sleep(1, 2)

# Pressing ENTER in the location box submits the search
location_search_box.send_keys(Keys.ENTER)
random_sleep(5, 7)

# Function to scrape job details from the current page
def scrape_jobs_from_page():
    """
    Scrapes every job card on the current results page into the CSV and Excel outputs.

    For each card the title link is clicked, the title, company, location and
    description are read from the job view, one row is written through
    ``csv_writer`` and appended to ``ws``, and the browser navigates back to
    the results. A failure on one card is logged and the remaining cards are
    still processed.
    """
    card_locator = (By.CLASS_NAME, 'job_seen_beacon')
    total_cards = len(driver.find_elements(*card_locator))

    for position in range(total_cards):
        opened = False
        try:
            # Re-locate the cards on every pass: element references obtained
            # before navigating away can be stale once driver.back() has
            # reloaded the results page.
            cards = driver.find_elements(*card_locator)
            if position >= len(cards):
                break

            # Click the job title link to open the job description
            cards[position].find_element(By.CSS_SELECTOR, 'a.jcs-JobTitle').click()
            opened = True
            random_sleep(3, 5)

            # Extract job details from the job description view
            job_title = driver.find_element(By.XPATH, "//h2[contains(@class,'jobsearch-JobInfoHeader-title')]").text
            company_name = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyName']").text
            job_location = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyLocation']").text
            job_description = driver.find_element(By.ID, "jobDescriptionText").text

            # The same row goes to both outputs
            record = [job_title, company_name, job_location, job_description]
            csv_writer.writerow(record)
            ws.append(record)
        except Exception as e:
            # Log any errors encountered during scraping and move on
            print(f"Failed to process job due to: {str(e)}")

        if opened:
            # Return to the listings even when extraction failed; otherwise
            # the browser stays on the job view and every later card fails.
            try:
                driver.back()
                random_sleep(5, 7)
            except Exception as e:
                print(f"Failed to process job due to: {str(e)}")
                break

# Function to navigate to the next page of job listings
def go_to_next_page():
    """
    Navigates to the next page of job postings if available.

    Returns:
        bool: True if the "next page" link was found and clicked,
        False if locating or clicking it failed.
    """
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "a[data-testid='pagination-page-next']")
        next_button.click()  # Click the "Next Page" button
        random_sleep(5, 7)
        return True
    except Exception:
        # Any failure to find or click the link is treated as the end of
        # pagination. `Exception` (not a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate so the run can still be stopped by hand.
        print("No more pages to navigate.")
        return False

# Step 7: Loop through all pages and scrape job listings
# Step 8: Clean up and save the data. The cleanup lives in `finally` so the
# browser is closed and the rows gathered so far are written to disk even when
# scraping is interrupted or raises.
try:
    while True:
        scrape_jobs_from_page()  # Scrape jobs on the current page
        if not go_to_next_page():  # Navigate to the next page, if available
            break
finally:
    try:
        driver.quit()  # Close the browser
    finally:
        # Runs even if quitting the browser fails
        csv_file.close()  # Close the CSV file
        wb.save("lahore_python_jobs.xlsx")  # Save the Excel workbook

print("Job scraping completed. Data saved to 'lahore_python_jobs.csv' and 'lahore_python_jobs.xlsx'.")


**ProConnect360**: This advanced scraping solution efficiently collects and processes faculty profiles from university websites. It scrapes all pages at once, ensuring no profiles are missed due to pagination issues. By gathering all URLs first and leveraging multi-threading, it maximizes resource utilization, reduces reload times, and provides detailed outputs with names, roles, and contacts.

In [None]:
%%shell
# Purpose:
# Prepare apt so that Chromium can be installed from Debian Buster instead of Snap on Ubuntu.
# The cell writes a Debian sources list, imports three Debian signing keys, and writes an apt pinning file.

# Step 1: Add Debian Buster repositories
# Writes three `deb` entries (buster, buster-updates, buster security); each one is tied via
# `signed-by` to a keyring file that Step 2 creates under /usr/share/keyrings.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Step 2: Add Debian GPG keys
# Fetches the three keys from keyserver.ubuntu.com; they authenticate the repositories added above.
# NOTE(review): apt-key is reported as deprecated on newer releases — confirm it exists on the target image.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key (addressed by the last 8 hex digits of the IDs above), de-armor it, and save it to the keyring path named in the matching `signed-by` entry
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Step 3: Set package pin priorities
# Both stanzas below match `Package: *`: release a=eoan gets priority 500 and origin deb.debian.org gets priority 300.
# NOTE(review): there is no Chromium-specific stanza, so this file alone does not restrict Debian to Chromium packages — confirm this is intended.

# Note:
# Keep the blank lines between the two stanzas inside the heredoc; they separate the records.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300
EOF


Executing: /tmp/apt-key-gpghome.umfWvzIHqW/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.LTOQShlU5G/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.GiZGEZ3D3n/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1




**Set Up**

In [None]:
!pip install selenium
!apt-get update
!apt install -y chromium-chromedriver
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.24.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m21.

**Arguments of Driver Set Up**

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the Chrome configuration used by Selenium in Colab
chrome_options = Options()

# Command-line switches, added in this order
for switch in (
    '--no-sandbox',
    '--headless',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-blink-features=AutomationControlled',
):
    chrome_options.add_argument(switch)

# Experimental options: drop the "enable-automation" switch and turn the
# automation extension off
for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
):
    chrome_options.add_experimental_option(option_name, option_value)

# Start the Chrome WebDriver with the options above
driver = webdriver.Chrome(options=chrome_options)


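**Single Threaded Approach** (profile links are collected sequentially first; the profiles themselves are then scraped with a thread pool)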

In [None]:
"""
Project: ProConnect360 - Multi-threaded Faculty Profile Scraper

Description:
This script is designed to scrape personnel profiles, such as faculty or staff members, from a paginated website.
The script first collects all profile links from all pages, ensuring comprehensive data collection.
It then scrapes each profile using multi-threading, significantly reducing the scraping time.

Features:
1. Collects all profile links across multiple pages to avoid missing any data due to pagination issues.
2. Scrapes detailed profile information, including name, description, employment status, and email.
3. Utilizes multi-threading to scrape multiple profiles concurrently, improving efficiency.
4. Handles dynamic page navigation using JavaScript execution.
5. Includes error handling for robustness and reliability.

Modules Used:
- Selenium: For browser automation and interaction with web elements.
- concurrent.futures: For implementing multi-threading.
- time: For adding delays to mimic human behavior and ensure proper page loading.

"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Step 1: Assemble the Chrome options (headless, so no browser window is opened).
# The same options object is reused by every worker that starts its own driver.
chrome_options = Options()
for switch in (
    '--no-sandbox',
    '--headless',
    '--disable-dev-shm-usage',
    '--disable-gpu',
):
    chrome_options.add_argument(switch)

# WebDriver used by the main workflow to walk the listing pages
driver = webdriver.Chrome(options=chrome_options)

# Function to scrape a single staff profile
def scrape_staff_profile(profile_url):
    """
    Scrapes detailed information from a single profile page.

    A dedicated WebDriver is started for the call and is always quit before
    returning, including when navigation or extraction fails.

    Args:
        profile_url (str): URL of the individual profile page.

    Returns:
        dict: Keys 'url', 'name', 'description', 'employment_status',
        'status' and 'email'. Every key is always present; fields that could
        not be read keep the empty-string default, so callers can index the
        record safely.
    """
    # CSS selector for each field, in the order the fields are read
    field_selectors = {
        'name': "h3",
        'description': ".title.mb-3 span",
        'employment_status': ".fa-solid.fa-circle-dashed + span",
        'status': ".fa-user-tie + span",
        'email': ".fa-envelope + span",
    }

    staff_data = {'url': profile_url}
    staff_data.update(dict.fromkeys(field_selectors, ""))

    # Each thread initializes its own WebDriver instance
    local_driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigation is inside the try so a failed page load still reaches
        # the finally block and releases the browser.
        local_driver.get(profile_url)
        time.sleep(2)  # Wait for the page to fully load

        for field, selector in field_selectors.items():
            staff_data[field] = local_driver.find_element(By.CSS_SELECTOR, selector).text
    except Exception as e:
        # Log errors during profile scraping; fields read so far are kept
        print(f"Error extracting data from {profile_url}: {str(e)}")
    finally:
        # Quit the local WebDriver instance to release resources
        local_driver.quit()

    return staff_data

# Function to collect the profile links shown on the current page
def scrape_profile_links():
    """
    Read the href of every anchor matching ".faculty_block a" on the page
    currently loaded in the module-level driver.

    Returns:
        list: One href value per matching anchor, in document order.
    """
    time.sleep(2)  # Fixed pause before querying the page

    hrefs = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, ".faculty_block a"):
        hrefs.append(anchor.get_attribute("href"))
    return hrefs

# Function to scrape all pages and collect profile links
def scrape_all_pages(max_pages=None):
    """
    Collects all profile links across multiple pages.

    Pagination stops when a page yields no links, when advancing to the next
    page raises, or when ``max_pages`` pages have been read.

    Args:
        max_pages (int | None): Optional upper bound on the number of pages
            to read. ``None`` (the default) keeps the original unbounded
            behavior.

    Returns:
        list: Unique profile URLs in first-seen order. Anchors without an
        href are skipped, and a URL that appears more than once is kept only
        once so it is not scraped (and a browser launched for it) repeatedly.
    """
    all_profiles = []  # Unique links, in the order they were first seen
    seen_links = set()  # Fast membership test for de-duplication
    page_number = 1  # Start with the first page

    while max_pages is None or page_number <= max_pages:
        print(f"Scraping page {page_number} for profile links...")
        profile_links = scrape_profile_links()

        # Break the loop if no profiles are found on the current page
        if not profile_links:
            break

        for link in profile_links:
            if link and link not in seen_links:
                seen_links.add(link)
                all_profiles.append(link)

        # Navigate to the next page using JavaScript (adjust based on site structure)
        try:
            driver.execute_script(f"staff({page_number})")  # Simulate pagination
            page_number += 1
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            print(f"Error navigating to page {page_number}: {e}")
            break

    return all_profiles

# Function to scrape profiles using multi-threading
def scrape_profiles_with_threads(profile_urls, max_workers=5):
    """
    Scrapes multiple profiles concurrently using multi-threading.

    Args:
        profile_urls (list): List of profile URLs to scrape.
        max_workers (int): Size of the thread pool; each worker starts its
            own browser through scrape_staff_profile. Defaults to 5.

    Returns:
        list: The non-empty dictionaries returned by scrape_staff_profile,
        in completion order.
    """
    results = []  # Scraped records, in completion order

    # Use ThreadPoolExecutor to handle multi-threaded scraping
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scrape_staff_profile, url): url for url in profile_urls}

        # Process completed tasks as they finish
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as e:
                print(f"Error in thread for {url}: {e}")
                continue

            if data:
                results.append(data)
                # .get(): a record whose extraction failed part-way may lack
                # 'name'; indexing it would raise after the record was
                # already stored and be misreported as a thread error.
                print(f"Scraped: {data.get('name', 'N/A')} from {url}")

    return results

# Main function to control the scraping workflow
def main():
    """
    Orchestrates the scraping workflow by collecting profile links,
    scraping data concurrently, and printing the results.

    The main WebDriver is always closed on exit, including when a step raises.
    """
    url = "https://www.uet.edu.pk/academics/staff/faculty-members"

    try:
        # Load the main website
        driver.get(url)
        time.sleep(2)  # Wait for the page to load

        # Scrape all profile links from all pages
        profile_links = scrape_all_pages()

        if profile_links:
            print(f"Found {len(profile_links)} profiles to scrape.")

            # Use multi-threading to scrape profiles
            scraped_data = scrape_profiles_with_threads(profile_links)

            # Print the scraped data. .get() is used because a profile whose
            # extraction failed part-way may be missing some of these keys.
            for data in scraped_data:
                print(f"Name: {data.get('name', 'N/A')}, Description: {data.get('description', 'N/A')}, Employment Status: {data.get('employment_status', 'N/A')}, Status: {data.get('status', 'N/A')}, Email: {data.get('email', 'N/A')}")
        else:
            print("No profiles found.")
    finally:
        # Close the main WebDriver
        driver.quit()

# Entry point for the script
if __name__ == "__main__":
    main()


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...


KeyboardInterrupt: 

**Multi Threaded Approach**


In [None]:
"""
Project: ProConnect360 - Multi-threaded Scraper for Personnel Profiles

Description:
This script is designed to scrape profiles of personnel (e.g., faculty or staff) from a paginated website.
It uses multi-threading to scrape multiple profiles concurrently, significantly reducing the total execution time.
The script extracts details such as names, descriptions, employment status, and emails for analysis or record-keeping.

Features:
1. Scrapes profile data from all pages dynamically.
2. Utilizes multi-threading to handle concurrent scraping tasks efficiently.
3. Extracts detailed information, including URLs, names, descriptions, and contact details.
4. Includes robust error handling for thread safety and resilience.
5. Designed to maximize resource utilization and minimize scraping time for large datasets.

Modules Used:
- Selenium: For web automation and interaction with dynamic web elements.
- concurrent.futures: For multi-threading to improve scraping efficiency.
- time: For managing delays and simulating human-like interactions.

"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Step 1: Configure Chrome for headless use (no graphical browser window).
# The switches are added in the order listed.
headless_switches = [
    '--no-sandbox',
    '--headless',
    '--disable-dev-shm-usage',
    '--disable-gpu',
]
chrome_options = Options()
for switch in headless_switches:
    chrome_options.add_argument(switch)

# Primary WebDriver, used for navigating the listing pages
driver = webdriver.Chrome(options=chrome_options)

# Function to scrape a single staff profile
def scrape_staff_profile(profile_url):
    """
    Scrapes detailed information from an individual profile page.

    A separate WebDriver is started for the call and is always quit before
    returning, including when navigation or extraction fails.

    Args:
        profile_url (str): URL of the profile to scrape.

    Returns:
        dict: Keys 'url', 'name', 'description', 'employment_status',
        'status' and 'email'. All keys are always present; a field that could
        not be read keeps the empty-string default.
    """
    # Start from a complete record so callers never hit a missing key
    staff_data = {
        'url': profile_url,
        'name': "",
        'description': "",
        'employment_status': "",
        'status': "",
        'email': "",
    }

    # Each thread uses a separate WebDriver instance
    local_driver = webdriver.Chrome(options=chrome_options)
    try:
        # Loading the page inside the try guarantees the browser is released
        # by the finally block even if navigation itself fails.
        local_driver.get(profile_url)
        time.sleep(2)  # Wait for the page to load completely

        # Extract details from the profile page
        staff_data['name'] = local_driver.find_element(By.CSS_SELECTOR, "h3").text
        staff_data['description'] = local_driver.find_element(By.CSS_SELECTOR, ".title.mb-3 span").text
        staff_data['employment_status'] = local_driver.find_element(By.CSS_SELECTOR, ".fa-solid.fa-circle-dashed + span").text
        staff_data['status'] = local_driver.find_element(By.CSS_SELECTOR, ".fa-user-tie + span").text
        staff_data['email'] = local_driver.find_element(By.CSS_SELECTOR, ".fa-envelope + span").text

        # Print extracted data for debugging (only after a complete extraction)
        print(f"\nScraped Profile:\nURL: {staff_data['url']}\nName: {staff_data['name']}\nDescription: {staff_data['description']}\nEmployment Status: {staff_data['employment_status']}\nStatus: {staff_data['status']}\nEmail: {staff_data['email']}")
    except Exception as e:
        # Handle errors and log the issue; fields read so far are kept
        print(f"Error extracting data from {profile_url}: {str(e)}")
    finally:
        # Close the WebDriver instance to free up resources
        local_driver.quit()

    return staff_data

# Function to gather the profile links present on the current page
def scrape_profile_links():
    """
    Return the href attribute of each ".faculty_block a" anchor found on the
    page currently loaded in the module-level driver.

    Returns:
        list: The href values, one per matching anchor.
    """
    time.sleep(2)  # Fixed pause before the page is queried
    anchors = driver.find_elements(By.CSS_SELECTOR, ".faculty_block a")
    return list(map(lambda anchor: anchor.get_attribute("href"), anchors))

# Function to dynamically scrape all pages and dispatch tasks
def scrape_all_pages_with_threads():
    """
    Scrapes profile links from all pages and processes them using multi-threading.

    Profile scraping tasks are submitted to the pool while pagination is
    still in progress. Pagination ends when a page yields no links or when
    advancing to the next page raises.

    Returns:
        list: The dictionaries returned by scrape_staff_profile, in
        completion order. Previously the results were discarded; callers that
        ignore the return value are unaffected.
    """
    page_number = 1  # Start with the first page
    submitted_links = set()  # Links already dispatched, to avoid duplicate work
    results = []  # Scraped records, in completion order

    # Use ThreadPoolExecutor for concurrent scraping
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []  # List to store future tasks

        while True:
            print(f"Scraping page {page_number} for URLs...")

            # Extract profile links from the current page
            profile_links = scrape_profile_links()

            # Break if no profiles are found, indicating the end of pages
            if not profile_links:
                break

            # Submit one task per new link; anchors without an href and links
            # already dispatched are skipped, since every task starts its own
            # browser.
            for link in profile_links:
                if not link or link in submitted_links:
                    continue
                submitted_links.add(link)
                futures.append(executor.submit(scrape_staff_profile, link))

            # Navigate to the next page if available
            try:
                driver.execute_script(f"staff({page_number})")  # Simulate pagination
                page_number += 1
                time.sleep(5)  # Allow time for the next page to load
            except Exception as e:
                print(f"Error navigating to page {page_number}: {e}")
                break

        # Wait for all scraping tasks to complete and keep their results
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error in a thread: {e}")

    return results

# Main function to control the workflow
def main():
    """
    Orchestrates the scraping process by loading the main site,
    scraping profile links, and processing the profiles using threads.

    The primary WebDriver is always closed on exit, including when loading
    the site or scraping raises.
    """
    url = "https://www.uet.edu.pk/academics/staff/faculty-members"

    try:
        # Load the website to scrape
        driver.get(url)
        time.sleep(2)  # Wait for the initial page to load

        # Start scraping pages and profiles using threads
        scrape_all_pages_with_threads()
    finally:
        # Close the primary WebDriver instance
        driver.quit()

# Entry point for the script
if __name__ == "__main__":
    main()


Scraping page 1 for URLs...
Scraping page 2 for URLs...
Scraping page 3 for URLs...
Scraping page 4 for URLs...

Scraped Profile: 
URL: https://staff.uet.edu.pk/profile/8
Name: Dr. Khurram Rashid
Description: Professor at Department of Architectural Engineering & Design, Main Campus UET Lahore
Employment Status: ON DUTY
Status: Regular
Email: khuram_ae@uet.edu.pk

Scraped Profile: 
URL: https://staff.uet.edu.pk/profile/17
Name: Mr. Jawad Ahamd Tahir
Description: Assistant Professor at Department of Architecture, Main Campus UET Lahore
Employment Status: ON DUTY
Status: Regular
Email: 

Scraped Profile: 
URL: https://staff.uet.edu.pk/profile/16
Name: Dr. Malik Usman Mehmood Awan
Description: Assistant Professor at Department of Architecture, Main Campus UET Lahore
Employment Status: ON DUTY
Status: Regular
Email: usmanawan@uet.edu.pk

Scraped Profile: 
URL: https://staff.uet.edu.pk/profile/5
Name: Mr. Imran Ahmad Saeed.
Description: Assistant Professor at Department of Architectural Eng



Error extracting data from https://staff.uet.edu.pk/profile/428: HTTPConnectionPool(host='localhost', port=59309): Max retries exceeded with url: /session/dcb77663b40cd140d4a526d0b95b5e26/element/f.DA99E98C9E3D7828DC8A4610B9C69C81.d.FF8D1A80B9020C046A5492DF3F745F9C.e.12/text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fcbcc160ca0>: Failed to establish a new connection: [Errno 111] Connection refused'))


KeyboardInterrupt: 

**Question no 3**



In [None]:
"""
Project: ProConnect360 - Multi-threaded Scraper with CSV Export

Description:
This script scrapes personnel profiles from a paginated website using Selenium and saves the data in a CSV file.
It utilizes multi-threading to scrape multiple profiles concurrently, significantly reducing execution time.
The script dynamically navigates pages, extracts profile links, and gathers details like names, descriptions,
employment status, and emails.

Features:
1. Dynamically scrapes all pages to collect profile links and processes profiles concurrently.
2. Extracts details such as names, descriptions, employment status, and contact emails.
3. Uses multi-threading for efficient data scraping across multiple profiles.
4. Saves the scraped data into a structured CSV format for analysis or sharing.
5. Includes error handling for resilient and robust scraping.

Modules Used:
- Selenium: For web automation and interaction with dynamic web elements.
- concurrent.futures: For multi-threading to handle tasks concurrently.
- pandas: For organizing and saving the scraped data in a CSV file.
- time: For managing delays to ensure proper page loading and mimic human-like behavior.

"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd  # Used for saving data to CSV
import time

# Step 1: Build the Chrome options shared by every browser instance in this script.
# The flags are applied in order:
#   --no-sandbox             disable the sandbox for compatibility
#   --headless               run without a GUI
#   --disable-dev-shm-usage  avoid shared memory issues
#   --disable-gpu            disable GPU rendering in headless mode
chrome_options = Options()
for _chrome_flag in ('--no-sandbox', '--headless', '--disable-dev-shm-usage', '--disable-gpu'):
    chrome_options.add_argument(_chrome_flag)

# Browser used for the listing pages (pagination and link collection)
driver = webdriver.Chrome(options=chrome_options)

# Accumulates one dict per scraped profile
all_scraped_data = []

# Function to scrape a single staff profile
def scrape_staff_profile(profile_url):
    """
    Scrapes detailed information from an individual profile page.

    Every call starts its own Chrome instance (built from `chrome_options`)
    and always shuts it down before returning, including when the page fails
    to load or a field cannot be located.

    Args:
        profile_url (str): URL of the profile to scrape.

    Returns:
        dict: The fields collected before any failure. On full success the
        keys are 'url', 'name', 'description', 'employment_status', 'status'
        and 'email'. The dict is empty when the page could not be loaded.
    """
    # Result key -> CSS selector of the element whose text is stored under it
    field_selectors = (
        ('name', "h3"),
        ('description', ".title.mb-3 span"),
        ('employment_status', ".fa-solid.fa-circle-dashed + span"),
        ('status', ".fa-user-tie + span"),
        ('email', ".fa-envelope + span"),
    )

    staff_data = {}

    # Each call (and therefore each worker thread) uses its own WebDriver instance
    local_driver = webdriver.Chrome(options=chrome_options)
    try:
        # Page loading happens inside the try block so that a navigation error
        # is logged and still reaches quit() below instead of leaking a browser.
        local_driver.get(profile_url)
        time.sleep(2)  # Fixed pause to give the page time to render

        staff_data['url'] = profile_url
        for field, selector in field_selectors:
            staff_data[field] = local_driver.find_element(By.CSS_SELECTOR, selector).text
    except Exception as e:
        # Best-effort: log the problem and return whatever was collected so far
        print(f"Error extracting data from {profile_url}: {str(e)}")
    finally:
        # Quit the WebDriver instance to release resources
        local_driver.quit()

    return staff_data

# Function to scrape profile links from the current page
def scrape_profile_links():
    """
    Scrapes all profile links from the page currently loaded in `driver`.

    Anchors that have no href are skipped, and a URL that appears more than
    once on the page is returned only once (first occurrence wins), so a
    profile is never queued twice for the same page.

    Returns:
        list: Unique profile URLs in page order; empty when none are found.
    """
    time.sleep(2)  # Wait for the page to fully load
    anchors = driver.find_elements(By.CSS_SELECTOR, ".faculty_block a")  # Locate profile links
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(href for href in hrefs if href))

# Function to scrape all pages and dispatch scraping tasks
def scrape_all_pages_with_threads():
    """
    Walks the paginated listing, queues every profile for scraping on a
    thread pool, and collects the results into `all_scraped_data`.

    The loop stops at the first page that yields no profile links, or when
    the pagination script call fails.
    """
    pending = []  # Futures for every submitted profile
    current_page = 1

    # Up to five profiles are scraped concurrently
    with ThreadPoolExecutor(max_workers=5) as pool:
        while True:
            print(f"Scraping page {current_page} for URLs...")

            links = scrape_profile_links()
            if not links:
                # An empty page is treated as the end of the listing
                break

            pending.extend(pool.submit(scrape_staff_profile, url) for url in links)

            # NOTE(review): the page's own staff() function is called with the
            # number of the page just scraped - confirm against the site script
            # that this loads the following page.
            try:
                driver.execute_script(f"staff({current_page})")
            except Exception as e:
                print(f"Error navigating to page {current_page}: {e}")
                break
            else:
                current_page += 1
                time.sleep(5)  # Allow time for the next page to load

        # Gather results in completion order; empty results are dropped
        for finished in as_completed(pending):
            try:
                record = finished.result()
            except Exception as e:
                print(f"Error in a thread: {e}")
            else:
                if record:
                    all_scraped_data.append(record)

# Main function to control the workflow
def main():
    """
    Orchestrates the scraping process by loading the website,
    scraping profile links, and processing profiles using threads.

    The primary browser is always closed, even when scraping raises or is
    interrupted. The CSV file is written only when scraping finishes
    without an exception.
    """
    url = "https://www.uet.edu.pk/academics/staff/faculty-members"

    try:
        # Load the main website
        driver.get(url)
        time.sleep(2)  # Wait for the initial page to load

        # Start scraping pages and profiles
        scrape_all_pages_with_threads()
    finally:
        # Close the primary WebDriver instance on success and on failure alike
        driver.quit()

    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(all_scraped_data)

    # Save the scraped data to a CSV file
    df.to_csv("scraped_profiles.csv", index=False)  # index=False to exclude row numbers in the CSV

    print("Data has been saved to 'scraped_profiles.csv'")

# Entry point for the script
if __name__ == "__main__":
    main()


Scraping page 1 for URLs...
Scraping page 2 for URLs...
Scraping page 3 for URLs...
Scraping page 4 for URLs...
Scraping page 5 for URLs...
Scraping page 6 for URLs...
Scraping page 7 for URLs...
Scraping page 8 for URLs...
Scraping page 9 for URLs...
Scraping page 10 for URLs...
Scraping page 11 for URLs...
Scraping page 12 for URLs...
Scraping page 13 for URLs...
Scraping page 14 for URLs...
Scraping page 15 for URLs...
Scraping page 16 for URLs...
Scraping page 17 for URLs...
Scraping page 18 for URLs...
Scraping page 19 for URLs...
Scraping page 20 for URLs...
Scraping page 21 for URLs...
Scraping page 22 for URLs...
Scraping page 23 for URLs...
Scraping page 24 for URLs...
Scraping page 25 for URLs...
Scraping page 26 for URLs...
Scraping page 27 for URLs...
Scraping page 28 for URLs...
Scraping page 29 for URLs...
Scraping page 30 for URLs...
Scraping page 31 for URLs...
Scraping page 32 for URLs...
Scraping page 33 for URLs...
Scraping page 34 for URLs...
Scraping page 35 for UR

**Project: JobScrape360 - Python Job Scraper for Lahore**

Description:
This script automates the process of scraping Python job postings from Indeed Pakistan.
It extracts information such as job titles, company names, locations, and job descriptions,
and saves the data in both CSV and Excel formats for further analysis.

Approach for printing on screen/console

In [None]:
"""
Project: JobScrape360 - Python Job Scraper for Lahore

Description:
This script automates the process of scraping Python job postings from Indeed Pakistan.
It extracts information such as job titles, company names, locations, and job descriptions,
and saves the data in both CSV and Excel formats for further analysis.

Features:
1. Automates job search for "Python Developer" in Lahore on Indeed.
2. Extracts details such as job title, company name, location, and description.
3. Saves the scraped data to a CSV file and an Excel workbook for convenience.
4. Implements pagination to navigate through all job listings.
5. Simulates human-like behavior with randomized delays to avoid bot detection.

Modules Used:
- Selenium: For browser automation and interaction with dynamic elements.
- random: To add randomized delays for human-like interactions.
- csv: To write scraped data to a structured CSV file.
- openpyxl: To save data into an Excel workbook.
- time: For managing delays during scraping.

"""

import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook

# Helper function to simulate human-like random delays
def random_sleep(min_time=2, max_time=5):
    """
    Pauses for a random duration to mimic human interaction.

    Args:
        min_time (int | float): Lower bound of the pause, in seconds.
        max_time (int | float): Upper bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(min_time, max_time)
    time.sleep(pause_seconds)

# Step 1: Set up the WebDriver (Chrome in this case)
# ChromeDriverManager().install() provides the chromedriver passed to Service,
# so no driver binary has to be configured by hand.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.implicitly_wait(10)  # Element lookups keep retrying for up to 10 seconds

# Step 2: Create a CSV file and write the headers
# The handle is opened without a context manager and stays open for the rest of
# the script; it has to be closed explicitly once scraping is done.
# newline='' is the setting the csv module expects for files it writes to.
csv_file = open('lahore_python_jobs.csv', mode='w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Job Title', 'Company Name', 'Location', 'Job Description'])  # Header row

# Step 3: Create an Excel workbook and add headers
# The workbook exists only in memory until wb.save() is called.
wb = Workbook()
ws = wb.active
ws.title = "Python Jobs"
ws.append(['Job Title', 'Company Name', 'Location', 'Job Description'])  # Same columns as the CSV

# Step 4: Navigate to Indeed Pakistan job search page
driver.get("https://pk.indeed.com/")
random_sleep(3, 6)

# Step 5: Enter the job title in the search box
job_search_box = driver.find_element(By.ID, "text-input-what")
for char in "Python Developer":  # One key press at a time, with a short random pause after each
    job_search_box.send_keys(char)
    random_sleep(0.1, 0.3)

# Step 6: Enter the location in the location search box
location_search_box = driver.find_element(By.ID, "text-input-where")
location_search_box.clear()  # Remove any pre-filled location (NOTE(review): confirm clear() empties this field)
random_sleep(1, 2)
location_search_box.send_keys("Lahore")  # Set the location to Lahore
random_sleep(1, 2)

# Submit the search by pressing ENTER
location_search_box.send_keys(Keys.ENTER)
random_sleep(5, 7)  # Give the results page time to load

# Function to scrape jobs from the current page
def scrape_jobs_from_page():
    """
    Scrapes job details from the current page and writes them to CSV and Excel files.

    For each job card, the title link is clicked; the title, company,
    location and description are read from the opened posting; one row is
    written through `csv_writer` and appended to `ws`; then the browser goes
    back to the results list. A failure on one card is logged and the loop
    moves on to the next card.
    """
    total_cards = len(driver.find_elements(By.CLASS_NAME, 'job_seen_beacon'))  # Locate job postings

    for position in range(total_cards):
        try:
            # Look the cards up again on every pass: handles obtained before
            # driver.back() may be stale once the results list is re-rendered.
            cards = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
            if position >= len(cards):
                break  # The results list shrank or is no longer displayed

            # Click the job title link to open the job description
            cards[position].find_element(By.CSS_SELECTOR, 'a.jcs-JobTitle').click()
            random_sleep(3, 5)

            # Extract job details
            job_title_on_page = driver.find_element(By.XPATH, "//h2[contains(@class,'jobsearch-JobInfoHeader-title')]").text
            company_name = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyName']").text
            job_location = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyLocation']").text
            job_description = driver.find_element(By.ID, "jobDescriptionText").text

            # The same row goes to both outputs
            row = [job_title_on_page, company_name, job_location, job_description]
            csv_writer.writerow(row)
            ws.append(row)

            # Navigate back to the main job list
            driver.back()
            random_sleep(5, 7)
        except Exception as e:
            # Log any errors encountered during scraping and try the next card
            print(f"Failed to process job due to {str(e)}")
            continue

# Function to navigate to the next page
def go_to_next_page():
    """
    Navigates to the next page of job postings if available.

    Any WebDriver failure while locating or clicking the "Next Page" link is
    treated as the end of pagination. KeyboardInterrupt and SystemExit are
    not intercepted, so the script can still be stopped from the keyboard.

    Returns:
        bool: True if next page is navigated, False otherwise.
    """
    try:
        # Locate and click the "Next Page" button
        next_button = driver.find_element(By.CSS_SELECTOR, "a[data-testid='pagination-page-next']")
        next_button.click()
        random_sleep(5, 7)
        return True
    except Exception:
        # If the "Next Page" button cannot be used, log the end of pagination
        print("No more pages to navigate.")
        return False

# Step 7: Loop through all pages and scrape jobs
# Step 8 runs in `finally` so that the browser is closed and the rows collected
# so far are written to disk even when scraping raises or is interrupted.
try:
    while True:
        scrape_jobs_from_page()  # Scrape jobs on the current page
        if not go_to_next_page():  # Attempt to go to the next page
            break
finally:
    # Step 8: Clean up and save files
    try:
        driver.quit()  # Close the browser
    finally:
        # Still persist the data if closing the browser fails
        csv_file.close()  # Close the CSV file
        wb.save("lahore_python_jobs.xlsx")  # Save the Excel workbook

print("Job scraping completed. Data saved to 'lahore_python_jobs.csv' and 'lahore_python_jobs.xlsx'.")


In [None]:
"""
Project: JobScrape360 - Save Job Listings to CSV and Excel

Description:
This script automates the process of scraping Python job listings from Indeed Pakistan for Lahore.
It extracts details such as job titles, company names, locations, and job descriptions,
and saves the data into both a CSV file and an Excel workbook for analysis.

Features:
1. Automates job search for "Python Developer" in Lahore on Indeed.
2. Extracts comprehensive details for each job posting.
3. Saves the scraped data to a structured CSV file and an Excel workbook.
4. Implements pagination to scrape job listings across multiple pages.
5. Simulates human-like interaction with random delays to reduce detection.

Modules Used:
- Selenium: For browser automation and interaction with dynamic elements.
- random: To add randomized delays for human-like interactions.
- csv: To save scraped data in CSV format.
- openpyxl: To save data into an Excel workbook.
- time: For managing delays during scraping.

"""

import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook

# Helper function to simulate human-like random delays
def random_sleep(min_time=2, max_time=5):
    """
    Sleeps for a randomly chosen interval so that actions are not evenly spaced.

    Args:
        min_time (int | float): Shortest possible delay in seconds.
        max_time (int | float): Longest possible delay in seconds.
    """
    delay = random.uniform(min_time, max_time)  # Drawn fresh on every call
    time.sleep(delay)

# Step 1: Set up the WebDriver (Chrome in this case)
# Using ChromeDriverManager to ensure the correct driver version is installed automatically.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.implicitly_wait(10)  # Element lookups keep retrying for up to 10 seconds

# Step 2: Create a CSV file and write the headers
# The file is opened in 'w' mode, so an existing lahore_python_jobs.csv is overwritten.
# The handle is not managed by a `with` block and must be closed explicitly later.
csv_file = open('lahore_python_jobs.csv', mode='w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Job Title', 'Company Name', 'Location', 'Job Description'])  # Header row

# Step 3: Create an Excel workbook and write the headers
# Rows appended to `ws` are kept in memory until wb.save() is called.
wb = Workbook()
ws = wb.active
ws.title = "Python Jobs"
ws.append(['Job Title', 'Company Name', 'Location', 'Job Description'])  # Same columns as the CSV

# Step 4: Navigate to the Indeed Pakistan job search page
driver.get("https://pk.indeed.com/")
random_sleep(3, 6)

# Step 5: Enter the job title in the search box
job_search_box = driver.find_element(By.ID, "text-input-what")
for char in "Python Developer":  # Simulate typing character by character
    job_search_box.send_keys(char)
    random_sleep(0.1, 0.3)  # Short random pause between key presses

# Step 6: Enter the location in the search box
location_search_box = driver.find_element(By.ID, "text-input-where")
location_search_box.clear()  # Remove any pre-filled location (NOTE(review): confirm clear() empties this field)
random_sleep(1, 2)
location_search_box.send_keys("Lahore")  # Set the location to Lahore
random_sleep(1, 2)

# Submit the search by pressing ENTER
location_search_box.send_keys(Keys.ENTER)
random_sleep(5, 7)  # Give the results page time to load

# Function to scrape job details from the current page
def scrape_jobs_from_page():
    """
    Scrapes job details from the current page and saves them to both CSV and Excel files.

    Each job card is opened through its title link, its title, company,
    location and description are read, and one row is written to
    `csv_writer` and to `ws` before the browser returns to the listings.
    An error on a single card is logged and does not stop the remaining cards.
    """
    card_count = len(driver.find_elements(By.CLASS_NAME, 'job_seen_beacon'))  # Locate job postings

    for index in range(card_count):
        try:
            # Re-locate the cards each time: references found before
            # driver.back() can become stale when the listings are re-rendered.
            job_cards = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
            if index >= len(job_cards):
                break  # Fewer cards than at the start; nothing left to open

            # Click the job title link to open the job description
            job_title_link = job_cards[index].find_element(By.CSS_SELECTOR, 'a.jcs-JobTitle')
            job_title_link.click()
            random_sleep(3, 5)

            # Extract job details from the job description page
            job_title = driver.find_element(By.XPATH, "//h2[contains(@class,'jobsearch-JobInfoHeader-title')]").text
            company_name = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyName']").text
            job_location = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyLocation']").text
            job_description = driver.find_element(By.ID, "jobDescriptionText").text

            # Write the extracted details to the CSV file and the Excel sheet
            details = [job_title, company_name, job_location, job_description]
            csv_writer.writerow(details)
            ws.append(details)

            # Navigate back to the job listings page
            driver.back()
            random_sleep(5, 7)
        except Exception as e:
            # Log any errors encountered during scraping and continue with the next card
            print(f"Failed to process job due to: {str(e)}")
            continue

# Function to navigate to the next page of job listings
def go_to_next_page():
    """
    Navigates to the next page of job postings if available.

    Failing to find or click the "Next Page" link is taken to mean that the
    last page has been reached. Only `Exception` subclasses are handled, so
    Ctrl+C (KeyboardInterrupt) is not mistaken for the end of pagination.

    Returns:
        bool: True if successfully navigated to the next page, False otherwise.
    """
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "a[data-testid='pagination-page-next']")
        next_button.click()  # Click the "Next Page" button
        random_sleep(5, 7)
        return True
    except Exception:
        # If the "Next Page" button cannot be used, assume the end of pagination
        print("No more pages to navigate.")
        return False

# Step 7: Loop through all pages and scrape job listings
# The clean-up of Step 8 is placed in `finally` so that an error or a manual
# interruption still closes the browser and keeps the rows gathered so far.
try:
    while True:
        scrape_jobs_from_page()  # Scrape jobs on the current page
        if not go_to_next_page():  # Navigate to the next page, if available
            break
finally:
    # Step 8: Clean up and save the data
    try:
        driver.quit()  # Close the browser
    finally:
        # Runs even if quitting the browser raises
        csv_file.close()  # Close the CSV file
        wb.save("lahore_python_jobs.xlsx")  # Save the Excel workbook

print("Job scraping completed. Data saved to 'lahore_python_jobs.csv' and 'lahore_python_jobs.xlsx'.")
