# install library

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os


## Web Crawling Process

### Sending a Request to the Website
In this step, we send an HTTP request to the target website to ensure the connection is successful and retrieve the HTML content of the main page.


In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}


url = "https://muqawil.org/en/contractors"
response = requests.get(url, headers=headers)

# Print the response status
print(response.status_code)


200


### Extracting Contractor Links
Here, we parse the HTML content of the main page to find and collect all the contractor links for further data extraction.

In [3]:
soup = BeautifulSoup(response.text, 'html.parser')

# Step 1: Extract all contractor links
links = []
for a_tag in soup.find_all('a', href=True):
    if "/contractors/" in a_tag['href']:

        full_link =  a_tag['href']
        links.append(full_link)

# Print number of links
print(f"Found {len(links)} contractor pages.")


Found 41 contractor pages.


### Removing Duplicate Links
In this step, we ensure that the list of contractor links contains only unique entries by converting the list into a set and then back into a list. This avoids processing the same link multiple times.

In [4]:
# To remove duplicate liks
links = list(set(links))

# print links to check
for link in links:
    print(link)
print(f"Found {len(links)} contractor pages.")


https://muqawil.org/en/contractors/20023122/143
https://muqawil.org/en/contractors/20001440/143
https://muqawil.org/en/contractors/20005421/143
https://muqawil.org/en/contractors/map
https://muqawil.org/en/contractors/20001947/143
https://muqawil.org/en/contractors/20044037/143
https://muqawil.org/en/contractors/959/143
https://muqawil.org/en/contractors/20015654/143
https://muqawil.org/en/contractors/20006199/143
https://muqawil.org/en/contractors/20014964/143
https://muqawil.org/en/contractors/20019229/143
https://muqawil.org/en/contractors/20011010/143
https://muqawil.org/en/contractors/20001577/143
https://muqawil.org/en/contractors/20002330/143
https://muqawil.org/en/contractors/20060304/143
https://muqawil.org/en/contractors/20008518/143
https://muqawil.org/en/contractors/20063249/143
https://muqawil.org/en/contractors/20057392/143
https://muqawil.org/en/contractors/20053278/143
https://muqawil.org/en/contractors/20055348/143
https://muqawil.org/en/contractors/20010655/143
Found 

### Collect Contractor Information

In this step, we extract the necessary data fields from each contractor's individual page.


In [5]:
# Initialize a list to store contractor data
contractor_data = []

# Step 2: Loop through each link to extract data
for link in links:
    try:
        response = requests.get(link, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract data
        name = soup.find('h3', class_='card-title').text.strip()

        membership_number = None
        membership_type = None
        company_size = None
        training_hours = None
        city = None
        region = None
        organization_mobile = None
        organization_email = None
        address = None

        # Find all "info-value" elements and match them with their labels
        info_blocks = soup.find_all('div', class_='info-box')
        for block in info_blocks:
            label = block.find('div', class_='info-name').text.strip()
            value = block.find('div', class_='info-value').text.strip()
            email_block = soup.find('div', class_='info-value')


            if label == "Membership Number":
                membership_number = value
            elif label == "Membership":
                membership_type = value
            elif label == "Company Size":
                company_size = value
            elif label == "Training credit hours":
                training_hours = value
            elif label == "City":
                city = value
            elif label == "Region":
                region = value
            elif label == "Organization Mobile Number":
                organization_mobile = value
            elif label == "Organization Email":
                organization_email = value
            elif label == "Address":
                address = value



        # Extract Interests
        interests_section = soup.find('h3', text='Interests')
        if interests_section:
            interests_list = interests_section.find_next('ul', class_='list-numerical')
            if interests_list:
                interests = [li.text.strip() for li in interests_list.find_all('li')]



        # Append the contractor's data to the list
        contractor_data.append({
            "Company Name": name,
            "Membership Number": membership_number,
            "Membership Type": membership_type,
            "Company Size": company_size,
            "Training Credit Hours": training_hours,
            "City": city,
            "Region": region,
            "Organization Mobile Number": organization_mobile,
            "Organization Email": organization_email,
            "Address": address,
            "Interests": ", ".join(interests) if interests else None
        })

        # Add a delay to avoid overloading the server
        time.sleep(1)

    except Exception as e:
        print(f"Error scraping {link}: {e}")
        continue

Error scraping https://muqawil.org/en/contractors/map: 'NoneType' object has no attribute 'text'


In [6]:
# Get current directory
df_contractor_data = pd.DataFrame(contractor_data)

current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()

# Define output path
output_dir = os.path.join(current_dir, "raw_data")
output_path = os.path.join(output_dir, "Full_contractors_data.csv")

# Create folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save scraped data to CSV
df_contractor_data.to_csv(output_path, index=False)

# Print confirmation
print("Data saved to:", output_path)


Data saved to: /Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/notebooks/raw_data/Full_contractors_data.csv


In [7]:
df_contractor_data.head()

Unnamed: 0,Company Name,Membership Number,Membership Type,Company Size,Training Credit Hours,City,Region,Organization Mobile Number,Organization Email,Address,Interests
0,Gulf Pioneers Trading Company,280728074,Saudi Contractor,Medium Company Size,9 h,RIYADH,Riyadh,112885557,[email protected],ALtahlia Street\r\nOlaya\r\nRiyadh\r\n54355\r\...,"Construction of buildings, Construction of bui..."
1,Ratel Al Sharq Contracting Company,101010104,Saudi Contractor,Small Company Size,0 h,RIYADH,Riyadh,542332224,[email protected],Riyadh - Al Narjis District - Anas Bin Malik Road,"Construction of buildings, Construction of bui..."
2,Bunoon Wa Funoon Contracting Co.,138113818,Saudi Contractor,Small Company Size,41 h,RIYADH,Riyadh,920015058,[email protected],"6394, Al Olaya\r\n3891, Al Olaya Dist.\r\nRIYA...","Construction of buildings, Construction of bui..."
3,Al-Wessam Contracting Company,106110611,Saudi Contractor,Small Company Size,93 h,BILJURASHI,Bahah,505643337,[email protected],Al Wessam Contracting Company,"Mining support services, Oil and natural gas e..."
4,TAKWEEN AND CONSTRUCTION,438043804,Saudi Contractor,Very Small Company Size,0 h,RIYADH,Riyadh,503047700,[email protected],.,"Construction of buildings, Construction of bui..."


#### Handling Protected Emails

In [8]:
# Regular expression for email validation
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Initialize WebDriver with WebDriver Manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

email_data = []

try:
    for link in links:
        print(f"Processing: {link}")
        # Open the URL
        driver.get(link)

        # Extract company name
        try:
            company_name_element = driver.find_element(By.CLASS_NAME, "card-title")
            company_name = company_name_element.text.strip()
        except Exception as e:
            company_name = None
            print(f"Company Name not found for {link}: {e}")

        # Find all elements with the class that might contain the email
        elements = driver.find_elements(By.CLASS_NAME, "info-value")
        organization_email = None

        # Loop through the elements and look for valid emails
        for element in elements:
            text = element.text.strip()
            if re.match(EMAIL_REGEX, text):
                organization_email = text
                break  # Stop after finding the first email

        # Append data to the list
        email_data.append({
            "Company Name": company_name,
            "Organization Email": organization_email
        })

        time.sleep(1)

except Exception as e:
    print(f"Error: {e}")
finally:
    driver.quit()


# Dynamically determine the path to the 'emails_data.csv' file
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
output_path = os.path.join(current_dir, "raw_data", "emails_data.csv")

# Create a DataFrame for email data and save it to a CSV file
emails_df = pd.DataFrame(email_data)
emails_df.to_csv(output_path, index=False)

print("Emails saved to:", output_path)

Processing: https://muqawil.org/en/contractors/20023122/143
Processing: https://muqawil.org/en/contractors/20001440/143
Processing: https://muqawil.org/en/contractors/20005421/143
Processing: https://muqawil.org/en/contractors/map
Company Name not found for https://muqawil.org/en/contractors/map: Message: no such element: Unable to locate element: {"method":"css selector","selector":".card-title"}
  (Session info: chrome=131.0.6778.140); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x000000010ca7be12 chromedriver + 6696466
1   chromedriver                        0x000000010ca73c2a chromedriver + 6663210
2   chromedriver                        0x000000010c47be3e chromedriver + 405054
3   chromedriver                        0x000000010c4cc815 chromedriver + 735253
4   chromedriver                        0x000000010c4ccd11 chromedriver + 7

In [9]:
# Dynamically determine the path to the 'emails_data.csv' file
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
input_path = os.path.join(current_dir, "raw_data", "emails_data.csv")

# Read the CSV file using the relative path
email = pd.read_csv(input_path)
email

Unnamed: 0,Company Name,Organization Email
0,Gulf Pioneers Trading Company,Acc@Gpksa.Com
1,Ratel Al Sharq Contracting Company,Adel_77@Hotmail.Com
2,Bunoon Wa Funoon Contracting Co.,Info@Bfconst.Com
3,,
4,Al-Wessam Contracting Company,Alwessam7@Gmail.Com
5,TAKWEEN AND CONSTRUCTION,B2B@Tbco.Com.Sa
6,Alenjazat Contracting Company,Info@Alenjazat.Sa
7,Arabian Towers Projects Contracting Company,Atpcosecretary@Atpco-Sa.Com
8,Sharjah Development Contracting Co,Alsharqimna@Gmail.Com
9,Inmayoun Contracting Company,Inmayoun@Gmail.Com


#### Updating Organization Emails Based on Extracted Data

In [10]:
# Merge the two DataFrames based on 'Company Name'
merged_data = df_contractor_data.merge(emails_df[['Company Name', 'Organization Email']],
                                     on='Company Name',
                                     how='left',
                                     suffixes=('', '_new'))

# Replace the old 'Organization Email' with the new one
merged_data['Organization Email'] = merged_data['Organization Email_new']

# Drop the temporary column used for the merge
merged_data.drop(columns=['Organization Email_new'], inplace=True)

# Dynamically determine the path to save the updated data
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
output_path = os.path.join(current_dir, "raw_data", "Updated_contractors_data.csv")

# Save the updated contractors data to a new CSV file
merged_data.to_csv(output_path, index=False)

print("Updated data saved to Updated_contractors_data.csv")


Updated data saved to Updated_contractors_data.csv


In [11]:
# Dynamically determine the path to the 'Updated_contractors_data.csv' file
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
input_path = os.path.join(current_dir, "raw_data", "Updated_contractors_data.csv")

# Read the CSV file using the relative path
Updated_contractors_data = pd.read_csv(input_path)

# Display the DataFrame
Updated_contractors_data


Unnamed: 0,Company Name,Membership Number,Membership Type,Company Size,Training Credit Hours,City,Region,Organization Mobile Number,Organization Email,Address,Interests
0,Gulf Pioneers Trading Company,280728074,Saudi Contractor,Medium Company Size,9 h,RIYADH,Riyadh,112885557.0,Acc@Gpksa.Com,ALtahlia Street\r\nOlaya\r\nRiyadh\r\n54355\r\...,"Construction of buildings, Construction of bui..."
1,Ratel Al Sharq Contracting Company,101010104,Saudi Contractor,Small Company Size,0 h,RIYADH,Riyadh,542332224.0,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,"Construction of buildings, Construction of bui..."
2,Bunoon Wa Funoon Contracting Co.,138113818,Saudi Contractor,Small Company Size,41 h,RIYADH,Riyadh,920015058.0,Info@Bfconst.Com,"6394, Al Olaya\r\n3891, Al Olaya Dist.\r\nRIYA...","Construction of buildings, Construction of bui..."
3,Al-Wessam Contracting Company,106110611,Saudi Contractor,Small Company Size,93 h,BILJURASHI,Bahah,505643337.0,Alwessam7@Gmail.Com,Al Wessam Contracting Company,"Mining support services, Oil and natural gas e..."
4,TAKWEEN AND CONSTRUCTION,438043804,Saudi Contractor,Very Small Company Size,0 h,RIYADH,Riyadh,503047700.0,B2B@Tbco.Com.Sa,.,"Construction of buildings, Construction of bui..."
5,Alenjazat Contracting Company,10000937,Saudi Contractor,Medium Company Size,0 h,RIYADH,Riyadh,112001858.0,Info@Alenjazat.Sa,Riyadh- Ishbeliah district- alimam abdullah bi...,"Construction of buildings, Construction of bui..."
6,Arabian Towers Projects Contracting Company,220622064,Saudi Contractor,Medium Company Size,0 h,DAMMAM,Eastern Province,599999999.0,Atpcosecretary@Atpco-Sa.Com,Arabian Towers Projects Contracting Company-76...,"Construction of buildings, Construction of bui..."
7,Sharjah Development Contracting Co,144214428,Saudi Contractor,Small Company Size,0 h,AL KHOBAR,Eastern Province,138141668.0,Alsharqimna@Gmail.Com,(ABABTAIN TOWER - ALAGRABIA -THIRD FLOOR - OFF...,"Construction of buildings, Construction of bui..."
8,Inmayoun Contracting Company,215421544,Saudi Contractor,Medium Company Size,3 h,RIYADH,Riyadh,555090521.0,Inmayoun@Gmail.Com,abdelaziz,"Construction of buildings, Construction of bui..."
9,Dome Park Contracting Company,251725170,Saudi Contractor,Small Company Size,0 h,AL MUWAYH AL JADID,Makkah,553518088.0,Vv.Com838@Icloud.Com,um aldom,"Construction of buildings, Construction of bui..."


### Extracting Required Data Only


In [12]:
# Define the required columns based on the requested fields
required_columns = [
    "Company Name",
    "Membership Number",
    "Company Size",
    "Organization Email",
    "Address",
    "City",
    "Region",
    "Interests"
]

# Filter the DataFrame to include only the required columns
filtered_df = Updated_contractors_data[required_columns]

# Dynamically determine the path to save the filtered data
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
output_path = os.path.join(current_dir, "raw_data", "filtered_contractors_data.csv")

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv(output_path, index=False)

print("Filtered data saved to filtered_contractors_data.csv")


Filtered data saved to filtered_contractors_data.csv


In [13]:
# Dynamically determine the path to the 'filtered_contractors_data.csv' file
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
input_path = os.path.join(current_dir, "raw_data", "filtered_contractors_data.csv")

# Read the CSV file using the relative path
df = pd.read_csv(input_path)


In [14]:
df.head()

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Gulf Pioneers Trading Company,280728074,Medium Company Size,Acc@Gpksa.Com,ALtahlia Street\r\nOlaya\r\nRiyadh\r\n54355\r\...,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
1,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Bunoon Wa Funoon Contracting Co.,138113818,Small Company Size,Info@Bfconst.Com,"6394, Al Olaya\r\n3891, Al Olaya Dist.\r\nRIYA...",RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
4,TAKWEEN AND CONSTRUCTION,438043804,Very Small Company Size,B2B@Tbco.Com.Sa,.,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."


### Dealing with missing values

In [15]:
# Replace NaN values with 'Not Available'
df.fillna('Not Available', inplace=True)


In [16]:
df = df.replace('.', 'Not Available')
df = df.replace('No Data', 'Not Available')


In [17]:
df.head()

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Gulf Pioneers Trading Company,280728074,Medium Company Size,Acc@Gpksa.Com,ALtahlia Street\r\nOlaya\r\nRiyadh\r\n54355\r\...,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
1,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Bunoon Wa Funoon Contracting Co.,138113818,Small Company Size,Info@Bfconst.Com,"6394, Al Olaya\r\n3891, Al Olaya Dist.\r\nRIYA...",RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
4,TAKWEEN AND CONSTRUCTION,438043804,Very Small Company Size,B2B@Tbco.Com.Sa,Not Available,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."


In [18]:
# Dynamically determine the path to save 'filtered_contractors_data.csv'
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
output_path = os.path.join(current_dir, "raw_data", "filtered_contractors_data.csv")

# Save the DataFrame to a CSV file using the relative path
df.to_csv(output_path, index=False)

print("Data saved to:", output_path)


Data saved to: /Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/notebooks/raw_data/filtered_contractors_data.csv


In [19]:
# Dynamically determine the path to 'filtered_contractors_data.csv'
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
input_path = os.path.join(current_dir, "raw_data", "filtered_contractors_data.csv")

# Load the CSV file using the relative path
df2 = pd.read_csv(input_path)


In [20]:
df2.head()

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Gulf Pioneers Trading Company,280728074,Medium Company Size,Acc@Gpksa.Com,ALtahlia Street\r\nOlaya\r\nRiyadh\r\n54355\r\...,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
1,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Bunoon Wa Funoon Contracting Co.,138113818,Small Company Size,Info@Bfconst.Com,"6394, Al Olaya\r\n3891, Al Olaya Dist.\r\nRIYA...",RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
4,TAKWEEN AND CONSTRUCTION,438043804,Very Small Company Size,B2B@Tbco.Com.Sa,Not Available,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
