# install library

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


## Web Crawling Process

### Sending a Request to the Website
In this step, we send an HTTP request to the target website to ensure the connection is successful and retrieve the HTML content of the main page.


In [3]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}


url = "https://muqawil.org/en/contractors"
response = requests.get(url, headers=headers)

# Print the response status
print(response.status_code)


200


### Extracting Contractor Links
Here, we parse the HTML content of the main page to find and collect all the contractor links for further data extraction.

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

# Step 1: Extract all contractor links
links = []
for a_tag in soup.find_all('a', href=True):
    if "/contractors/" in a_tag['href']:

        full_link =  a_tag['href']
        links.append(full_link)

# Print number of links
print(f"Found {len(links)} contractor pages.")


Found 41 contractor pages.


### Removing Duplicate Links
In this step, we ensure that the list of contractor links contains only unique entries by converting the list into a set and then back into a list. This avoids processing the same link multiple times.

In [5]:
# To remove duplicate liks
links = list(set(links))

# print links to check
for link in links:
    print(link)
print(f"Found {len(links)} contractor pages.")


https://muqawil.org/en/contractors/20001947/143
https://muqawil.org/en/contractors/20014964/143
https://muqawil.org/en/contractors/20008518/143
https://muqawil.org/en/contractors/20045121/143
https://muqawil.org/en/contractors/20001440/143
https://muqawil.org/en/contractors/20019229/143
https://muqawil.org/en/contractors/20055348/143
https://muqawil.org/en/contractors/20010655/143
https://muqawil.org/en/contractors/20015654/143
https://muqawil.org/en/contractors/20026732/143
https://muqawil.org/en/contractors/20023122/143
https://muqawil.org/en/contractors/20001577/143
https://muqawil.org/en/contractors/20053278/143
https://muqawil.org/en/contractors/20044037/143
https://muqawil.org/en/contractors/20057392/143
https://muqawil.org/en/contractors/20060304/143
https://muqawil.org/en/contractors/20006199/143
https://muqawil.org/en/contractors/959/143
https://muqawil.org/en/contractors/map
https://muqawil.org/en/contractors/20005421/143
https://muqawil.org/en/contractors/20002330/143
Found 

### Collect Contractor Information

In this step, we extract the necessary data fields from each contractor's individual page.


In [6]:
# Initialize a list to store contractor data
contractor_data = []

# Step 2: Loop through each link to extract data
for link in links:
    try:
        response = requests.get(link, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract data
        name = soup.find('h3', class_='card-title').text.strip()

        membership_number = None
        membership_type = None
        company_size = None
        training_hours = None
        city = None
        region = None
        organization_mobile = None
        organization_email = None
        address = None

        # Find all "info-value" elements and match them with their labels
        info_blocks = soup.find_all('div', class_='info-box')
        for block in info_blocks:
            label = block.find('div', class_='info-name').text.strip()
            value = block.find('div', class_='info-value').text.strip()
            email_block = soup.find('div', class_='info-value')


            if label == "Membership Number":
                membership_number = value
            elif label == "Membership":
                membership_type = value
            elif label == "Company Size":
                company_size = value
            elif label == "Training credit hours":
                training_hours = value
            elif label == "City":
                city = value
            elif label == "Region":
                region = value
            elif label == "Organization Mobile Number":
                organization_mobile = value
            elif label == "Organization Email":
                organization_email = value
            elif label == "Address":
                address = value



        # Extract Interests
        interests_section = soup.find('h3', text='Interests')
        if interests_section:
            interests_list = interests_section.find_next('ul', class_='list-numerical')
            if interests_list:
                interests = [li.text.strip() for li in interests_list.find_all('li')]



        # Append the contractor's data to the list
        contractor_data.append({
            "Company Name": name,
            "Membership Number": membership_number,
            "Membership Type": membership_type,
            "Company Size": company_size,
            "Training Credit Hours": training_hours,
            "City": city,
            "Region": region,
            "Organization Mobile Number": organization_mobile,
            "Organization Email": organization_email,
            "Address": address,
            "Interests": ", ".join(interests) if interests else None
        })

        # Add a delay to avoid overloading the server
        time.sleep(1)

    except Exception as e:
        print(f"Error scraping {link}: {e}")
        continue

Error scraping https://muqawil.org/en/contractors/map: 'NoneType' object has no attribute 'text'


In [7]:
# Save data to a CSV file
dfـcontractor_data = pd.DataFrame(contractor_data)
dfـcontractor_data.to_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/Full_contractors_data.csv', index=False)
print("Data saved to Full_contractors_data.csv")


Data saved to Full_contractors_data.csv


In [8]:
# dfـcontractor_data= pd.read_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/Full_contractors_data.csv')
dfـcontractor_data

Unnamed: 0,Company Name,Membership Number,Membership Type,Company Size,Training Credit Hours,City,Region,Organization Mobile Number,Organization Email,Address,Interests
0,Al-Wessam Contracting Company,106110611,Saudi Contractor,Small Company Size,93 h,BILJURASHI,Bahah,505643337.0,[email protected],Al Wessam Contracting Company,"Mining support services, Oil and natural gas e..."
1,Inmayoun Contracting Company,215421544,Saudi Contractor,Medium Company Size,3 h,RIYADH,Riyadh,555090521.0,[email protected],abdelaziz,"Construction of buildings, Construction of bui..."
2,Awared General Contracting Company,160916095,Saudi Contractor,Small Company Size,90 h,RIYADH,Riyadh,112090111.0,[email protected],Riyadh - alezdehar District -,"Construction of buildings, Construction of bui..."
3,acn solutions for contracting,446844684,Saudi Contractor,Very Small Company Size,0 h,JEDDAH,Makkah,126688055.0,[email protected],jeddah al shiraa dis.,"Waste collection, treatment & disposal activit..."
4,Ratel Al Sharq Contracting Company,101010104,Saudi Contractor,Small Company Size,0 h,RIYADH,Riyadh,542332224.0,[email protected],Riyadh - Al Narjis District - Anas Bin Malik Road,"Construction of buildings, Construction of bui..."
5,Dome Park Contracting Company,251725170,Saudi Contractor,Small Company Size,0 h,AL MUWAYH AL JADID,Makkah,553518088.0,[email protected],um aldom,"Construction of buildings, Construction of bui..."
6,Al Shallal Al Arabi General Contracting Est,528152815,Saudi Contractor,Very Small Company Size,0 h,TABUK,Tabuk,546028644.0,[email protected],Saudi Arabia / Tabuk / North Faisaliyah / Ghal...,"Mining support services, Oil and natural gas e..."
7,ON Arabia Contracting Company,178417841,Saudi Contractor,Very Small Company Size,0 h,RIYADH,Riyadh,,[email protected],,No Data
8,Arabian Towers Projects Contracting Company,220622064,Saudi Contractor,Medium Company Size,0 h,DAMMAM,Eastern Province,599999999.0,[email protected],Arabian Towers Projects Contracting Company-76...,"Construction of buildings, Construction of bui..."
9,Rawasi Sama Contracting Company,305530557,Saudi Contractor,Very Small Company Size,9 h,HAFAR AL BATIN,Eastern Province,500012488.0,[email protected],"Al-Waleed bin Abdul-Malik, 3090\r\nAl-Masif Di...","Construction of buildings, Construction of bui..."


#### Handling Protected Emails

In [9]:
# Regular expression for email validation
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Initialize WebDriver with WebDriver Manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

email_data = []

try:
    for link in links:
        print(f"Processing: {link}")
        # Open the URL
        driver.get(link)

        # Extract company name
        try:
            company_name_element = driver.find_element(By.CLASS_NAME, "card-title")
            company_name = company_name_element.text.strip()
        except Exception as e:
            company_name = None
            print(f"Company Name not found for {link}: {e}")

        # Find all elements with the class that might contain the email
        elements = driver.find_elements(By.CLASS_NAME, "info-value")
        organization_email = None

        # Loop through the elements and look for valid emails
        for element in elements:
            text = element.text.strip()
            if re.match(EMAIL_REGEX, text):
                organization_email = text
                break  # Stop after finding the first email

        # Append data to the list
        email_data.append({
            "Company Name": company_name,
            "Organization Email": organization_email
        })

        time.sleep(1)

except Exception as e:
    print(f"Error: {e}")
finally:
    driver.quit()


# Create a DataFrame for email data
emails_df = pd.DataFrame(email_data)
emails_df.to_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/emails_data.csv', index=False)
print("Emails saved to emails_data.csv")


Processing: https://muqawil.org/en/contractors/20001947/143
Processing: https://muqawil.org/en/contractors/20014964/143
Processing: https://muqawil.org/en/contractors/20008518/143
Processing: https://muqawil.org/en/contractors/20045121/143
Processing: https://muqawil.org/en/contractors/20001440/143
Processing: https://muqawil.org/en/contractors/20019229/143
Processing: https://muqawil.org/en/contractors/20055348/143
Processing: https://muqawil.org/en/contractors/20010655/143
Processing: https://muqawil.org/en/contractors/20015654/143
Processing: https://muqawil.org/en/contractors/20026732/143
Processing: https://muqawil.org/en/contractors/20023122/143
Processing: https://muqawil.org/en/contractors/20001577/143
Processing: https://muqawil.org/en/contractors/20053278/143
Processing: https://muqawil.org/en/contractors/20044037/143
Processing: https://muqawil.org/en/contractors/20057392/143
Processing: https://muqawil.org/en/contractors/20060304/143
Processing: https://muqawil.org/en/contr

In [10]:
email = pd.read_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/emails_data.csv')
email

Unnamed: 0,Company Name,Organization Email
0,Al-Wessam Contracting Company,Alwessam7@Gmail.Com
1,Inmayoun Contracting Company,Inmayoun@Gmail.Com
2,Awared General Contracting Company,Fared@Fared-Est.Com
3,acn solutions for contracting,Alwa7ed@Hotmail.Com
4,Ratel Al Sharq Contracting Company,Adel_77@Hotmail.Com
5,Dome Park Contracting Company,Vv.Com838@Icloud.Com
6,Al Shallal Al Arabi General Contracting Est,Info@Alshallal-Alarabi.Com
7,ON Arabia Contracting Company,A1032500371@Gmail.Com
8,Arabian Towers Projects Contracting Company,Atpcosecretary@Atpco-Sa.Com
9,Rawasi Sama Contracting Company,Ce3@Hotmail.Com


#### Updating Organization Emails Based on Extracted Data

In [11]:
# Merge the two DataFrames based on 'Company Name'
merged_data = dfـcontractor_data.merge(emails_df[['Company Name', 'Organization Email']],
                                     on='Company Name',
                                     how='left',
                                     suffixes=('', '_new'))

# Replace the old 'Organization Email' with the new one
merged_data['Organization Email'] = merged_data['Organization Email_new']

# Drop the temporary column used for the merge
merged_data.drop(columns=['Organization Email_new'], inplace=True)

# Save the updated contractors data to a new CSV file
merged_data.to_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/Updated_contractors_data.csv', index=False)

print("Updated data saved to Updated_contractors_data.csv")


Updated data saved to Updated_contractors_data.csv


In [12]:
Updated_contractors_data = pd.read_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/Updated_contractors_data.csv')
Updated_contractors_data

Unnamed: 0,Company Name,Membership Number,Membership Type,Company Size,Training Credit Hours,City,Region,Organization Mobile Number,Organization Email,Address,Interests
0,Al-Wessam Contracting Company,106110611,Saudi Contractor,Small Company Size,93 h,BILJURASHI,Bahah,505643337.0,Alwessam7@Gmail.Com,Al Wessam Contracting Company,"Mining support services, Oil and natural gas e..."
1,Inmayoun Contracting Company,215421544,Saudi Contractor,Medium Company Size,3 h,RIYADH,Riyadh,555090521.0,Inmayoun@Gmail.Com,abdelaziz,"Construction of buildings, Construction of bui..."
2,Awared General Contracting Company,160916095,Saudi Contractor,Small Company Size,90 h,RIYADH,Riyadh,112090111.0,Fared@Fared-Est.Com,Riyadh - alezdehar District -,"Construction of buildings, Construction of bui..."
3,acn solutions for contracting,446844684,Saudi Contractor,Very Small Company Size,0 h,JEDDAH,Makkah,126688055.0,Alwa7ed@Hotmail.Com,jeddah al shiraa dis.,"Waste collection, treatment & disposal activit..."
4,Ratel Al Sharq Contracting Company,101010104,Saudi Contractor,Small Company Size,0 h,RIYADH,Riyadh,542332224.0,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,"Construction of buildings, Construction of bui..."
5,Dome Park Contracting Company,251725170,Saudi Contractor,Small Company Size,0 h,AL MUWAYH AL JADID,Makkah,553518088.0,Vv.Com838@Icloud.Com,um aldom,"Construction of buildings, Construction of bui..."
6,Al Shallal Al Arabi General Contracting Est,528152815,Saudi Contractor,Very Small Company Size,0 h,TABUK,Tabuk,546028644.0,Info@Alshallal-Alarabi.Com,Saudi Arabia / Tabuk / North Faisaliyah / Ghal...,"Mining support services, Oil and natural gas e..."
7,ON Arabia Contracting Company,178417841,Saudi Contractor,Very Small Company Size,0 h,RIYADH,Riyadh,,A1032500371@Gmail.Com,,No Data
8,Arabian Towers Projects Contracting Company,220622064,Saudi Contractor,Medium Company Size,0 h,DAMMAM,Eastern Province,599999999.0,Atpcosecretary@Atpco-Sa.Com,Arabian Towers Projects Contracting Company-76...,"Construction of buildings, Construction of bui..."
9,Rawasi Sama Contracting Company,305530557,Saudi Contractor,Very Small Company Size,9 h,HAFAR AL BATIN,Eastern Province,500012488.0,Ce3@Hotmail.Com,"Al-Waleed bin Abdul-Malik, 3090\r\nAl-Masif Di...","Construction of buildings, Construction of bui..."


### Extracting Required Data Only


In [13]:
# Define the required columns based on the requested fields
required_columns = [
    "Company Name",
    "Membership Number",
    "Company Size",
    "Organization Email",
    "Address",
    "City",
    "Region",
    "Interests"
]

# Filter the DataFrame to include only the required columns
filtered_df = Updated_contractors_data[required_columns]

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/filtered_contractors_data.csv', index=False)

print("Filtered data saved to filtered_contractors_data.csv")


Filtered data saved to filtered_contractors_data.csv


In [14]:
df = pd.read_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/filtered_contractors_data.csv')

In [15]:
df

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
1,Inmayoun Contracting Company,215421544,Medium Company Size,Inmayoun@Gmail.Com,abdelaziz,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Awared General Contracting Company,160916095,Small Company Size,Fared@Fared-Est.Com,Riyadh - alezdehar District -,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,acn solutions for contracting,446844684,Very Small Company Size,Alwa7ed@Hotmail.Com,jeddah al shiraa dis.,JEDDAH,Makkah,"Waste collection, treatment & disposal activit..."
4,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
5,Dome Park Contracting Company,251725170,Small Company Size,Vv.Com838@Icloud.Com,um aldom,AL MUWAYH AL JADID,Makkah,"Construction of buildings, Construction of bui..."
6,Al Shallal Al Arabi General Contracting Est,528152815,Very Small Company Size,Info@Alshallal-Alarabi.Com,Saudi Arabia / Tabuk / North Faisaliyah / Ghal...,TABUK,Tabuk,"Mining support services, Oil and natural gas e..."
7,ON Arabia Contracting Company,178417841,Very Small Company Size,A1032500371@Gmail.Com,,RIYADH,Riyadh,No Data
8,Arabian Towers Projects Contracting Company,220622064,Medium Company Size,Atpcosecretary@Atpco-Sa.Com,Arabian Towers Projects Contracting Company-76...,DAMMAM,Eastern Province,"Construction of buildings, Construction of bui..."
9,Rawasi Sama Contracting Company,305530557,Very Small Company Size,Ce3@Hotmail.Com,"Al-Waleed bin Abdul-Malik, 3090\r\nAl-Masif Di...",HAFAR AL BATIN,Eastern Province,"Construction of buildings, Construction of bui..."


### Dealing with missing values

In [16]:
# Replace NaN values with 'Not Available'
df.fillna('Not Available', inplace=True)


In [17]:
df = df.replace('.', 'Not Available')
df = df.replace('No Data', 'Not Available')


In [18]:
df

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
1,Inmayoun Contracting Company,215421544,Medium Company Size,Inmayoun@Gmail.Com,abdelaziz,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Awared General Contracting Company,160916095,Small Company Size,Fared@Fared-Est.Com,Riyadh - alezdehar District -,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,acn solutions for contracting,446844684,Very Small Company Size,Alwa7ed@Hotmail.Com,jeddah al shiraa dis.,JEDDAH,Makkah,"Waste collection, treatment & disposal activit..."
4,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
5,Dome Park Contracting Company,251725170,Small Company Size,Vv.Com838@Icloud.Com,um aldom,AL MUWAYH AL JADID,Makkah,"Construction of buildings, Construction of bui..."
6,Al Shallal Al Arabi General Contracting Est,528152815,Very Small Company Size,Info@Alshallal-Alarabi.Com,Saudi Arabia / Tabuk / North Faisaliyah / Ghal...,TABUK,Tabuk,"Mining support services, Oil and natural gas e..."
7,ON Arabia Contracting Company,178417841,Very Small Company Size,A1032500371@Gmail.Com,Not Available,RIYADH,Riyadh,Not Available
8,Arabian Towers Projects Contracting Company,220622064,Medium Company Size,Atpcosecretary@Atpco-Sa.Com,Arabian Towers Projects Contracting Company-76...,DAMMAM,Eastern Province,"Construction of buildings, Construction of bui..."
9,Rawasi Sama Contracting Company,305530557,Very Small Company Size,Ce3@Hotmail.Com,"Al-Waleed bin Abdul-Malik, 3090\r\nAl-Masif Di...",HAFAR AL BATIN,Eastern Province,"Construction of buildings, Construction of bui..."


In [19]:
df.to_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/filtered_contractors_data.csv', index=False)


In [20]:
df2= pd.read_csv('/Users/afnanalamri/Desktop/MyProject/LeapAI/WebCrawlingTask/raw_data/filtered_contractors_data.csv')

In [21]:
df2

Unnamed: 0,Company Name,Membership Number,Company Size,Organization Email,Address,City,Region,Interests
0,Al-Wessam Contracting Company,106110611,Small Company Size,Alwessam7@Gmail.Com,Al Wessam Contracting Company,BILJURASHI,Bahah,"Mining support services, Oil and natural gas e..."
1,Inmayoun Contracting Company,215421544,Medium Company Size,Inmayoun@Gmail.Com,abdelaziz,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
2,Awared General Contracting Company,160916095,Small Company Size,Fared@Fared-Est.Com,Riyadh - alezdehar District -,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
3,acn solutions for contracting,446844684,Very Small Company Size,Alwa7ed@Hotmail.Com,jeddah al shiraa dis.,JEDDAH,Makkah,"Waste collection, treatment & disposal activit..."
4,Ratel Al Sharq Contracting Company,101010104,Small Company Size,Adel_77@Hotmail.Com,Riyadh - Al Narjis District - Anas Bin Malik Road,RIYADH,Riyadh,"Construction of buildings, Construction of bui..."
5,Dome Park Contracting Company,251725170,Small Company Size,Vv.Com838@Icloud.Com,um aldom,AL MUWAYH AL JADID,Makkah,"Construction of buildings, Construction of bui..."
6,Al Shallal Al Arabi General Contracting Est,528152815,Very Small Company Size,Info@Alshallal-Alarabi.Com,Saudi Arabia / Tabuk / North Faisaliyah / Ghal...,TABUK,Tabuk,"Mining support services, Oil and natural gas e..."
7,ON Arabia Contracting Company,178417841,Very Small Company Size,A1032500371@Gmail.Com,Not Available,RIYADH,Riyadh,Not Available
8,Arabian Towers Projects Contracting Company,220622064,Medium Company Size,Atpcosecretary@Atpco-Sa.Com,Arabian Towers Projects Contracting Company-76...,DAMMAM,Eastern Province,"Construction of buildings, Construction of bui..."
9,Rawasi Sama Contracting Company,305530557,Very Small Company Size,Ce3@Hotmail.Com,"Al-Waleed bin Abdul-Malik, 3090\r\nAl-Masif Di...",HAFAR AL BATIN,Eastern Province,"Construction of buildings, Construction of bui..."
