In [180]:
!pip install requests bs4



In [181]:
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [182]:
base_url = 'https://www.nu.edu.pk/'
visited_links = set()
output_data = []

In [183]:
def extract_data(url):
    try:
        response = requests.get(url, timeout=5)  # Set a timeout value of 5 seconds
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.find('title').get_text()
            text = ' '.join([p.get_text() for p in soup.find_all('p')])

            return title, text
        else:
            print(f"Error accessing {url}: Status Code {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")

    return None, None

In [184]:
def clean_url(url):
    # Remove any query parameters and fragments from the URL
    parsed_url = urlparse(url)
    cleaned_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    return cleaned_url

In [185]:
# def web_crawler(url, depth=2):
#     if depth <= 0 or url in visited_links:
#         return

#     visited_links.add(url)
#     title, text, campus_locations, courses = extract_data(url)
#     if title and text:
#         output_data.append([title, text, campus_locations, courses])

#     try:
#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             links = soup.find_all('a', href=True)

#             for link in links:
#                 next_url = link['href']
#                 if next_url.startswith('/'):
#                     next_url = base_url + next_url
#                 web_crawler(next_url, depth - 1)
#     except Exception as e:
#         print(f"Error accessing {url}: {e}")

In [186]:
# def web_crawler(url, depth=2):
#     if depth <= 0 or url in visited_links:
#         return

#     visited_links.add(url)
#     title, text, campus_locations, courses = extract_data(url)
#     if title and text and url not in visited_links:
#         output_data.append([title, text, campus_locations, courses])

#     try:
#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             links = soup.find_all('a', href=True)

#             for link in links:
#                 next_url = link['href']
#                 if next_url.startswith('/'):
#                     next_url = base_url + next_url
#                 web_crawler(next_url, depth - 1)
#     except Exception as e:
#         print(f"Error accessing {url}: {e}")

In [187]:
def web_crawler(url, total_limit=float('inf'), per_page_limit=10, depth=2):
    if depth <= 0 or url in visited_links or len(visited_links) >= total_limit:
        return

    print("Scraping:", url)  # Display the URL being accessed

    cleaned_url = clean_url(url)
    visited_links.add(cleaned_url)
    title, text = extract_data(cleaned_url)
    if title and text and text not in [data[1] for data in output_data]:
        output_data.append([title, text])

    try:
        response = requests.get(url)
        if response.status_code == 200 and len(visited_links) < total_limit:
            soup = BeautifulSoup(response.content, 'html.parser')
            links = soup.find_all('a', href=True)

            for link in links:
                next_url = urljoin(url, link['href'])
                if next_url.startswith('/'):
                    next_url = urljoin(base_url, link['href'])
                if next_url not in visited_links and len(visited_links) < total_limit:
                    web_crawler(next_url, total_limit, per_page_limit, depth - 1)
    except Exception as e:
        print(f"Error accessing {url}: {e}")

In [188]:
starting_url = 'https://www.nu.edu.pk/'
total_limit = float('inf')  # Set the total number of links to be scraped to a large value
per_page_limit = 10  # Set the maximum number of links to be scraped from each page
web_crawler(starting_url, total_limit, per_page_limit)

Scraping: https://www.nu.edu.pk/
Scraping: https://www.nu.edu.pk/Home
Scraping: https://www.nu.edu.pk/Degree-Programs
Scraping: https://www.nu.edu.pk/Admissions/Schedule
Scraping: https://www.nu.edu.pk/Admissions/HowToApply
Scraping: https://www.nu.edu.pk/Admissions/EligibilityCriteria
Scraping: https://www.nu.edu.pk/Admissions/Scholarship
Scraping: https://www.nu.edu.pk/Admissions/TestPattern
Scraping: https://www.nu.edu.pk/Admissions/FeeStructure
Scraping: https://www.nu.edu.pk/Admissions/Prospectus
Scraping: http://cfd.nu.edu.pk
Error accessing http://cfd.nu.edu.pk: Status Code 403
Scraping: http://cfd.nu.edu.pk/all-departments
Error accessing http://cfd.nu.edu.pk/all-departments: Status Code 403
Scraping: https://www.nu.edu.pk/Campus/Chiniot-Faisalabad/PhdSupervisors
Scraping: https://www.nu.edu.pk/Campus/Chiniot-Faisalabad/Events
Scraping: https://www.nu.edu.pk/Campus/Chiniot-Faisalabad/MedalHolders
Scraping: https://www.nu.edu.pk/Campus/Chiniot-Faisalabad/RectorLists
Scraping: ht

In [189]:
# Display the dataset
for item in output_data:
    title, text = item
    print("Title:", title)
    print("Text:", text)
    print("-" * 50)

Title: FAST National University
Text:  
                    Founded as a Federally Chartered University in July 2000, the National University of Computer and Emerging Sciences is a premiere University of Pakistan, renowned for quality and impact of its students in the development of local software and other industries. The university has five modern campuses at Karachi, Lahore, Islamabad, Peshawar and Chiniot-Faisalabad. These campuses provide world class educational environment and recreational facilities to about over 11,000 students, around one quarter are female and over 500 skilled faculty members.
                 Research wings of the university are well recognized, nationally and internationally. They are embarked upon cutting edge research having direct impact on the social, economic and technological needs of Pakistan. Our vision is to become a globally recognized research university of Pakistan within the next decade. FAST-House Rohtas Road, G-9/4 Islamabad - 44000
-------

In [190]:
# Export data to CSV
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Title', 'Text'])
    csv_writer.writerows(output_data)