In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import time
from tqdm import tqdm


In [11]:
# Define the base URL for the specific pages to crawl
similar_url = 'https://um.edu.my/news/indexnews.php?year='
years = range(2013, 2025)  # From 2013 to 2024
base_url='https://um.edu.my/news/'
# Generate the list of URLs to start crawling from
urls = [f"{similar_url}{year}" for year in years]
visited_url = set()
failed_url = set()

In [12]:
urls

['https://um.edu.my/news/indexnews.php?year=2013',
 'https://um.edu.my/news/indexnews.php?year=2014',
 'https://um.edu.my/news/indexnews.php?year=2015',
 'https://um.edu.my/news/indexnews.php?year=2016',
 'https://um.edu.my/news/indexnews.php?year=2017',
 'https://um.edu.my/news/indexnews.php?year=2018',
 'https://um.edu.my/news/indexnews.php?year=2019',
 'https://um.edu.my/news/indexnews.php?year=2020',
 'https://um.edu.my/news/indexnews.php?year=2021',
 'https://um.edu.my/news/indexnews.php?year=2022',
 'https://um.edu.my/news/indexnews.php?year=2023',
 'https://um.edu.my/news/indexnews.php?year=2024']

In [13]:
def has_no_extension(url):
    path = urlparse(url).path
    return os.path.splitext(path)[1] == ""

In [14]:
current = time.time()
with tqdm(total=len(urls), desc="Crawling") as pbar:
    while urls:
        current = urls.pop()
        try:
            response = requests.get(current)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            visited_url.add(current)
            for link in soup.find_all('a', href=True):
                full_url = urljoin(base_url, link['href'])
                if (full_url not in visited_url and
                        full_url not in urls and
                        full_url.startswith(base_url) and
                        full_url not in failed_url and
                        has_no_extension(full_url)):
                    urls.append(full_url)
                    pbar.total += 1
                    pbar.update(0)  # Reset the progress bar length

        except requests.RequestException as e:
            failed_url.add(current)
            print(f"Error fetching or processing the page: {e}")

        pbar.update(1)

print(f"Time taken: {time.time() - current:.2f} seconds")

Crawling:   7%|████▊                                                                    | 3/46 [00:03<00:05,  8.18it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-services


Crawling:   9%|██████▎                                                                  | 4/46 [00:03<00:14,  2.81it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/office-directory-academic


Crawling:  11%|███████▉                                                                 | 5/46 [00:04<00:20,  1.96it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-fact-sheet


Crawling:  15%|███████████                                                              | 7/46 [00:05<00:18,  2.11it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/history
Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/vision-amp-mission


Crawling:  17%|████████████▋                                                            | 8/46 [00:06<00:21,  1.74it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/alumni


Crawling:  20%|██████████████▎                                                          | 9/46 [00:07<00:24,  1.53it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/staff


Crawling:  22%|███████████████▋                                                        | 10/46 [00:08<00:25,  1.40it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/student


Crawling:  74%|█████████████████████████████████████████████████████▏                  | 34/46 [00:30<00:09,  1.27it/s]

Error fetching or processing the page: 403 Client Error: Forbidden for url: https://um.edu.my/news/


Crawling:  76%|██████████████████████████████████████████████████████▊                 | 35/46 [00:30<00:08,  1.25it/s]

Error fetching or processing the page: 403 Client Error: Forbidden for url: https://um.edu.my/news/#kingster-mobile-menu


Crawling:  90%|██████████████████████████████████████████████████████████████▊       | 139/155 [02:10<00:13,  1.22it/s]

Error fetching or processing the page: 403 Client Error: Forbidden for url: https://um.edu.my/news/#%5B1%5D


Crawling:  83%|██████████████████████████████████████████████████████████▍           | 172/206 [02:50<00:47,  1.39s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/uma-euro-trade-s-social-wellbeing-research-centre-launches-book-on-redefining-the-meaning-of-retirement


Crawling:  90%|██████████████████████████████████████████████████████████████▊       | 185/206 [03:02<00:20,  1.01it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/la-euro-trade-ora-copy-al-unesco-for-women-in-science-national-fellowship-to-award-inspiring-women-scientists-of-tomorrow


Crawling:  92%|████████████████████████████████████████████████████████████████▌     | 190/206 [03:08<00:17,  1.10s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-johns-hopkins-berman-institute-of-bioethics-host-public-conference-a-euro-tilde-getting-the-ethics-of-genome-editing-right-engaging-multiple-perspectivea-euro-trade


Crawling:  84%|██████████████████████████████████████████████████████████▌           | 206/246 [03:24<00:30,  1.31it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/uma-euro-trade-s-spin-off-flexilicate-joins-hand-with-dura-mine-to-collaborate-on-optical-fibre-manufacturing


Crawling:  88%|█████████████████████████████████████████████████████████████▍        | 216/246 [03:31<00:22,  1.35it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-mcmh-advocate-mena-euro-trade-s-health


Crawling:  93%|█████████████████████████████████████████████████████████████████▍    | 230/246 [03:44<00:12,  1.29it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-food-microbiologist-receives-la-euro-trade-oreal-unesco-for-women-in-science-award


Crawling:  88%|█████████████████████████████████████████████████████████████▊        | 249/282 [04:04<00:34,  1.05s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/book-distribution-ceremony-under-the-a-euro-tilde-books-for-asiaa-euro-trade-programme-and-inpumaa-euro-trade-s-aidilfitri-gathering


Crawling:  92%|████████████████████████████████████████████████████████████████▌     | 260/282 [04:13<00:15,  1.45it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/yb-dato-sri-dr-haji-wan-junaidi-tuanku-jaafar-delivers-lecture-at-syarahan-zaa-euro-trade-ba-2017


Crawling:  94%|█████████████████████████████████████████████████████████████████▌    | 264/282 [04:14<00:09,  1.85it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/ummc-kicks-off-breast-cancer-awareness-month-through-a-euro-oelig-show-you-care-be-awarea-euro-breast-cancer-campaign


Crawling:  98%|████████████████████████████████████████████████████████████████████▌ | 276/282 [04:25<00:04,  1.40it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/setting-record-for-malaysiaa-euro-trade-s-longest-concrete-canoe-at-malaysian-concrete-canoe-competition-2017


Crawling:  89%|██████████████████████████████████████████████████████████████▏       | 285/321 [04:38<00:37,  1.05s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/a-euro-tilde-sancheza-euro-trade-guns-down-labyrinth-111-in-style


Crawling:  89%|██████████████████████████████████████████████████████████████▎       | 286/321 [04:38<00:34,  1.02it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/watt-da-euro-trade-or-award-exhibition


Crawling:  91%|███████████████████████████████████████████████████████████████▍      | 291/321 [04:44<00:31,  1.04s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/modern-flashback-a-euro-ldquo-scoot-sound-and-style-modxibition


Crawling:  93%|████████████████████████████████████████████████████████████████▉     | 298/321 [04:50<00:16,  1.42it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/textile-tales-of-pua-kumbu-a-euro-ldquo-a-polysensory-intermedia-exhibition


Crawling:  94%|█████████████████████████████████████████████████████████████████▋    | 301/321 [04:53<00:18,  1.10it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/uncovering-the-secrets-of-cultivating-naturea-euro-trade-s-a-euro-tilde-red-golda-euro


Crawling:  96%|███████████████████████████████████████████████████████████████████▍  | 309/321 [05:01<00:11,  1.02it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/rousing-welcome-for-university-of-malayaa-euro-trade-s-springboard-hero


Crawling:  99%|█████████████████████████████████████████████████████████████████████▏| 317/321 [05:09<00:04,  1.05s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/korea-corner-a-euro-ldquo-an-attractive-cozy-and-resourceful-place-about-korea


Crawling:  90%|███████████████████████████████████████████████████████████████▏      | 326/361 [05:19<00:36,  1.05s/it]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/tham-kuen-wei-ngoi-eva-sapu-bersih-pertandingan-a-euro-oelig-international-feasibility-studya-euro


Crawling:  98%|████████████████████████████████████████████████████████████████████▋ | 354/361 [05:41<00:05,  1.23it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/a-euro-oelig-travel-fitness-asia-with-dee-deea-euro-gabung-konsep-melancong-aktiviti-kecergasan-dan-terokai-makanan


Crawling:  99%|█████████████████████████████████████████████████████████████████████▍| 358/361 [05:46<00:02,  1.00it/s]

Error fetching or processing the page: 404 Client Error: Not Found for url: https://um.edu.my/news/um-5gb-wifi-quota-an-a-euro-oelig-upgradea-euro-from-current-3gb


Crawling: 100%|██████████████████████████████████████████████████████████████████████| 362/362 [05:49<00:00,  1.04it/s]


TypeError: unsupported operand type(s) for -: 'float' and 'str'

In [16]:
len(visited_url)

331

In [3]:
response = requests.get('https://um.edu.my/news/indexnews.php?year=2013')

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
base_url = 'https://um.edu.my/news/indexnews.php?year='
for link in soup.find_all('a', href=True):
    full_url = urljoin(base_url, link['href'])
    for link in soup.find_all('a', href=True):
        full_url = urljoin(base_url, link['href'])

https://um.edu.my/index
https://um.edu.my/news/#kingster-mobile-menu
https://um.edu.my/news/
https://um.edu.my/vision-amp-mission
https://ppsg.um.edu.my/strategic-planning
https://um.edu.my/educational-goals
https://um.edu.my/client-charter
https://um.edu.my/our-history
https://um.edu.my/news/
https://um.edu.my/chancellor-amp-pro-chancellors
https://um.edu.my/former-vice-chancellors
https://um.edu.my/board-of-directors
https://um.edu.my/top-management
https://um.edu.my/deans-amp-directors-academics
https://um.edu.my/chancellery
https://um.edu.my/registrar-rsquo-s-office
https://um.edu.my/deputy-vice-chancellor-rsquo-s-office
https://um.edu.my/bursary-main-office
https://um.edu.my/library
https://um.edu.my/docs/CARTA ORGANISASI UM_BI.pdf
https://um.edu.my/news/
https://um.edu.my/symphony-orchestra
https://um.edu.my/botanical-gardens
https://um.edu.my/experimental-farm
https://um.edu.my/sustainability-um
https://um.edu.my/community-engagement
https://um.edu.my/treasure-um
https://um.edu.

In [17]:
with open('links-umnnews.txt', 'w') as file:
    for item in visited_url:
        file.write(f"{item}\n")


In [21]:
with open('links-umnews.txt', 'r') as file:
    urls = [url.strip() for url in file.readlines()]

In [23]:
def filter_indexnews_links(urls):
    return [url for url in urls if 'indexnews.php' not in url]

In [24]:
newUrls=filter_indexnews_links(urls)

In [26]:
len(newUrls)

319

In [27]:
with open('links-umnews.txt', 'w') as file:
    for item in newUrls:
        file.write(f"{item}\n")