<a href="https://colab.research.google.com/github/arnabksarkar/LLMFinetuning/blob/dev/Webcrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import re
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Colab-specific: mount Google Drive at /content/drive so later cells can use
# paths underneath it (e.g. /content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Extract Webpage

In [None]:

def _safe_filename(title):
    """Replace path separators and NUL bytes so `title` is usable as a single file name."""
    return re.sub(r'[\\/\x00]', '_', title)


def extract_text_from_webpage(url, output_dir='all_bengali_culture_texts', timeout=30):
    """Download a webpage and save the text of its <p> tags to a UTF-8 text file.

    The file is written to ``<output_dir>/<page title>.txt``. Pages without a
    usable <title> get a ``Webpage_NO_Title<NN>`` name with a random two-digit
    suffix.

    Args:
        url: Address of the page to download.
        output_dir: Directory the text file is written to; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request and file-system errors are printed, not raised, so a
        loop over many URLs keeps going after a failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title is None when the page has no <title>; .string is None when
        # the tag is empty or has nested children. Both cases use the fallback.
        raw_title = soup.title.string if soup.title else None
        title = raw_title.strip() if raw_title else ''
        if not title:
            # f-string formatting: concatenating str + int here raised TypeError.
            title = f"Webpage_NO_Title{random.randint(10, 99)}"

        print(f"Title of the webpage: {title}")

        # Extract all text from <p> tags
        text_content = " ".join(p.get_text() for p in soup.find_all('p'))

        # Save the extracted text to a file. Titles may contain '/', which
        # would otherwise be interpreted as a sub-directory.
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, _safe_filename(title) + '.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text_content)

        print(f"Text extracted and saved for {title}")

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
    except OSError as e:
        # e.g. file name too long or no write permission: report and move on.
        print(f"Error saving text from {url}: {e}")

### Extract Links from a Webpage

In [None]:
def extract_links_from_webpage(url, timeout=30):
    """Return the href of every link inside the page's ``<div id="bodyContent">``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot hang the whole crawl.

    Returns:
        A list of href strings exactly as they appear in the HTML (they may be
        relative, fragment-only, or absolute). The list is empty when the
        request fails or the page has no ``bodyContent`` div.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only links inside the bodyContent container are collected; this looks
    # like the main article area on Wikipedia pages - verify for other sites.
    body = soup.find("div", {"id": "bodyContent"})
    if body is None:
        return []

    return [a['href'] for a in body.find_all('a', href=True)]


### Set how many levels deep the crawler should go

In [None]:
# Crawl configuration.
# WARNING: every extra level multiplies the number of pages fetched by the
# number of links per page. On Wikipedia a depth of 5 is a very large crawl;
# start with 0 or 1 and increase deliberately.
maxDepth = 5  # Maximum depth to traverse child pages (the start page is depth 0)

baseURL = 'https://en.wikipedia.org/wiki/Culture_of_Bengal'

allLinks = []          # Every accepted link in discovery order (may contain duplicates)
visitedLinks = set()   # Pages whose links have already been extracted

# Links ending in one of these (case-insensitive) are treated as images and skipped.
_SKIPPED_EXTENSIONS = ('.jpg', '.png', '.gif', '.svg')


def crawl_and_extract_links(url, depth):
    """Recursively collect links reachable from `url` into the global `allLinks`.

    A page is fetched at most once (tracked in `visitedLinks`) and only while
    `depth` does not exceed `maxDepth`. Image links, fragment-only links and
    category/help-category links are skipped. Every other link is resolved
    against the page it was found on, stripped of its #fragment, recorded in
    `allLinks`, and then crawled one level deeper.

    Args:
        url: Absolute URL of the page to process.
        depth: Depth of `url` relative to the start page (start page is 0).

    Returns:
        None. Results accumulate in `allLinks` and `visitedLinks`. Errors while
        processing a page are printed and do not stop the crawl.
    """
    if depth > maxDepth or url in visitedLinks:
        return
    visitedLinks.add(url)

    try:
        for link in extract_links_from_webpage(url):
            if link.lower().endswith(_SKIPPED_EXTENSIONS):
                continue
            if link.startswith('#'):
                continue
            if '/wiki/Category:' in link:
                continue
            if '/Help:Category' in link:
                continue

            # Resolve relative and protocol-relative ('//host/path') hrefs against
            # the current page, and drop the fragment so 'Page' and 'Page#Section'
            # count as the same page.
            link = urldefrag(urljoin(url, link)).url
            if not link.startswith(('http://', 'https://')):
                continue  # mailto:, javascript:, etc. cannot be downloaded

            allLinks.append(link)
            # Do NOT mark `link` as visited here: the recursive call does that
            # itself. Marking it first made the call return immediately, so the
            # crawl never went past the start page.
            crawl_and_extract_links(link, depth + 1)
    except Exception as e:
        print(f"Error processing {url}: {e}")



In [None]:
# Crawl from the start page, then build the final list of pages to download.
crawl_and_extract_links(baseURL, 0)

# Add the start page through a set union so it appears exactly once, even if the
# crawl already found it or this cell is run again. Sorting gives a
# reproducible download order (plain set order changes between sessions).
allLinks = sorted(set(allLinks) | {baseURL})
print("Extracted Links size:" + str(len(allLinks)))

# for link in allLinks:
#    print(link)

### Let's Extract Data

In [None]:
# Download every collected page and save its paragraph text to a file.
# Request errors are caught and printed inside extract_text_from_webpage.
for link in allLinks:
    extract_text_from_webpage(link)

### Let's go through the contents of the folder and delete entries whose name contains "Bangladesh"

In [4]:
import os
import shutil

folder_path = '/content/drive/MyDrive/data_bengali_wiki'

# Delete every entry in the folder whose name mentions "Bangladesh";
# all other entries are reported and left untouched.
if os.path.exists(folder_path):
    for filename in os.listdir(folder_path):
        if "Bangladesh" not in filename:
            print(f"File '{filename}' does not contain 'Bangladesh', not deleting")
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            if not os.path.isfile(file_path):
                # Not a regular file, so remove it as a directory tree.
                shutil.rmtree(file_path)
                print(f"Deleted directory: {filename}")
            else:
                os.remove(file_path)
                print(f"Deleted file: {filename}")
        except Exception as e:
            # Report the failure and carry on with the remaining entries.
            print(f"Error deleting {filename}: {e}")
else:
    print(f"Error: Folder '{folder_path}' not found.")

File 'Amar Sonar Bangla - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Allauddin Khan - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Alstonia scholaris - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Alipurduar district - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Advocacy group - Wikipedia.txt' does not contain 'Bangladesh', not deleting
Deleted file: Agriculture in Bangladesh - Wikipedia.txt
File 'Alipore - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Ali Akbar Khan - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'All-rounder - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Administrative divisions of West Bengal - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File 'Adivasi - Wikipedia.txt' does not contain 'Bangladesh', not deleting
File '1954 East Bengal Legislative Assembly election - Wikipedia.txt' does not contain 'Bangladesh', not