| **Author**          | **Roll No**   | **Version** |
|---------------------|----------------|--------------|
| Abhyudaya Nair      | 24210005      | 1.0          |

### Link Crawling

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from time import sleep

# Configure a headless Chrome session for the crawl
chrome_options = webdriver.ChromeOptions()

# Command-line switches handed to Chrome, applied in this order
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage", "--headless"):
    chrome_options.add_argument(chrome_flag)

# Path of the Chrome executable; change it if Chrome is installed elsewhere
chrome_options.binary_location = "/usr/bin/google-chrome"

driver = webdriver.Chrome(options=chrome_options)


base_url = "https://www.bbc.com"
driver.get(base_url)

# 1. Collect the main section URLs linked from the homepage
print("Identifying main topics (sections) from the homepage...")

# Anchors whose href begins with one of the known section path prefixes
main_sections = driver.find_elements(
    By.CSS_SELECTOR,
    'a[href^="/news"], a[href^="/sport"], a[href^="/business"], a[href^="/travel"], a[href^="/culture"]',
)
homepage_hrefs = [anchor.get_attribute("href") for anchor in main_sections]

# dict.fromkeys drops repeats while keeping each URL at its first-seen position
sections_links = list(
    dict.fromkeys(href for href in homepage_hrefs if href.startswith(base_url))
)

print(f"Found {len(sections_links)} main sections to explore.")

# 2. Visit each section to identify subsections
article_links = []
# Set mirror of article_links: gives O(1) duplicate checks while the list
# keeps the first-seen order used for the CSV export.
seen_article_links = set()
# Subsection URLs that have already been crawled. Section pages are scanned
# with the same link selector, so they can list the same subsection more than
# once across sections; each one only needs to be crawled a single time.
crawled_subsections = set()

subsection_selector = 'a[href^="/news"], a[href^="/sport"], a[href^="/business"], a[href^="/travel"], a[href^="/culture"]'
# Adjust if other article patterns exist
article_selector = 'a[href*="/news/"], a[href*="/sport/"], a[href*="/business/"], a[href*="/travel/"], a[href*="/culture/"]'

try:
    for section_url in sections_links:
        print(f"\nVisiting section: {section_url}")
        driver.get(section_url)
        sleep(3)  # Give the page time to render before querying it

        # 3. Extract subsections from the section page
        subsections_links = []
        subsections = driver.find_elements(By.CSS_SELECTOR, subsection_selector)

        for subsection in subsections:
            subsection_link = subsection.get_attribute("href")
            if subsection_link.startswith(base_url) and subsection_link not in subsections_links:
                subsections_links.append(subsection_link)

        print(f"Found {len(subsections_links)} subsections under {section_url}.")

        # 4. Visit each subsection
        for subsection_url in subsections_links:
            if subsection_url in crawled_subsections:
                print(f"Skipping subsection already crawled: {subsection_url}")
                continue
            crawled_subsections.add(subsection_url)

            print(f"\nVisiting subsection: {subsection_url}")
            driver.get(subsection_url)
            sleep(3)

            # 5. Pagination: Go through each page in the subsection
            while True:
                print(f"Collecting article links from page: {driver.current_url}")
                # Extract article links from the page
                articles = driver.find_elements(By.CSS_SELECTOR, article_selector)
                for article in articles:
                    article_link = article.get_attribute("href")
                    if article_link not in seen_article_links:
                        seen_article_links.add(article_link)
                        article_links.append(article_link)
                        print(f"Extracted article link: {article_link}")

                # Check if there is a "next" button to go to the next page
                try:
                    next_button = driver.find_elements(By.CSS_SELECTOR, '[data-testid="pagination-next-button"]')
                    if next_button and next_button[0].is_enabled():
                        next_button[0].click()
                        sleep(3)  # Wait for the next page to load
                    else:
                        print(f"No more pages or 'Next' button disabled in subsection: {subsection_url}")
                        break
                except Exception as e:
                    # Best effort: a pagination failure ends this subsection only,
                    # not the whole crawl.
                    print(f"Error during pagination: {e}")
                    break

        print(f"Finished collecting articles for section: {section_url}")
finally:
    # Close the driver even when the crawl fails part-way, so the headless
    # Chrome process is not left running.
    driver.quit()

# 6. Write the collected links to a single-column CSV file
df = pd.DataFrame({"Article Links": article_links})
df.to_csv("bbc_article_links.csv", index=False)

print(f"\nTotal unique article links extracted: {len(article_links)}")


### Text Extraction

In [None]:
import csv
import requests
import os
from bs4 import BeautifulSoup

# File paths
csv_file_path = 'Abhyudaya/Scripts and Data/bbc_article_links.csv'  # input: one link per row
visited_links_file = 'Abhyudaya/Scripts and Data/BBC_visited_links.txt'  # log of links already saved
txt_files_directory = 'Abhyudaya/Text Files/bbc'  # output: one .txt file per article

# Ensure the output directory exists. exist_ok=True avoids the gap between a
# separate existence check and the creation call.
os.makedirs(txt_files_directory, exist_ok=True)

# Read the links from the CSV file
def get_links(csv_file_path):
    """Return the first cell of every non-empty row in the CSV file.

    No row is treated specially, so a header row, if the file has one, is
    returned as the first element of the list.
    """
    first_cells = []
    with open(csv_file_path, newline='', encoding='utf-8') as csv_handle:
        for record in csv.reader(csv_handle):
            if record:  # csv.reader yields an empty list for a blank line
                first_cells.append(record[0])
    return first_cells

# Read the visited links from visited_links.txt
def get_visited_links(visited_links_file):
    """Return the set of stripped lines in the visited-links file.

    A missing file yields an empty set.
    """
    if not os.path.exists(visited_links_file):
        return set()
    with open(visited_links_file, 'r', encoding='utf-8') as handle:
        return {entry.strip() for entry in handle}

# Write the visited link to visited_links.txt
def add_to_visited(link):
    """Append *link*, followed by a newline, to the file named by the
    module-level ``visited_links_file``."""
    with open(visited_links_file, mode='a', encoding='utf-8') as visited_log:
        entry = link + '\n'
        visited_log.write(entry)

# Fetch content from the link and extract the relevant text using BeautifulSoup
def fetch_and_extract_text(link, timeout=30):
    """Download *link* and return the article text found on the page.

    The text is taken from every ``<p>`` inside each
    ``<div data-component="text-block">`` and joined with newlines.

    Args:
        link: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled server would block the whole run indefinitely.

    Returns:
        The joined paragraph text, or ``None`` when the request fails or the
        page has no text-block div. The string can be empty when the
        text-block divs contain no ``<p>`` tags.
    """
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
        html = response.content
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike
        print(f"Error fetching {link}: {e}")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Find all divs with data-component="text-block"
    text_blocks = soup.find_all('div', {'data-component': 'text-block'})
    if not text_blocks:
        print(f"No valid text-block found in {link}, skipping.")
        return None

    # Extract text from all <p> tags inside the divs, in document order
    paragraphs = []
    for block in text_blocks:
        paragraphs.extend(p.get_text() for p in block.find_all('p'))

    return '\n'.join(paragraphs)  # Combine all paragraphs into a single string

# Save content to a txt file in the specified directory
def save_to_file(content, file_number):
    """Write *content* to ``<file_number>.txt`` inside the module-level
    ``txt_files_directory``, replacing any existing file of that name."""
    target_name = f'{file_number}.txt'
    target_path = os.path.join(txt_files_directory, target_name)
    with open(target_path, mode='w', encoding='utf-8') as out:
        out.write(content)

# The main logic to process the links
links = get_links(csv_file_path)
visited_links = get_visited_links(visited_links_file)

for index, link in enumerate(links):
    # The crawler writes the CSV with a header row ("Article Links"), and
    # get_links returns every row. Skip anything that is not an http(s) URL
    # rather than sending it to requests. The enumerate index is left as-is
    # so file numbers stay aligned with files saved by earlier runs.
    if not link.startswith(("http://", "https://")):
        print(f"Skipping {link!r}, not a URL.")
        continue

    if link in visited_links:
        print(f"Skipping {link}, already visited.")
        continue

    content = fetch_and_extract_text(link)
    if content:
        # File number is the row's position in the CSV, counted from 1
        save_to_file(content, index + 1)
        add_to_visited(link)  # Mark link as visited only after a successful save
        print(f"Saved content from {link} to {txt_files_directory}/{index + 1}.txt")
