| **Author**          | **Roll No**   | **Version** |
|---------------------|----------------|--------------|
| Abhyudaya Nair      | 24210005      | 1.0          |

### Link Crawling

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from urllib.parse import urljoin, urlparse

# Seed page from which the crawl starts
base_url = "https://edition.cnn.com/"

# In-memory crawl state
article_links = set()  # unique links already written to the CSV
visited_links = set()  # pages already processed, so they are not fetched twice
links_to_traverse = list()  # queue of pages still to be crawled (starts empty)

# Output locations, relative to the current working directory
progress_file = 'Abhyudaya/Scripts and Data/cnn_links_to_traverse.txt'  # pending-queue snapshot
csv_file = 'Abhyudaya/Scripts and Data/cnn_article_links.csv'  # collected links, one per row

# Append one article link to the CSV file
def save_article_link(link):
    """Append ``link`` as a single-column row to the CSV at ``csv_file``.

    The parent directory of ``csv_file`` is created when it does not exist,
    so the first write on a fresh checkout does not fail with
    ``FileNotFoundError``.

    Args:
        link: URL string to record.
    """
    parent_dir = os.path.dirname(csv_file)
    if parent_dir:  # an empty dirname means "current directory"; nothing to create
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow([link])

# Fetch the raw HTML of a page (parsing is done by the caller)
def get_html_content(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a server that stops responding would block the
            crawl indefinitely.

    Returns:
        The response body as bytes, or ``None`` when the request fails or
        the server answers with an error status. Failures are printed, not
        raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return None

    # The bytes are returned undecoded, so no response.encoding needs to be
    # set here; it would only influence response.text, which is not used.
    return response.content

# Extract all unique same-site links from a parsed page
def extract_links(soup, base_url, allowed_host="edition.cnn.com"):
    """Collect the absolute http(s) links on a page that point at one host.

    Args:
        soup: Parsed document exposing ``find_all('a', href=True)`` whose
            results can be indexed with ``['href']``.
        base_url: URL of the page, used to resolve relative hrefs.
        allowed_host: Hostname a link must have to be kept. It is compared
            for equality; a substring test would also accept look-alike
            hosts such as ``edition.cnn.com.evil.example``.

    Returns:
        A set of absolute URLs with any ``#fragment`` removed, so that
        anchors into the same page are not treated as distinct pages.
    """
    links = set()
    for anchor in soup.find_all('a', href=True):
        try:
            # Resolve against the page URL, then drop the fragment part.
            full_url = urljoin(base_url, anchor['href']).split('#', 1)[0]
            parsed = urlparse(full_url)
            scheme, host = parsed.scheme, parsed.hostname
        except ValueError:
            # urllib raises ValueError for malformed hrefs (e.g. a broken
            # IPv6 literal); one bad anchor must not abort the whole page.
            continue

        if scheme in ("http", "https") and host == allowed_host:
            links.add(full_url)
    return links

# Process one page: record its links and queue the ones not yet crawled
def process_link(url):
    """Fetch ``url``, save the links found on it, and extend the crawl queue.

    Links starting with ``https://edition.cnn.com`` that are not yet in
    ``article_links`` are added to it and appended to the CSV. Links that
    are neither in ``visited_links`` nor already waiting in
    ``links_to_traverse`` are appended to the queue.

    Args:
        url: Page to download and scan.

    Returns:
        None. Returns early, changing nothing, when the page cannot be
        fetched or parsed.
    """
    raw_html = get_html_content(url)
    if raw_html is None:
        return

    try:
        soup = BeautifulSoup(raw_html, 'html.parser')  # Attempt to parse HTML
    except Exception as e:
        # Deliberately broad: a page that cannot be parsed is skipped so the
        # crawl can continue with the next link.
        print(f"Parsing error for {url}: {e}")
        return

    # Extract all unique links from the current page
    links = extract_links(soup, url)

    # Save links that start with "https://edition.cnn.com" and are new
    for link in links:
        if link.startswith("https://edition.cnn.com") and link not in article_links:
            article_links.add(link)
            save_article_link(link)
            print(f"Saved article link: {link}")

    # Queue only links that are neither visited nor already waiting.
    # Without the second check, a link found on many pages would be appended
    # once per page, inflating the queue and the progress file.
    already_queued = set(links_to_traverse)
    for link in links:
        if link not in visited_links and link not in already_queued:
            links_to_traverse.append(link)

# Load previously saved links from the progress file
def load_progress():
    """Return the pending links stored in ``progress_file``.

    Blank lines are ignored and repeated links are collapsed to their first
    occurrence, so an empty string is never queued as a URL and a queue
    saved with duplicates does not stay bloated.

    Returns:
        A list of URL strings in file order, or an empty list when the
        progress file does not exist.
    """
    if not os.path.exists(progress_file):
        return []

    with open(progress_file, 'r') as file:
        stripped = (line.strip() for line in file)
        # dict.fromkeys removes duplicates while preserving order.
        return list(dict.fromkeys(url for url in stripped if url))

# Save the current state of links_to_traverse to the progress file
def save_progress():
    """Write every link in ``links_to_traverse`` to ``progress_file``.

    The list is written to a temporary file next to the target and then
    moved over it with ``os.replace``. An interruption part-way through
    the write therefore leaves the previous progress file in place rather
    than a truncated one. The parent directory is created when it does not
    exist.
    """
    parent_dir = os.path.dirname(progress_file)
    if parent_dir:  # an empty dirname means "current directory"; nothing to create
        os.makedirs(parent_dir, exist_ok=True)

    tmp_path = progress_file + '.tmp'
    with open(tmp_path, 'w') as file:
        file.writelines(link + '\n' for link in links_to_traverse)
    os.replace(tmp_path, progress_file)

# Main crawling loop
links_to_traverse = load_progress() or [base_url]  # Load progress or start fresh
current_link = None  # link popped from the queue but not yet fully processed

try:
    while links_to_traverse:
        current_link = links_to_traverse.pop(0)
        if current_link not in visited_links:
            print(f"Processing: {current_link}")
            visited_links.add(current_link)
            process_link(current_link)

            # Save progress after processing each link
            save_progress()
        current_link = None
except KeyboardInterrupt:
    # The in-flight link has already been popped, so it would be missing
    # from the saved queue. Put it back so a later run retries it.
    if current_link is not None:
        links_to_traverse.insert(0, current_link)
    save_progress()
    print("Crawling interrupted. Progress saved.")
    raise

# Final save of progress when done
save_progress()
print("Crawling complete. Progress saved.")


Processing: http://edition.cnn.com/2021/01/28/opinions/us-must-be-prepared-to-face-domestic-terror-panetta/index.html
Saved article link: https://edition.cnn.com/business/markets-now
Saved article link: https://edition.cnn.com/us
Saved article link: https://edition.cnn.com/business/tech/mission-ahead
Saved article link: https://edition.cnn.com/world/united-kingdom
Saved article link: https://edition.cnn.com/ad-choices
Saved article link: https://edition.cnn.com/travel/videos
Saved article link: https://edition.cnn.com/travel
Saved article link: https://edition.cnn.com/business/work-transformed
Saved article link: https://edition.cnn.com/travel/destinations
Saved article link: https://edition.cnn.com/health/life-but-better/sleep
Saved article link: https://edition.cnn.com/world/photos
Saved article link: https://edition.cnn.com/world/africa/inside-africa
Saved article link: https://edition.cnn.com/style/design
Saved article link: https://edition.cnn.com/travel/stay
Saved article link: h

KeyboardInterrupt: 

### Text Extraction

In [None]:
import csv
import requests
import os
from bs4 import BeautifulSoup

# File paths (relative to the current working directory)
# NOTE(review): these paths start with 'Assignment1/', while the link-crawling
# cell writes its CSV to 'Abhyudaya/Scripts and Data/cnn_article_links.csv'
# without that prefix. Confirm the two cells are run from working directories
# that make these the same file.
csv_file_path = 'Assignment1/Abhyudaya/Scripts and Data/cnn_article_links.csv'
visited_links_file = 'Assignment1/Abhyudaya/Scripts and Data/CNN_visited_links.txt'
txt_files_directory = 'Assignment1/Abhyudaya/Data/cnn'

# Ensure the output directory exists. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs(txt_files_directory, exist_ok=True)

# Collect the first column of every non-empty CSV row
def get_links(csv_file_path):
    """Return the first field of each non-empty row in the given CSV file.

    Args:
        csv_file_path: Path of a UTF-8 encoded CSV file.

    Returns:
        A list of strings in file order; empty rows are skipped.
    """
    first_fields = []
    with open(csv_file_path, newline='', encoding='utf-8') as handle:
        for record in csv.reader(handle):
            if not record:
                continue
            first_fields.append(record[0])
    return first_fields

# Load the set of links already handled in earlier runs
def get_visited_links(visited_links_file):
    """Return the whitespace-stripped lines of the given file as a set.

    Args:
        visited_links_file: Path of a UTF-8 text file with one link per line.

    Returns:
        A set with one entry per distinct stripped line, or an empty set
        when the file does not exist.
    """
    seen = set()
    if not os.path.exists(visited_links_file):
        return seen

    with open(visited_links_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            seen.add(raw_line.strip())
    return seen

# Record a link as handled by appending it to the visited-links file
def add_to_visited(link):
    """Append ``link`` followed by a newline to ``visited_links_file``.

    Args:
        link: URL string to record.
    """
    with open(visited_links_file, mode='a', encoding='utf-8') as log:
        entry = link + '\n'
        log.write(entry)

# Fetch content from the link and extract text using BeautifulSoup
def fetch_and_extract_text(link, timeout=10):
    """Download an article page and return the text of its body paragraphs.

    Args:
        link: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a server that stops responding would stall the
            whole extraction run.

    Returns:
        The text of every ``<p>`` inside the first
        ``<div class="article__content">``, joined with newlines. Returns
        ``None`` when the request fails, the div is missing, or the div
        contains no ``<p>`` tags; each case prints a message.
    """
    # Only the network call can raise RequestException, so only it is guarded.
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {link}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with class="article__content"
    content_div = soup.find('div', {'class': 'article__content'})
    if not content_div:
        print(f"No 'article__content' div found in {link}, skipping.")
        return None

    # Extract text from all <p> tags inside the div
    paragraphs = [p.get_text() for p in content_div.find_all('p')]
    if not paragraphs:
        print(f"No <p> tags found in 'article__content' div of {link}, skipping.")
        return None

    return '\n'.join(paragraphs)  # Combine all paragraphs into a single string

# Write one article's text to its numbered file in the output directory
def save_to_file(content, file_number):
    """Write ``content`` to ``cnn_<file_number>.txt`` in ``txt_files_directory``.

    An existing file with the same name is overwritten.

    Args:
        content: Text to store.
        file_number: Value placed in the file name after ``cnn_``.
    """
    target_name = f'cnn_{file_number}.txt'
    destination = os.path.join(txt_files_directory, target_name)
    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(content)

# The main logic to process the links
links = get_links(csv_file_path)
visited_links = get_visited_links(visited_links_file)

# Files are numbered by the link's 1-based position in the CSV
# (cnn_1.txt, cnn_2.txt, ...), so numbering is stable across reruns.
for file_number, link in enumerate(links, start=1):
    if link in visited_links:
        print(f"Skipping {link}, already visited.")
        continue

    content = fetch_and_extract_text(link)
    if content:
        save_to_file(content, file_number)
        add_to_visited(link)  # Mark link as visited on disk for later runs
        # Also mark it in memory: if the CSV lists the same link again further
        # down, it is skipped instead of being fetched and saved a second time.
        visited_links.add(link)
        print(f"Saved content from {link} to {txt_files_directory}/cnn_{file_number}.txt")
