In [1]:
!pip install beautifulsoup4 requests

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.2-py3-none-any.whl (106 kB)
Downloading soupsieve-2.8-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4

   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   ---------------------------------------- 2/2 [beautifulsoup4]

Successfully installed beautifulsoup4-4.14.2 soupsieve-2.8


In [None]:
# Install necessary libraries (if not already installed)
!pip install beautifulsoup4 requests pandas -q
!pip install openpyxl -q

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Read the article URLs from an Excel workbook (adjust the path to your file).
excel_path = 'data/artikel_damri.xlsx'
df_urls = pd.read_excel(excel_path, engine='openpyxl')

# Prefer a column named 'link' (case-insensitive); if the sheet has none,
# fall back to the first column. str() guards against non-string headers
# (e.g. numeric column labels), which have no .lower() method.
url_col = next((c for c in df_urls.columns if str(c).lower() == 'link'), None)
url_series = df_urls[url_col] if url_col is not None else df_urls.iloc[:, 0]

# Drop empty cells, coerce to text, and trim stray whitespace so that
# blank or padded spreadsheet cells do not become bogus requests.
urls = [
    cleaned
    for cleaned in (raw.strip() for raw in url_series.dropna().astype(str))
    if cleaned
]

# Function to scrape content from a URL
def scrape_content(url, timeout=30):
    """Fetch a web page and extract its title and paragraph text.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the call indefinitely.

    Returns:
        A dict describing the outcome:

        * On success (HTTP 200): ``{"url", "title", "content"}`` where
          ``title`` is the text of the page's ``<title>`` tag (``None`` when
          the page has no ``<title>``) and ``content`` is the text of every
          ``<p>`` tag joined with newlines.
        * On a non-200 response or any exception:
          ``{"url", "title": None, "content": None, "error": <message>}``.

    The function never raises; failures are reported through the ``error``
    key so that a batch run over many URLs is not aborted by one bad page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return {
                "url": url,
                "title": None,
                "content": None,
                "error": f"Failed to fetch page, status code: {response.status_code}"
            }

        soup = BeautifulSoup(response.content, 'html.parser')

        # find() returns None when the tag is absent; guard so that a page
        # without <title> still yields its paragraph content instead of
        # being reported as a failure.
        title_tag = soup.find('title')
        title = title_tag.text if title_tag is not None else None

        # The article body is approximated by the text of all <p> tags.
        content = "\n".join(para.text for para in soup.find_all('p'))

        return {
            "url": url,
            "title": title,
            "content": content
        }
    except Exception as e:
        # Deliberately broad: network and parsing errors alike are recorded
        # per-URL rather than propagated.
        return {
            "url": url,
            "title": None,
            "content": None,
            "error": str(e)
        }

# Scrape every URL; each result is a dict, so the list maps directly onto
# DataFrame rows.
data = [scrape_content(url) for url in urls]

# Build the DataFrame once from the full list of records.
df = pd.DataFrame(data)

# Save the results to a CSV file.
df.to_csv('data/scraped_articles.csv', index=False)

# Preview the results. This must be the last expression in the cell:
# Jupyter only renders the value of a cell's final expression.
df.head()