## Webscraping Nairobi Wire: Extracting News Headlines and Summaries

Import beautifulsoup4 and requests

In [13]:
from bs4 import BeautifulSoup
import requests

Set the URL of the `Nairobi Wire` website you want to scrape

In [14]:
# Address of the page that is fetched and scraped below
url = 'https://nairobiwire.com/'

Use requests to get the HTML content of the website

In [15]:
# Always pass a timeout: without one, requests waits indefinitely on a
# server that stops responding and the notebook cell never finishes.
response = requests.get(url, timeout=10)

In [16]:
# Check whether the request was successful (status code 200 means success).
# Inspect the response that was already fetched rather than sending a
# second HTTP request to the site.
response

<Response [200]>

Use BeautifulSoup (beautifulsoup4) to parse the HTML content of the page

In [17]:
# Parse the response body using the 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')

In [66]:
%%capture
# Pretty-print the parsed HTML; the %%capture cell magic above keeps the
# (very long) output from being displayed in the notebook.
print(soup.prettify());

Create a function to get articles and summaries from the website

In [None]:
def scrape_articles(soup, summary_selector='.excerpt56.component56'):
    """Collect the headline and summary of every ``<article>`` in *soup*.

    Args:
        soup: Parsed page (for example a ``BeautifulSoup`` object) that
            supports ``find_all``.
        summary_selector: CSS selector used to locate the summary element
            inside each article. The default requires both the
            ``excerpt56`` and ``component56`` classes, in any order and
            alongside any other classes.

    Returns:
        A list of dicts, one per successfully processed article, with the
        keys ``'No.'`` (1-based position among the articles found),
        ``'headline'``, ``'summary'`` and ``'seperator'``. A missing
        headline or summary element is replaced by placeholder text.
        Articles that raise while being processed are reported with
        ``print`` and skipped, so their number is absent from the result.
    """
    articles_data = []

    for index, article in enumerate(soup.find_all('article'), start=1):
        try:
            headline_tag = article.find('h2')
            # Use a CSS selector so each class is matched independently.
            # find(class_='excerpt56 component56') only matches when the
            # class attribute is exactly that string, so an element with
            # the classes reordered or with an extra class would be missed.
            summary_tag = article.select_one(summary_selector)

            if headline_tag is not None:
                headline = headline_tag.text.strip()
            else:
                headline = 'No headline available'

            if summary_tag is not None:
                summary = summary_tag.text.strip()
            else:
                summary = 'No summary available'

            articles_data.append({
                'No.': index,
                'headline': headline,
                'summary': summary,
                # The key spelling is kept as-is: it is part of the data
                # that existing consumers of this list may read.
                'seperator': '--------------------------',
            })
        except Exception as e:
            # Best effort: one malformed article must not abort the scrape.
            print(f"Error processing article {index}: {e}")

    return articles_data


Print the returned data in a neat format

In [None]:
articles_data = scrape_articles(soup)

# Show each article as a numbered block: headline, summary, then a
# 40-character rule to separate it from the next one.
for article in articles_data:
    print(
        f"Article No. {article['No.']}",
        f"Headline: {article['headline']}",
        f"Summary: {article['summary']}",
        "-" * 40,
        sep="\n",
    )


Article No. 1
Headline: KCSE 2024 Ranking: Is Nyambaria High The Top School in Kenya..?
Summary: The Kenya National Examination Council (KNEC) has categorically denied allegations that it ranked Nyambaria High School or any other institution as the top performer in the 2024 Kenya Certificate of Secondary Education
----------------------------------------
Article No. 2
Headline: IMF Pushes Ruto To Regulate Crypto
Summary: No summary available
----------------------------------------
Article No. 3
Headline: Education Ministry Introduces Mid-Year KCSE Exams Starting This Year
Summary: No summary available
----------------------------------------
Article No. 4
Headline: Kenyan Court Decriminalizes Attempted Suicide, Protects Mental Health Rights
Summary: No summary available
----------------------------------------
Article No. 5
Headline: Morara Kebaso Responds to KRA’s Tax Probe, Calls It a Political Witch-Hunt
Summary: No summary available
----------------------------------------
Article

Create a function to save scraped data into a text file

In [None]:
def save_as_txt(articles_data, filename='scraped_data.txt'):
    """Write the scraped articles to a UTF-8 encoded text file.

    Args:
        articles_data: Iterable of dicts with the keys ``'No.'``,
            ``'headline'`` and ``'summary'``.
        filename: Path of the file to create or overwrite. Defaults to
            ``'scraped_data.txt'`` in the current working directory.

    Raises:
        KeyError: If an entry lacks one of the required keys.
        OSError: If the file cannot be opened for writing.
    """
    # An explicit encoding keeps the output independent of the platform
    # default; headlines may contain non-ASCII characters such as curly
    # quotes, which some locale encodings cannot represent.
    with open(filename, 'w', encoding='utf-8') as file:
        for article in articles_data:
            file.writelines((
                f"Article No. {article['No.']}\n",
                f"Headline: {article['headline']}\n",
                f"Summary: {article['summary']}\n",
                "-" * 40 + "\n",
            ))


In [62]:
save_as_txt(articles_data)  # Write the scraped articles to scraped_data.txt