# Web-Scraping www.avherald.com

In [67]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import csv
import time

## Define scraping functions:

In [91]:
def load_page(url):
    """Fetch ``url`` with an HTTP GET and return the response object.

    The request is sent with the module-level ``headers`` dict and gives up
    after a 5-second timeout.
    """
    return requests.get(url, timeout=5, headers=headers)

def get_next_page_href(page):
    """Return the href of the 'next' link found on ``page``.

    The link is located through the <img> whose src is '/images/next.jpg';
    the href is read from the <a> element that encloses that image.
    Returns None (after printing a short notice) when either the image or
    its enclosing <a> is missing.
    """
    document = BeautifulSoup(page.content, "html.parser")

    # The pagination arrow is an image; the link itself is its ancestor <a>.
    arrow_img = document.find('img', {'src': '/images/next.jpg'})
    if not arrow_img:
        print("Image with the specified src attribute not found.")
        return None

    enclosing_link = arrow_img.find_parent('a')
    if not enclosing_link:
        print("Parent <a> tag not found.")
        return None

    return enclosing_link.get('href')

def get_article_titles_and_hrefs(page):
    """Collect the headline text and link target of every article on ``page``.

    Every <span class="headline_avherald"> is treated as one article
    headline; its href is read from the <a> element that encloses it.

    Returns:
        A tuple ``(article_titles, article_hrefs)`` of two lists of equal
        length, where ``article_hrefs[i]`` belongs to ``article_titles[i]``.
    """
    # Convert to soup object
    soup = BeautifulSoup(page.content, "html.parser")

    article_titles = []
    article_hrefs = []
    for span in soup.find_all('span', {'class': 'headline_avherald'}):
        parent_a_tag = span.find_parent('a')
        if not parent_a_tag:
            # A headline without an enclosing link has no href to pair with.
            # Skipping it keeps both lists index-aligned; appending only the
            # title would shift every later title onto the wrong href once
            # the caller zips the two lists together.
            continue
        article_titles.append(span.get_text())
        article_hrefs.append(parent_a_tag.get('href'))

    return article_titles, article_hrefs

def get_all_titles_and_hrefs(url):
    """Walk the paginated listing starting at ``url`` and gather all articles.

    Each page is loaded with ``load_page``; its titles and hrefs are
    collected and the 'next' link is followed until no such link is found.
    Follow-up pages are addressed as "https://avherald.com" + the relative
    href, and a progress counter is printed before each of them is loaded.

    Returns:
        A dict mapping article title -> href. Because titles are the keys,
        a title that occurs more than once keeps only its last href.
    """
    collected_titles = []
    collected_hrefs = []

    current_url = url
    pages_followed = 0
    while True:
        response = load_page(current_url)

        page_titles, page_hrefs = get_article_titles_and_hrefs(response)
        collected_titles.extend(page_titles)
        collected_hrefs.extend(page_hrefs)

        relative_next = get_next_page_href(response)
        if not relative_next:
            # No further page: the listing has been fully traversed.
            break

        print(f'Page: {pages_followed}', end='\r', flush=True)
        current_url = "https://avherald.com" + relative_next
        pages_followed += 1

    return dict(zip(collected_titles, collected_hrefs))

def write_titles(title_dict):
    """Write the title -> href mapping to 'titles.csv' in the working directory.

    The file is overwritten. It starts with the header row ``title,href``
    followed by one row per dictionary item, in the dictionary's order.
    """
    columns = ['title', 'href']

    # Lazily turn each (title, href) pair into the row mapping DictWriter expects.
    records = (
        {'title': headline, 'href': link}
        for headline, link in title_dict.items()
    )

    # newline='' lets the csv module control line endings itself.
    with open('titles.csv', 'w', newline='') as csv_file:
        out = csv.DictWriter(csv_file, fieldnames=columns)
        out.writeheader()
        out.writerows(records)

def get_article(page):
    """Extract the article body and the time/author line from an article page.

    The body is taken from the fourth <span class="sitetext"> on the page
    (index 3); the time/author line from the first
    <span class="time_avherald">.

    Returns:
        A tuple ``(article_text, time_author)``. The placeholder "xxx" is
        returned as text when fewer than four 'sitetext' spans exist, and
        "yyy" as time/author when no 'time_avherald' span exists.
    """
    document = BeautifulSoup(page.content, "html.parser")

    sitetext_spans = document.find_all('span', {'class': 'sitetext'})
    article_text = sitetext_spans[3].get_text() if len(sitetext_spans) > 3 else "xxx"

    stamp_span = document.find('span', {'class': 'time_avherald'})
    time_author = "yyy" if stamp_span is None else stamp_span.get_text()

    return article_text, time_author

def get_comments(page):
    """Extract the headline, comment authors and comment texts of an article page.

    Returns:
        A tuple ``(headline_text, comment_authors_texts, comments_texts)``:
        the text of the <span class="headline_article"> (None if absent),
        the texts of the 'time_avherald' spans except the first, and the
        texts of the 'sitecomment' spans except the last.
    """
    document = BeautifulSoup(page.content, "html.parser")

    # The article headline span; per the original notes it contains the event date.
    headline_span = document.find('span', {'class': 'headline_article'})
    headline_text = headline_span.get_text() if headline_span else None

    author_spans = document.find_all('span', {'class': 'time_avherald'})
    comment_spans = document.find_all('span', {'class': 'sitecomment'})

    # 'time_avherald' is also used just below the headline, so the first hit
    # is the article's own stamp rather than a comment author.
    comment_authors_texts = [span.get_text() for span in author_spans[1:]]

    # The final 'sitecomment' span is left out, as in the original code
    # (presumably it is not a reader comment; verify against the page markup).
    comments_texts = [span.get_text() for span in comment_spans[:-1]]

    return headline_text, comment_authors_texts, comments_texts


## Scrape the article hrefs

In [None]:
# # Entry
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
# URL = "https://avherald.com"

# # Scrape titles and hrefs to articles
# titles = get_all_titles_and_hrefs(URL)
# print(len(titles))
# write_titles(titles)
# print("done!")

Image with the specified src attribute not found.
28963
done!


## Scrape the articles

In [78]:
# # Scrape articles and write them to a file
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
# URL = "https://avherald.com"
# LAST_ENTRY = 2810

# def scrape_articles(title_file):
    # """ From the titles.csv, line by line opens the page, gets the text and the timestamp and appends it to 'articles.csv' """

    # # Set a counter to keep track
    # counter = 0

    # # Open title.csv
    # input_file_path = title_file
    # output_file_path = "articles.csv"
    # fieldnames = ['title', 'href', 'text', 'time_author']

    # # Open the CSV file
    # with open(input_file_path, mode='r', newline='') as input_file:
    #     # Open output file for appending ('a' mode)
    #     with open(output_file_path, 'a', newline='', encoding='utf-8') as output_file:
    #         reader = csv.DictReader(input_file)
    #         writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    #         # # Write header row
    #         # writer.writeheader()
        
    #         # Iterate over each row in the CSV file
    #         for row in reader:
    #             # Increment counter
    #             counter += 1
    #             print(f'article: {counter}', end='\r', flush=True)

    #             # # Break for testing:
    #             # if counter == 800:
    #             #     break

    #             if counter > LAST_ENTRY:
    #                 # Access the 'href' column in each row
    #                 href_value = str(row["href"])
    #                 title = row["title"]

    #                 # Create the page url
    #                 url = URL + href_value

    #                 # Load page:
    #                 page = load_page(url)
    #                 text, time_author = get_article(page)

    #                 # Write to output file title, href, text, time
    #                 row = {"title": title, "href": href_value, "text": text, "time_author": time_author}
    #                 writer.writerow(row)

    #                 # Add a time delay of 1s:
    #                 time.sleep(1)

                


    # print("done!")


In [79]:
#scrape_articles("titles.csv")


done!le: 28963


## Scrape Comments

In [124]:
# Settings for the comment scrape: headline, comments and comment authors are
# fetched per article and written to a file.
URL = "https://avherald.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36"
    ),
}
# Rows whose running counter is not greater than this value are skipped.
LAST_ENTRY = 0

def scrape_comments(title_file):
    """Scrape headline and comments for every href listed in ``title_file``.

    ``title_file`` is a CSV with an 'href' column. For each row the page at
    ``URL + href`` is loaded, its headline, comment authors and comments are
    extracted, and one row (headline, href, comment_authors, comments) is
    appended to 'comments.csv'. No header row is written.

    A running counter is printed for every input row; it starts at 2 for the
    first data row. Rows whose counter is not greater than ``LAST_ENTRY`` are
    skipped without a request. One second is slept after each scraped page.
    """
    columns = ['headline', 'href', 'comment_authors', 'comments']

    # Input is read as CSV; output is opened in append mode so earlier
    # results in comments.csv are kept.
    with open(title_file, mode='r', newline='') as source, \
            open("comments.csv", 'a', newline='', encoding='utf-8') as sink:
        reader = csv.DictReader(source)
        writer = csv.DictWriter(sink, fieldnames=columns)

        for position, entry in enumerate(reader, start=2):
            print(f'article: {position}', end='\r', flush=True)

            if position <= LAST_ENTRY:
                continue

            href_value = str(entry["href"])
            response = load_page(URL + href_value)
            headline, comment_authors, comments = get_comments(response)

            writer.writerow({
                "headline": headline,
                "href": href_value,
                "comment_authors": comment_authors,
                "comments": comments,
            })

            # Pause for 1s between requests.
            time.sleep(1)

In [125]:
# Call for the complete title list (currently disabled):
#scrape_comments("titles.csv")
# Scrape the hrefs listed in missing_href.csv; rows are appended to comments.csv.
scrape_comments("missing_href.csv")

article: 12

In [12]:
### TODO: Write an update function


## Create the dataframe

# Analysis of the Data

## 1. How many jobs are shared between these categories?

## 2. How much do the keywords “Data Analyst” and “Big Data Analyst” overlap?

## 3. Are there some companies doing more hires than average?

## 4. How many jobs are there in different cantons?

## 5. Is “machine learning” keyword more often in data scientist or data analyst jobs?


## 6. What is the distribution of most common keywords between and across categories?