<a id='Q0'></a>
<center> <h1> Aviation Herald Project: Update the Dataset</h1> </center>
<p style="margin-bottom:1cm;"></p>
<center><h4>Laurent Bobay, 2024</h4></center>
<p style="margin-bottom:1cm;"></p>

<div style="background:#EEEDF5;border-top:0.1cm solid #EF475B;border-bottom:0.1cm solid #EF475B;">
    <div style="margin-left: 0.5cm;margin-top: 0.5cm;margin-bottom: 0.5cm;color:#303030">
        <p><strong>Goal:</strong> Update the dataset of all publicly available articles and comments from www.avherald.com with the newly published entries</p>
        <strong> Outline:</strong>
        <a id='P0' name="P0"></a>
        <ol>
            <li> <a style="color:#303030" href='#SU'>Set up</a></li>
            <li> <a style="color:#303030" href='#P1'>Data Exploration and Cleaning</a></li>
            <li> <a style="color:#303030" href='#P2'>Modeling</a></li>
            <li> <a style="color:#303030" href='#P3'>Model Evaluation</a></li>
            <li> <a style="color:#303030" href='#CL'>Conclusion</a></li>
        </ol>
        <strong>Topics Trained:</strong> Notebook Layout, Data Cleaning, Modelling and Model Evaluation
    </div>
</div>

<nav style="text-align:right"><strong>
        <a style="color:#00BAE5" href="https://monolith.propulsion-home.ch/backend/api/momentum/materials/ds-materials/07_MLEngineering/index.html" title="momentum"> Module 7, Machine Learning Engineering </a>|
        <a style="color:#00BAE5" href="https://monolith.propulsion-home.ch/backend/api/momentum/materials/ds-materials/07_MLEngineering/day1/index.html" title="momentum">Day 1, Data Science Project Development </a>|
        <a style="color:#00BAE5" href="https://drive.google.com/file/d/1SOCQu9Gv3jNNXxvJSszBC3fYNsM0df2F/view?usp=sharing" title="momentum"> Live Coding 1, Simple Prediction Notebook</a>
</strong></nav>

In [22]:

import pandas as pd
import re
from tqdm import tqdm

from scraping_helpers import *
from preprocessing_helpers import *

In [24]:
# # File path for the local storage
# filepath = "../data/processed/av_dataset.csv"
# df = pd.read_csv(filepath)



# df0 = df.iloc[:6000]
# df1 = df.iloc[6000:18000]
# df2 = df.iloc[18000:]

# # Write them to file
# filepath0 = "../data/processed/av_dataset0.csv"
# filepath1 = "../data/processed/av_dataset1.csv"
# filepath2 = "../data/processed/av_dataset2.csv"
# df0.to_csv(filepath0, sep=',', index=False, header=True, na_rep='NULL', encoding='utf-8')
# df1.to_csv(filepath1, sep=',', index=False, header=True, na_rep='NULL', encoding='utf-8')
# df2.to_csv(filepath2, sep=',', index=False, header=True, na_rep='NULL', encoding='utf-8')


In [11]:
def _scrape_new_articles(base_url, last_href):
    """Scrape the articles that are listed on the site but missing from the dataset.

    Uses the project helpers ``get_new_titles_and_hrefs``, ``load_page``,
    ``get_article``, ``get_comments`` and ``find_occurrence_type`` (they are
    expected to be in scope through the star imports at the top of the notebook).

    Args:
        base_url: Root URL of the website; each href is appended to it.
        last_href: href of the most recent article already stored in the dataset.

    Returns:
        pd.DataFrame with one row per scraped article and the raw columns
        title, href, text, time_author, headline, comment_authors, comments,
        occurrence and url. The frame is empty when no hrefs are returned.
    """
    # presumably returns only the entries published after `last_href` — verify in scraping_helpers
    titles, hrefs = get_new_titles_and_hrefs(base_url, last_href)

    texts = []
    time_authors = []
    headlines = []
    comment_authors = []
    comments = []
    occurrences = []
    urls = []

    # Load every new article page once and extract article, comments and occurrence type
    for href in tqdm(hrefs, desc="Updating Dataset"):
        url = base_url + href
        page = load_page(url)
        article_text, time_author = get_article(page)
        headline_text, comment_authors_texts, comments_texts = get_comments(page)

        texts.append(article_text)
        time_authors.append(time_author)
        headlines.append(headline_text)
        comment_authors.append(comment_authors_texts)
        comments.append(comments_texts)
        occurrences.append(find_occurrence_type(headline_text))
        urls.append(url)

    return pd.DataFrame({
        "title": titles,
        "href": hrefs,
        "text": texts,
        "time_author": time_authors,
        "headline": headlines,
        "comment_authors": comment_authors,
        "comments": comments,
        "occurrence": occurrences,
        "url": urls
    })


def _preprocess_articles(df):
    """Derive the processed columns for freshly scraped articles.

    Adds author/created/updated (from ``time_author``), collapses whitespace in
    ``text``, and adds normalized_text, cities, countries and flight_phase.
    ``nltk``, ``geonamescache`` and the preprocessing helpers are expected to be
    in scope through the star imports at the top of the notebook.

    Args:
        df: Raw frame as returned by ``_scrape_new_articles`` (must not be empty).

    Returns:
        A new pd.DataFrame; the input frame is not modified.
    """
    df = df.copy()

    # Split the combined "time_author" value into three separate columns
    df["author"], df["created"], df["updated"] = zip(*df["time_author"].apply(get_author_and_time))

    # Make sure all items in the 'text' column are strings (missing text becomes " ")
    df["text"] = df["text"].apply(lambda x: str(x) if pd.notna(x) else " ")

    # Collapse every run of whitespace (line breaks, tabs, spaces) into a single space
    df["text"] = df["text"].apply(lambda text: re.sub(r"\s+", " ", text))

    # Ensure the necessary NLTK resources are available: the stop word list is
    # read right below, so the 'stopwords' corpus has to be downloaded as well
    nltk.download('punkt')
    nltk.download('stopwords')
    stop_words = nltk.corpus.stopwords.words('english')

    # Build lower-cased, accent-free lists of all city and country names
    geo_cache = geonamescache.GeonamesCache()
    city_names = [remove_accented_chars(city['name']).lower() for city in geo_cache.get_cities().values()]
    country_names = [remove_accented_chars(country['name']).lower() for country in geo_cache.get_countries().values()]

    # Normalize each text; tqdm.pandas() registers `progress_apply` on pandas objects
    tqdm.pandas(desc="Normalizing texts")
    df["normalized_text"], df["cities"], df["countries"] = zip(
        *df["text"].progress_apply(lambda x: preprocess_text(x, stop_words, city_names, country_names))
    )

    # Drop rows where there is no text; copy so that the column assignment below
    # writes to an independent frame instead of a slice (no SettingWithCopyWarning)
    df = df[df["text"].notna()].copy()

    # Assign the flight phase to each new row
    df["flight_phase"] = df["text"].apply(assign_flight_phase)

    return df


def update(filepath="../data/processed/av_dataset.csv"):
    """Update the dataset stored at ``filepath`` with the newest entries of the website.

    The CSV is read, the href in its first row is used as the marker of the
    last known article, all newer articles are scraped and preprocessed, put in
    front of the existing rows and the result is written back to ``filepath``.

    Args:
        filepath: Path of the CSV file holding the full dataset. It is read and,
            if new articles are found, overwritten.

    Returns:
        pd.DataFrame: the updated dataset, or the unchanged old dataset when
        there is nothing to update (in that case the file is not rewritten).
    """
    # Read in the current (not up-to-date) dataset
    old_df = pd.read_csv(filepath)
    print(f"Length of previous df: {len(old_df)}")

    # New rows are always put on top, so the first row holds the latest known article
    last_href = old_df.iloc[0].href

    # Scrape the missing items
    df = _scrape_new_articles("https://avherald.com", last_href)

    # If there are no updates, return the old dataset untouched
    if len(df) == 0:
        print("Nothing to update")
        return old_df
    print(f"{len(df)} new articles found")

    # Preprocess the new rows and put them in front of the existing ones
    df = _preprocess_articles(df)
    df_new = pd.concat([df, old_df], ignore_index=True)

    # Write the updated dataset back to the same CSV file
    df_new.to_csv(filepath, sep=',', index=False, header=True, na_rep='NULL', encoding='utf-8')

    print("Update complete")

    return df_new


In [12]:
# Path of the dataset to update; defined in this cell so that it also runs on a
# fresh kernel (no other active cell of the notebook assigns `filepath`)
filepath = "../data/processed/av_dataset.csv"

df = update(filepath)
df.head(3)

Length of previous df: 29025


Loading hrefs: 100%|██████████| 2/2 [00:00<00:00, 26296.58it/s]
Updating Dataset: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
[nltk_data] Downloading package punkt to /Users/laurent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3 new articles found


Normalizing texts: 100%|██████████| 3/3 [00:00<00:00, 20.99it/s]


Update complete


<div style="border-top:0.1cm solid #EF475B"></div>
    <strong><a href='#Q0'><div style="text-align: right"> <h3>End of this Notebook.</h3></div></a></strong>