In [17]:
from pygooglenews import GoogleNews
import pandas as pd
import datetime
import time
import random

In [18]:
def get_titles(keyword, start, end):
    """Search Google News for ``keyword`` and return the matching entries.

    Args:
        keyword: Search query passed to ``GoogleNews.search``.
        start: Lower date bound, forwarded as ``from_``.
        end: Upper date bound, forwarded as ``to_``.

    Returns:
        A list of dicts, one per entry in the search result, each with the
        keys ``'title'``, ``'published'`` and ``'link'``.
    """
    # A new English/US client is built for every call.
    client = GoogleNews(lang='en', country='US')
    feed = client.search(keyword, from_=start, to_=end)
    return [
        {'title': entry.title, 'published': entry.published, 'link': entry.link}
        for entry in feed['entries']
    ]

In [19]:
def get_news_for_range(keyword, start_date, end_date, max_titles=5,
                       delay_seconds=1, rng=None):
    """Collect one row of sampled headlines per calendar day.

    For every day from ``start_date`` through ``end_date`` (both inclusive)
    this calls ``get_titles(keyword, day, day + 1)``, keeps at most
    ``max_titles`` of the returned articles, and records their titles joined
    with ``" | "``.

    Args:
        keyword: Search query forwarded to ``get_titles``.
        start_date: First day to fetch (``datetime.date``).
        end_date: Last day to fetch (``datetime.date``), inclusive.
        max_titles: Maximum number of titles kept per day. When a day has
            more articles than this, a random sample of this size is taken.
            The output key is always ``'top_5_titles'`` regardless of value.
        delay_seconds: Pause between consecutive requests. ``0`` or ``None``
            disables the pause. No pause is made after the final request.
        rng: Optional ``random.Random`` instance used for sampling, e.g.
            ``random.Random(42)`` for a reproducible run. Defaults to the
            module-level ``random`` generator.

    Returns:
        A list of dicts with keys ``'date'`` (``YYYY-MM-DD``) and
        ``'top_5_titles'`` (joined titles, ``''`` when the day had none).
    """
    sampler = random if rng is None else rng
    one_day = datetime.timedelta(days=1)
    all_news = []

    current_date = start_date
    while current_date <= end_date:
        next_date = current_date + one_day

        # Format the dates as required for the API
        start_str = current_date.strftime('%Y-%m-%d')
        end_str = next_date.strftime('%Y-%m-%d')

        # Fetch the articles for that day
        daily_news = get_titles(keyword, start_str, end_str)
        print(f"Now checking for articles on {start_str}. There are {len(daily_news)} articles.")

        # Keep everything on quiet days, otherwise draw a random subset
        if len(daily_news) <= max_titles:
            selected_articles = daily_news
        else:
            selected_articles = sampler.sample(daily_news, max_titles)

        daily_entry = {
            'date': start_str,
            'top_5_titles': " | ".join(article['title'] for article in selected_articles),
        }
        print(daily_entry)
        all_news.append(daily_entry)

        # Move to the next day
        current_date = next_date

        # Throttle only when another request will follow; sleeping after the
        # last day would just delay the return.
        if delay_seconds and current_date <= end_date:
            time.sleep(delay_seconds)

    return all_news


In [20]:
# Collection window. get_news_for_range loops while current_date <= end_date,
# so both of these dates are fetched.
start_date = datetime.date(2019, 11, 10)
end_date = datetime.date(2024, 11, 10)

# Fetch the "JNJ" headlines with one request per day in the window. Requests
# are spaced one second apart, so a window this long takes a long time to run.
# `data` is a list of dicts with the keys 'date' and 'top_5_titles'.
data = get_news_for_range("JNJ", start_date, end_date)

Now checking for articles on 2019-11-10. There are 0 articles.
{'date': '2019-11-10', 'top_5_titles': ''}


KeyboardInterrupt: 

In [None]:
# One row per collected day, built from the list of dicts in `data`.
df = pd.DataFrame(data)

# Report the row count first: Jupyter only renders the value of a cell's
# final expression, so the preview has to come last to be displayed.
print(len(df))
df.head()

In [None]:
# index=False keeps the RangeIndex out of the file; otherwise pd.read_csv
# brings it back as a stray 'Unnamed: 0' column.
df.to_csv('../data/JNJ_NEWS.csv', index=False)

In [None]:
# Collection window (same dates as the JNJ run). get_news_for_range loops
# while current_date <= end_date, so both of these dates are fetched.
start_date = datetime.date(2019, 11, 10)
end_date = datetime.date(2024, 11, 10)

# Fetch the "AAPL" headlines with one request per day in the window. Requests
# are spaced one second apart, so a window this long takes a long time to run.
# `data` is a list of dicts with the keys 'date' and 'top_5_titles'; this
# rebinds the `data` name used for the JNJ results.
data = get_news_for_range("AAPL", start_date, end_date)

In [None]:
# One row per collected day, built from the list of dicts in `data`.
df = pd.DataFrame(data)

# Report the row count first: Jupyter only renders the value of a cell's
# final expression, so the preview has to come last to be displayed.
print(len(df))
df.head()

In [None]:
# index=False keeps the RangeIndex out of the file; otherwise pd.read_csv
# brings it back as a stray 'Unnamed: 0' column.
df.to_csv('../data/AAPL_NEWS.csv', index=False)

In [33]:
def remove_repeated_titles(df, column='top_5_titles'):
    """Drop titles from each row that already appear in the row before it.

    Each cell of ``column`` holds titles separated by ``'|'``. Rows are
    processed top to bottom by position; every title in a row that also
    occurs in the previous row is removed, and the remaining titles are
    written back joined with ``' | '`` (the separator used when the data is
    collected).

    Notes:
        * ``df`` is modified in place and also returned.
        * A row is compared against the previous row *as already cleaned*, so
          a title repeated on three consecutive rows is removed from the
          second row but kept on the third.
        * If either row of a pair is missing (NaN/None) or blank, the pair is
          skipped and the later row is left untouched. Missing cells are
          never converted to the string ``'nan'``.

    Args:
        df: DataFrame containing ``column``. Any index is accepted.
        column: Name of the column holding the joined titles.

    Returns:
        The same ``df`` object, after cleaning.
    """
    def _split(cell):
        # Returns the stripped titles in a cell, or None when the cell is
        # missing or blank.
        if pd.isna(cell):
            return None
        text = str(cell).strip()
        if not text:
            return None
        return [title.strip() for title in text.split('|')]

    # Nothing to compare with fewer than two rows.
    if len(df) < 2:
        return df

    # Positional access so the function does not depend on a 0..n-1 index.
    col_pos = df.columns.get_loc(column)

    for pos in range(1, len(df)):
        previous_titles = _split(df.iat[pos - 1, col_pos])
        current_titles = _split(df.iat[pos, col_pos])
        if previous_titles is None or current_titles is None:
            continue

        seen = set(previous_titles)
        kept = [title for title in current_titles if title not in seen]
        df.iat[pos, col_pos] = ' | '.join(kept)

    return df

In [35]:
file_path = '../data/AAPL_NEWS.csv'  # Raw AAPL headlines CSV to clean
df = pd.read_csv(file_path)

# remove_repeated_titles reads and rewrites the 'top_5_titles' column, so the
# CSV must contain a column with exactly that name.

# Drop titles that repeat from the previous row. The function edits `df` in
# place and returns it, so `df_cleaned` and `df` refer to the same object.
df_cleaned = remove_repeated_titles(df)

# Save the cleaned data to a new CSV file, without writing the DataFrame index
df_cleaned.to_csv('../data/cleaned_AAPL_NEWS.csv', index=False)

In [36]:
file_path = '../data/JNJ_NEWS.csv'  # Raw JNJ headlines CSV to clean
df = pd.read_csv(file_path)

# remove_repeated_titles reads and rewrites the 'top_5_titles' column, so the
# CSV must contain a column with exactly that name.

# Drop titles that repeat from the previous row. The function edits `df` in
# place and returns it, so `df_cleaned` and `df` refer to the same object.
df_cleaned = remove_repeated_titles(df)

# Save the cleaned data to a new CSV file, without writing the DataFrame index
df_cleaned.to_csv('../data/cleaned_JNJ_NEWS.csv', index=False)