In [28]:
!pip install yfinance
!pip install requests
!pip install requests beautifulsoup4 openai lxml
!pip install lxml





Gather Stock Data

In [22]:
import yfinance as yf
import pandas as pd
import numpy as np

# Function to fetch stock data
def fetch_stock_data(ticker, start_date, end_date):
    # Fetch stock data
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    # Return only the "Close" prices and the Date
    stock_data = stock_data[['Close']].reset_index()
    return stock_data

# Example: Fetch data for Apple (AAPL) from January 1, 2010 to March 1, 2025
ticker = 'AAPL'
start_date = '2010-01-01'
end_date = '2025-03-01'
stock_data = fetch_stock_data(ticker, start_date, end_date)

# Placeholder emotional tone data (assuming a simple range from 0 to 10)
# Simulate some values, in real case you would replace them with actual analysis
np.random.seed(42)  # For reproducibility
emotion_data = {
    'optimism': np.random.uniform(5, 7, len(stock_data)),
    'anxiety': np.random.uniform(2, 5, len(stock_data)),
    'sadness': np.random.uniform(1, 4, len(stock_data)),
    'surprise': np.random.uniform(3, 6, len(stock_data)),
    'neutral': np.random.uniform(4, 6, len(stock_data)),
    'anger_disgust': np.random.uniform(2, 5, len(stock_data))
}

# Add year, stock price and emotion data to the dataframe
stock_data['year'] = stock_data['Date'].dt.year
stock_data['optimism'] = emotion_data['optimism']
stock_data['anxiety'] = emotion_data['anxiety']
stock_data['sadness'] = emotion_data['sadness']
stock_data['surprise'] = emotion_data['surprise']
stock_data['neutral'] = emotion_data['neutral']
stock_data['anger_disgust'] = emotion_data['anger_disgust']
stock_data['stock_price'] = stock_data['Close']  # Renaming Close to stock_price

# Resample to get only the first trading day of each month
stock_data_monthly = stock_data.resample('MS', on='Date').first()  # 'MS' stands for Month Start

# Reset the index to bring the 'Date' back as a column
stock_data_monthly = stock_data_monthly.reset_index()

# Rename 'Date' to 'date'
stock_data_monthly = stock_data_monthly.rename(columns={'Date': 'date'})

# Select the columns to match the required format
final_data = stock_data_monthly[['date', 'stock_price', 'year', 'optimism', 'anxiety', 'sadness', 'surprise', 'neutral', 'anger_disgust']]

# Save to a new CSV file (this will create a new file every time the script is run)
final_data.to_csv('stock_data_with_emotions_monthly.csv', index=False)

# Print the first few rows to verify the output
print(final_data.head())


[*********************100%***********************]  1 of 1 completed

Price        date stock_price  year  optimism   anxiety   sadness  surprise  \
Ticker                                                                        
0      2010-01-01    6.440331  2010  5.749080  3.841911  3.669062  5.357401   
1      2010-02-01    5.860126  2010  5.582458  4.470871  3.067430  5.715264   
2      2010-03-01    6.289261  2010  6.368466  2.349075  2.072540  5.178318   
3      2010-04-01    7.101188  2010  5.542698  2.125746  1.935110  3.925065   
4      2010-05-01    8.015430  2010  5.661796  4.473415  2.612138  3.053635   

Price    neutral anger_disgust  
Ticker                          
0       4.393373      2.564841  
1       5.216639      4.771540  
2       4.697012      3.025596  
3       4.024832      4.787747  
4       5.050352      4.670469  



  stock_data_monthly = stock_data.resample('MS', on='Date').first()  # 'MS' stands for Month Start


Gathering Articles

In [30]:
import requests
import time
import csv
from datetime import datetime
from bs4 import BeautifulSoup
import openai

# Set OpenAI API key and GNews API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GNEWS_API_KEY = os.getenv("NEWS_API_KEY")

# Function to fetch articles for a given year
def fetch_articles_for_year(api_key, company_name, year):
    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'
    url = f'https://gnews.io/api/v4/search?q={company_name}&from={start_date}&to={end_date}&lang=en&max=10&token={api_key}'
    
    for _ in range(3):  # Retry up to 3 times
        response = requests.get(url)
        if response.status_code == 200:
            return response.json().get('articles', [])
        else:
            print(f"Error fetching articles for {year} (Status {response.status_code}). Retrying...")
            time.sleep(2)
    return []

# Function to fetch articles across multiple years
def fetch_articles(api_key, company_name):
    current_year = datetime.now().year
    all_articles = []
    for year in range(2024, current_year + 1):
        print(f"Fetching articles for {year}...")
        articles = fetch_articles_for_year(api_key, company_name, year)
        if articles:
            all_articles.extend(articles)
        else:
            print(f"No articles found for {year}.")
        time.sleep(2)
    return all_articles

# Function to scrape the article content (fallback)
def scrape_full_content_requests(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Remove unwanted elements such as advertisements, sidebars, footers
        for ad_tag in soup.find_all(["aside", "footer", "header", "nav", "iframe", "script", "style"]):
            ad_tag.decompose()
        
        text_content = soup.get_text(separator="\n", strip=True)
        return text_content[:3000] if text_content else "Full content not found"
    except requests.exceptions.RequestException:
        return "Requests failed"

# GPT-4 extraction for articles with filtering for ads and unwanted text
def fetch_article_with_gpt(url):
    prompt = f"Extract only the relevant article content from the following news article URL, removing advertisements, unrelated content, or any irrelevant text. URL: {url}" 
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4", 
            messages=[{"role": "system", "content": "You are a content extraction assistant. Your task is to extract only the article content from a web page, excluding ads or any unrelated information."},
                      {"role": "user", "content": prompt}],
            max_tokens=2000
        )
        return response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"GPT extraction failed: {str(e)}"

# Main function to scrape full content
def scrape_full_content(url):
    content = scrape_full_content_requests(url)
    if "Full content not found" in content or "Requests failed" in content:
        print(f"Trying GPT-4 extraction for {url}...")
        content = fetch_article_with_gpt(url)
    return content

# Function to save articles to CSV with full content
def save_articles_to_csv_with_full_content(articles, filename='articles_with_full_content.csv'):
    headers = ['Title', 'PublishedAt', 'FullContent', 'URL']
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        for article in articles:
            title = article['title']
            published_at = article['publishedAt']
            url = article['url']
            full_content = scrape_full_content(url)
            if full_content != "Full content not found" and full_content != "Requests failed":
                writer.writerow([title, published_at, full_content, url])
    print(f"Articles saved to {filename}")

# Example usage
company_name = 'Tesla'
articles_data = fetch_articles(GNEWS_API_KEY, company_name)
save_articles_to_csv_with_full_content(articles_data)

# Display the first 5 articles for confirmation
for article in articles_data[:5]:
    print(f"Title: {article['title']}")
    print(f"PublishedAt: {article['publishedAt']}")
    print(f"URL: {article['url']}")
    print(f"Full Content: {scrape_full_content(article['url'])}\n")


Fetching articles for 2024...
Fetching articles for 2025...
Articles saved to articles_with_full_content.csv
Title: Dealers ‘stiffed’ by Tesla’s rush to claim last EV rebates
PublishedAt: 2025-03-07T10:00:00Z
URL: https://www.thestar.com/news/canada/tesla-gamed-the-system-canadian-auto-dealers-stiffed-millions-when-u-s-giant-rushed-to/article_6d1025c6-fa0a-11ef-b780-a73277202cb2.html
Full Content: Dealers ‘stiffed’ by Tesla’s rush to claim last EV rebates
Skip to main content
You are the owner of this article.
Edit Article
Add New Article
Close
You have permission to edit this article.
Edit
Close
Close
Home
News
Canada
Oakville auto dealer Terry Budd says he is out more than $150,000 after Tesla jumped to snap up the last of the iZEV EV rebates.
Michelle Mengsu Chang/
Toronto Star
By
Marco Chown Oved
Climate Change Reporter
More than 200 auto dealers across the country were “stiffed” when Tesla “had a run on the bank,” claiming tens of millions of dollars in EV rebates on the last week