In [37]:
!pip install yfinance
!pip install requests
!pip install requests beautifulsoup4 openai lxml
!pip install newspaper3k
!pip install lxml[html_clean]
!pip install lxml_html_clean


zsh:1: no matches found: lxml[html_clean]
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


Gather Stock Data

In [35]:
import yfinance as yf
import pandas as pd

# Function to fetch stock data
def fetch_stock_data(ticker, start_date, end_date):
    # Fetch stock data
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    # Return only the "Close" prices and the Date
    stock_data = stock_data[['Close']].reset_index()
    return stock_data

# Example: Fetch data for Apple (AAPL) from January 1, 2010 to March 1, 2025
ticker = 'AMZN'
start_date = '2010-01-01'
end_date = '2025-03-01'
stock_data = fetch_stock_data(ticker, start_date, end_date)

# Add year and stock price
stock_data['year'] = stock_data['Date'].dt.year
stock_data['stock_price'] = stock_data['Close']  # Renaming Close to stock_price

# Initialize all emotion values to 0
emotion_columns = ['optimism', 'anxiety', 'sadness', 'surprise', 'neutral', 'anger_disgust']
for col in emotion_columns:
    stock_data[col] = 0  # Set all emotions to 0 initially

# Resample to get only the first trading day of each month
stock_data_monthly = stock_data.resample('MS', on='Date').first()  # 'MS' stands for Month Start

# Reset the index to bring the 'Date' back as a column
stock_data_monthly = stock_data_monthly.reset_index()

# Rename 'Date' to 'date'
stock_data_monthly = stock_data_monthly.rename(columns={'Date': 'date'})

# Select the columns to match the required format
final_data = stock_data_monthly[['date', 'stock_price', 'year'] + emotion_columns]

# Dynamically name the CSV file based on the ticker symbol
csv_filename = f"{ticker}_graph.csv"
final_data.to_csv(csv_filename, index=False)

# Print the first few rows to verify the output
print(f"Saved to {csv_filename}")
print(final_data.head())


[*********************100%***********************]  1 of 1 completed

Saved to AMZN_graph.csv
Price        date stock_price  year optimism anxiety sadness surprise neutral  \
Ticker                                                                          
0      2010-01-01      6.6950  2010        0       0       0        0       0   
1      2010-02-01      5.9435  2010        0       0       0        0       0   
2      2010-03-01      6.2270  2010        0       0       0        0       0   
3      2010-04-01      6.5905  2010        0       0       0        0       0   
4      2010-05-01      6.8745  2010        0       0       0        0       0   

Price  anger_disgust  
Ticker                
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  



  stock_data_monthly = stock_data.resample('MS', on='Date').first()  # 'MS' stands for Month Start


Gathering Articles

In [42]:
import yfinance as yf
import requests
import time
import csv
from datetime import datetime
from newspaper import Article
from urllib.parse import quote_plus

# Set GNews API key
GNEWS_API_KEY = "3cef4efb93580128793bd14f13b1abf7"

# List of common suffixes to remove
SUFFIXES = [' Inc.', ' Ltd.', ' LLC', ' Corp.', ' Corporation', ' Co.', ' Group']

# List of domain-like suffixes to remove
DOMAIN_SUFFIXES = ['.com', '.org', '.net', '.co', '.edu']

# Function to get company name from ticker symbol using Yahoo Finance
def get_company_name_from_ticker(ticker):
    try:
        stock = yf.Ticker(ticker)
        company_name = stock.info['longName']
        
        # Remove common corporate suffixes from the company name
        for suffix in SUFFIXES:
            if company_name.endswith(suffix):
                company_name = company_name.replace(suffix, '').strip()

        # Remove domain-like suffixes (e.g., .com, .org, .net) anywhere in the company name
        for suffix in DOMAIN_SUFFIXES:
            company_name = company_name.lower().replace(suffix, '').strip()
        
        print(f"Processed Company Name: {company_name}")  # Debugging line to check the final name
        return company_name
    except KeyError:
        print(f"Error: Company name not found for ticker {ticker}. Using fallback name.")
        return None
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}. Using fallback name.")
        return None

# Function to fetch articles for a given year
def fetch_articles_for_year(api_key, company_name, year):
    start_date = f'{year}-01-01T00:00:00Z'
    end_date = f'{year}-12-31T23:59:59Z'
    query = quote_plus(f'"{company_name}"')  # URL encode the company name for exact match
    url = f'https://gnews.io/api/v4/search?q={query}&from={start_date}&to={end_date}&lang=en&max=10&token={api_key}&in=title,description'
    
    print(f"Requesting URL: {url}")  # Debugging line to print the URL
    for _ in range(3):  # Retry up to 3 times
        response = requests.get(url)
        print(f"Response status: {response.status_code}")  # Debugging line to print the response status
        if response.status_code == 200:
            return response.json().get('articles', [])
        else:
            print(f"Error fetching articles for {year} (Status {response.status_code}). Retrying...")
            time.sleep(2)
    return []

# Function to fetch articles across multiple years
def fetch_articles(api_key, company_name):
    current_year = datetime.now().year
    all_articles = []
    for year in range(2010, current_year + 1):
        print(f"Fetching articles for {year}...")
        articles = fetch_articles_for_year(api_key, company_name, year)
        if articles:
            all_articles.extend(articles)
        else:
            print(f"No articles found for {year}.")
        time.sleep(2)
    return all_articles

# Function to scrape the full content using Newspaper3k
def scrape_full_content(url):
    article = Article(url)
    for _ in range(3):  # Retry up to 3 times
        try:
            article.download()
            article.parse()
            return article.text
        except Exception as e:
            print(f"Error scraping {url}: {e}. Retrying...")
            time.sleep(2)
    return "Error fetching full content."

# Function to save articles to CSV with full content
def save_articles_to_csv_with_full_content(articles, filename):
    headers = ['Title', 'PublishedAt', 'FullContent', 'URL']
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        for article in articles:
            title = article['title']
            published_at = article['publishedAt']
            url = article['url']
            full_content = scrape_full_content(url)
            if full_content:
                writer.writerow([title, published_at, full_content, url])
    print(f"Articles saved to {filename}")

# Main function to get company name and save articles
def main():
    ticker_symbol = input("Enter the ticker symbol of the company: ").upper()
    
    # Get company name using Yahoo Finance API 
    company_name = get_company_name_from_ticker(ticker_symbol) 
    
    print(f"Fetching news articles for {company_name} ({ticker_symbol})...")
    
    articles_data = fetch_articles(GNEWS_API_KEY, company_name)
    
    if articles_data:
        # Save articles to CSV with the ticker symbol in the filename
        filename = f"{ticker_symbol}_news.csv"
        save_articles_to_csv_with_full_content(articles_data, filename)
        
        # Display the first 5 articles for confirmation
        for article in articles_data[:5]:
            print(f"Title: {article['title']}")
            print(f"PublishedAt: {article['publishedAt']}")
            print(f"URL: {article['url']}")
            print(f"Full Content: {scrape_full_content(article['url'])}\n")
    else:
        print(f"No articles found for {company_name}.")

if __name__ == "__main__":
    main()


Processed Company Name: amazon,
Fetching news articles for amazon, (AMZN)...
Fetching articles for 2010...
Requesting URL: https://gnews.io/api/v4/search?q=%22amazon%2C%22&from=2010-01-01T00:00:00Z&to=2010-12-31T23:59:59Z&lang=en&max=10&token=3cef4efb93580128793bd14f13b1abf7&in=title,description
Response status: 200
Fetching articles for 2011...
Requesting URL: https://gnews.io/api/v4/search?q=%22amazon%2C%22&from=2011-01-01T00:00:00Z&to=2011-12-31T23:59:59Z&lang=en&max=10&token=3cef4efb93580128793bd14f13b1abf7&in=title,description
Response status: 200
Fetching articles for 2012...
Requesting URL: https://gnews.io/api/v4/search?q=%22amazon%2C%22&from=2012-01-01T00:00:00Z&to=2012-12-31T23:59:59Z&lang=en&max=10&token=3cef4efb93580128793bd14f13b1abf7&in=title,description
Response status: 403
Error fetching articles for 2012 (Status 403). Retrying...
Response status: 403
Error fetching articles for 2012 (Status 403). Retrying...
Response status: 403
Error fetching articles for 2012 (Status

KeyboardInterrupt: 