In [16]:
import yfinance as yf
import pandas as pd
import time
from alpha_vantage.timeseries import TimeSeries
from datetime import datetime
import os
import requests
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader
from sec_api import QueryApi, SecLitigationsApi, AaerApi
from urllib.parse import urljoin
from datasets import load_dataset

import ssl
import urllib3
import pytz

### Chosen Stocks
1. GameStop (GME): Meme stock with documented pump-and-dump-like behavior in 2021.
2. AMC Entertainment (AMC): Another meme stock with high volatility and social media influence. In same year 2021, short squeeze triggered by Reddit communities.
3. Tilray Brands (TLRY): Cannabis stock, often volatile and targeted in pump-and-dump schemes.
4. Sundial Growers (SNDL): Penny stock in the cannabis sector, prone to manipulation.
5. Naked Brand (NAKD): Penny stock with historical pump-and-dump activity.
6. Zomedica (ZOM): Biotech penny stock, often hyped on social media.
7. Apple (AAPL): Blue-chip stock for insider trading analysis.
8. Tesla (TSLA): Large-cap stock with high news coverage and volatility.
9. Canoo (GOEV): Small-cap electric vehicle stock, volatile and prone to hype.
10. Plug Power (PLUG): Small-cap energy stock, often targeted in manipulation schemes.

In [2]:
stocks = ['GME', 'AMC', 'TLRY', 'SNDL', 'NAKD', 'ZOM', 'AAPL', 'TSLA', 'GOEV', 'PLUG']
# stocks = ['SNDL']
form_types = ['10-K', '10-Q', '8-K']
start_date = '2018-01-01'
end_date = '2023-12-31'
intraday_stocks = ['GME', 'AMC', 'SNDL']  # Subset for intraday data
intraday_year = 2021  # Focusing on 2021 for intraday due to API limits
alpha_vantage_api_key = 'JMLBULM9O2APELTQ' 
sec_api_key = '72e3942a367fbcd428ad3a4cd4b2c0af7ebedf3e2d3097224b5cae926f0ac003'
# output_dir = 'stock_data'
# os.makedirs(output_dir, exist_ok=True)
# output_dir2 = 'sec_finra_data'
# os.makedirs(output_dir2, exist_ok=True)
# os.makedirs(f"{output_dir2}/sec_filings", exist_ok=True)
# os.makedirs(f"{output_dir2}/sec_enforcement", exist_ok=True)
# os.makedirs(f"{output_dir2}/finra_actions", exist_ok=True)

In [None]:
# Collecting daily stock data using yfinance
def collect_yfinance_data():
    for stock in stocks:
        try:
            print(f"Downloading daily data for {stock}...")
            ticker = yf.Ticker(stock)
            df = ticker.history(start=start_date, end=end_date, interval='1d')
            if not df.empty:
                df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
                df.to_csv(f"{output_dir}/{stock}_daily.csv")
                print(f"Saved {stock}_daily.csv")
            else:
                print(f"No data for {stock}")
        except Exception as e:
            print(f"Error downloading {stock}: {e}")
        time.sleep(1)

collect_yfinance_data()

Downloading daily data for GME...
Saved GME_daily.csv
Downloading daily data for AMC...
Saved AMC_daily.csv
Downloading daily data for TLRY...
Saved TLRY_daily.csv
Downloading daily data for SNDL...
Saved SNDL_daily.csv
Downloading daily data for NAKD...


$NAKD: possibly delisted; no timezone found


No data for NAKD
Downloading daily data for ZOM...
Saved ZOM_daily.csv
Downloading daily data for AAPL...
Saved AAPL_daily.csv
Downloading daily data for TSLA...
Saved TSLA_daily.csv
Downloading daily data for GOEV...
Saved GOEV_daily.csv
Downloading daily data for PLUG...
Saved PLUG_daily.csv


In [22]:
# Initializing SEC downloader
dl = Downloader("UOG","anshulaggarwal2666@gmail.com", f"{output_dir2}/sec_filings")

# Collecting SEC filings (10-K, 10-Q, 8-K)
def collect_sec_filings():
    for stock in stocks:
        for form_type in form_types:
            try:
                print(f"Downloading {form_type} filings for {stock}...")
                dl.get(form_type, stock, after=start_date, before=end_date)
                print(f"Saved {form_type} filings for {stock}")
            except Exception as e:
                print(f"Error downloading {form_type} for {stock}: {e}")
            time.sleep(0.1)

collect_sec_filings()

Downloading 10-K filings for SNDL...
Saved 10-K filings for SNDL
Downloading 10-Q filings for SNDL...
Saved 10-Q filings for SNDL
Downloading 8-K filings for SNDL...
Saved 8-K filings for SNDL


In [None]:
def fetch_huggingface_twitter_data(stocks, start_date, end_date, output_dir):
    """
    Load historical Twitter data from Hugging Face and filter for specified stocks.
    
    Args:
        stocks (list): List of stock tickers (e.g., ['GME', 'AMC']).
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        output_dir (str): Directory to save output CSV.
    """
    # Convert dates to UTC-aware datetime
    start_dt = datetime.strptime(start_date, '%Y-%m-%d').replace(tzinfo=pytz.UTC)
    end_dt = datetime.strptime(end_date, '%Y-%m-%d').replace(tzinfo=pytz.UTC)
    
    dataset_name = "StephanAkkerman/stock-market-tweets-data"

    df_filtered = pd.DataFrame()
    
    # for dataset_name in datasets_to_try:
    print(f"Attempting to load dataset: {dataset_name}")
    try:
        dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)
        df = dataset.to_pandas()
        print(f"Successfully loaded {dataset_name}")
            
        # Ensure date column exists
        date_column = 'created_at' if 'created_at' in df.columns else 'date'
        if date_column not in df.columns:
            print(f"No date column in {dataset_name}. Skipping date filtering.")
            df['created_at'] = pd.NaT
        else:
            # Convert dates to UTC
            df['created_at'] = pd.to_datetime(df[date_column], errors='coerce', utc=True)
            
        # Filter by date range
        if df['created_at'].notna().any():
            df = df[(df['created_at'] >= start_dt) & (df['created_at'] <= end_dt)]
            
        # Filter for stock tickers (case-insensitive)
        stock_pattern = '|'.join([f'\\${t}' for t in stocks] + stocks)  # Match $GME or GME
        df_filtered = df[df['text'].str.contains(stock_pattern, case=False, na=False)]
            
        if not df_filtered.empty:
            df_filtered = df_filtered[['created_at', 'text', 'id']].copy()
            df_filtered['source'] = 'Twitter'
            print(f"Found {len(df_filtered)} tweets from {dataset_name}")
        else:
            print(f"No matching tweets in {dataset_name}")
                
    except requests.exceptions.SSLError as ssl_err:
        print(f"SSLError loading {dataset_name}: {ssl_err}")
        print("Try updating dependencies or checking network settings.")
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
    
    if df_filtered.empty:
        print("No Twitter data retrieved from Hugging Face datasets.")
    
    return df_filtered

def collect_stock_social_data(stocks, start_date, end_date, output_dir):
    """
    Collect Twitter and StockTwits data for specified stocks and save to CSV.
    
    Args:
        stocks (list): List of stock tickers.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        output_dir (str): Directory to save output CSV.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Fetch Twitter data from Hugging Face
    df_twitter = fetch_huggingface_twitter_data(stocks, start_date, end_date, output_dir)
    
    # Combine datasets
    # if not df_twitter.empty and not df_stocktwits.empty:
    #     df_combined = pd.concat([df_twitter, df_stocktwits], ignore_index=True)
    # elif if not df_twitter.empty:
    #     df_combined = df_twitter
    # # elif not df_stocktwits.empty:
    # #     df_combined = df_stocktwits
    # else:
    #     df_combined = pd.DataFrame()
    
    # Save to CSV
    if not df_twitter.empty:
        output_path = f"{output_dir}/stock_social_data.csv"
        df_twitter.to_csv(output_path, index=False)
        print(f"Saved data to {output_path}")
    else:
        print("No data collected")
    
    return df_twitter


# Example usage
output_dir = "twitter_data"
collect_stock_social_data(stocks, start_date, end_date, output_dir)

2025-05-20 17:57:36,572 - INFO - Attempting to load dataset: StephanAkkerman/stock-market-tweets-data
2025-05-20 17:57:42,308 - INFO - Successfully loaded StephanAkkerman/stock-market-tweets-data
2025-05-20 17:58:00,040 - INFO - Found 151336 tweets from StephanAkkerman/stock-market-tweets-data
2025-05-20 17:58:00,048 - INFO - Fetching StockTwits data for GME
2025-05-20 17:58:01,589 - INFO - Fetching StockTwits data for AMC
2025-05-20 17:58:03,111 - INFO - Fetching StockTwits data for TLRY
2025-05-20 17:58:05,168 - INFO - Fetching StockTwits data for SNDL
2025-05-20 17:58:07,042 - INFO - Fetching StockTwits data for NAKD
2025-05-20 17:58:07,397 - ERROR - Error fetching StockTwits for NAKD: 404 Client Error: Not Found for url: https://api.stocktwits.com/api/2/streams/symbol/NAKD.json
2025-05-20 17:58:07,401 - INFO - Fetching StockTwits data for ZOM
2025-05-20 17:58:07,684 - ERROR - Error fetching StockTwits for ZOM: 404 Client Error: Not Found for url: https://api.stocktwits.com/api/2/st

Unnamed: 0,created_at,text,id,source
9,2020-04-09 23:56:58+00:00,RT @TDANetwork: 📽️ #TheWatchList panel assesse...,13,Twitter
10,2020-04-09 23:56:51+00:00,$UMRX bouncing. EXTREMELY OVERSOLD #Coronaviru...,14,Twitter
28,2020-04-09 23:55:05+00:00,$AAPL 4h/1h\n\nSometimes these wedges break hi...,32,Twitter
29,2020-04-09 23:54:47+00:00,This week's Expired Signals are now published ...,33,Twitter
30,2020-04-09 23:54:28+00:00,"$SPY $QQQ $VXX $AAPL $BA $MSFT\n\nGuys, I figu...",34,Twitter
...,...,...,...,...
923637,2020-07-16 00:04:02+00:00,RT @TATrades: Quick poll - how much do you (on...,938637,Twitter
923644,2020-07-16 00:03:39+00:00,RT @NukemosS: China retaliating on $AAPL,938644,Twitter
923652,2020-07-16 00:02:21+00:00,lows. This is shaping up for another nice sell...,938652,Twitter
923656,2020-07-16 00:01:48+00:00,@_SeanDavid I could make a good case for why $...,938656,Twitter
