# Load the NASDAQ-100 daily stock prices dataset from Kaggle, then download one year of price history per ticker with yfinance


In [None]:
# import shutil
# import os 

# # Delete the entire yfinance folder and its contents
# shutil.rmtree("../data/raw/yfinance", ignore_errors=True)

# # Remove Kaggle CSV files but keep the folder
# for file in os.listdir("../data/raw"):
#     if file.endswith(".csv"):
#         os.remove(f"../data/raw/{file}")

# print("Old CSV files deleted.")

import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import yfinance as yf

# Authenticate with the Kaggle API, then download and unzip the dataset
# archive into ../data/raw.
api = KaggleApi()
api.authenticate()

dataset = "svaningelgem/nasdaq-100-daily-stock-prices"
api.dataset_download_files(dataset, path="../data/raw", unzip=True)

# os.listdir returns entries in arbitrary order; sort so that the CSV picked
# below is the same on every run, even if several CSV files are present.
downloaded_files = sorted(os.listdir("../data/raw"))
csv_candidates = [f for f in downloaded_files if f.endswith('.csv')]
if not csv_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .csv file found in ../data/raw after downloading {dataset}"
    )
csv_file = csv_candidates[0]

# Load the CSV and show its size plus the first and last rows as a sanity check.
df_kaggle_data = pd.read_csv(f"../data/raw/{csv_file}")
print(f"Loaded Kaggle dataset with {len(df_kaggle_data)} rows")
print(df_kaggle_data.head(5))
print(df_kaggle_data.tail(5))

from tqdm import tqdm
import time

# Symbols to download, one request per symbol. Kept as a whitespace-separated
# block so the set is easy to scan and edit; .split() produces a list of str
# in the order written here.
tickers = """
    AAPL MSFT AMZN NVDA META GOOGL GOOG TSLA AVGO COST
    PEP ADBE CSCO TMUS CMCSA NFLX HON TXN QCOM INTC
    AMD GILD INTU AMGN BKNG ADP VRTX REGN ISRG ADI
    PANW MU MDLZ PYPL SBUX MELI KLAC SNPS CDNS ASML
    CHTR LRCX MAR WDAY ABNB NXPI ORLY MNST ADSK KDP
    PDD AEP DXCM CTAS ROST BIIB IDXX MCHP MRVL CSX
    CRWD FAST PCAR ODFL KHC PAYX VRSK EXC FANG CPRT
    XEL MRNA ANSS CEG AZN DASH TEAM ZS DLTR EA
    FTNT GFS HES ILMN LULU ON SIRI WBD WBA BKR
    DDOG ENPH FISV GEHC JD LCID RIVN SWKS TTD VRSN
    ZM
""".split()

# Overrides applied to a symbol before it is requested: dotted share-class
# symbols are replaced by their hyphenated form. Symbols without an entry
# are requested unchanged. (Neither key currently appears in `tickers`.)
ticker_mappings = {"BRK.B": "BRK-B", "BF.B": "BF-B"}

# Outcome log filled in by the download loop:
#   'success' -> list of ticker symbols that were saved
#   'fail'    -> list of (ticker, error message) tuples
results = {"success": [], "fail": []}

# Download one year of price history per ticker and save each as its own CSV,
# recording every outcome in `results`.
yfinance_dir = "../data/raw/yfinance"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
os.makedirs(yfinance_dir, exist_ok=True)

# Download with progress bar
for ticker in tqdm(tickers, desc="Downloading Stocks"):
    # Translate to the symbol form expected by the data source, if overridden.
    yf_ticker = ticker_mappings.get(ticker, ticker)
    try:
        df = yf.download(yf_ticker, period="1y", progress=False)

        if df.empty:
            # yf.download can hand back an empty frame without raising (for
            # example when a symbol has no data); do not write an empty CSV
            # or count it as a success.
            results['fail'].append((ticker, f"no data returned for {yf_ticker}"))
        else:
            df.to_csv(f"{yfinance_dir}/{ticker}.csv")
            results['success'].append(ticker)

    except Exception as e:
        # Best-effort batch: log the failure and carry on with the next ticker.
        results['fail'].append((ticker, str(e)))

    # Pause between requests regardless of outcome so that a run of failures
    # does not turn into a burst of back-to-back requests.
    time.sleep(0.2)

# Print summary: counts of successful and failed downloads.
print(f"\n✅ Successfully downloaded {len(results['success'])} stocks")
print(f"❌ Failed to download {len(results['fail'])} stocks")

# List each failure with the error message recorded for it.
if results['fail']:
    print("\nFailed downloads:")
    for ticker, error in results['fail']:
        print(f"- {ticker}: {error}")

# Quick verification: show up to five files from the output folder.
# os.listdir raises FileNotFoundError when the folder is missing, so fall back
# to an empty sample in that case; sort so the sample is the same on every run.
yfinance_dir = "../data/raw/yfinance"
if os.path.isdir(yfinance_dir):
    sample_files = sorted(os.listdir(yfinance_dir))[:5]
else:
    sample_files = []
print("\nSample downloaded files:", sample_files)