# Load the NASDAQ-100 daily stock prices dataset from Kaggle, plus per-ticker price history from yfinance


In [4]:
# import shutil
# import os 

# # Delete the entire yfinance folder and its contents
# shutil.rmtree("../data/raw/yfinance", ignore_errors=True)

# # Remove Kaggle CSV files but keep the folder
# for file in os.listdir("../data/raw"):
#     if file.endswith(".csv"):
#         os.remove(f"../data/raw/{file}")

# print("Old CSV files deleted.")

import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import yfinance as yf

# Authenticate Kaggle API
api = KaggleApi()
api.authenticate()

# Download the Kaggle dataset and unzip it into the raw-data folder
dataset = "svaningelgem/nasdaq-100-daily-stock-prices"
raw_dir = "../data/raw"
api.dataset_download_files(dataset, path=raw_dir, unzip=True)

# Pick the first CSV file from the raw-data folder.
# os.listdir() returns entries in arbitrary order, so sort the listing to make
# "first" mean the same file on every run and on every platform.
downloaded_files = sorted(os.listdir(raw_dir))
csv_files = [f for f in downloaded_files if f.endswith('.csv')]
if not csv_files:
    # Fail with a descriptive message instead of an IndexError on [0]
    raise FileNotFoundError(
        f"No CSV files found in {raw_dir} after downloading {dataset}"
    )
csv_file = csv_files[0]

# Load the selected CSV and show its size plus the first and last rows
df_kaggle_data = pd.read_csv(f"{raw_dir}/{csv_file}")
print(f"Loaded Kaggle dataset with {len(df_kaggle_data)} rows")
print(df_kaggle_data.head(5))
print(df_kaggle_data.tail(5))

# Download stock prices from Yahoo Finance for all NASDAQ-100 stocks
from tqdm import tqdm
import time

# NASDAQ-100 ticker symbols (labelled as the 2024 list by the original note).
# Written as one whitespace-separated string and split into a list; the
# result holds 101 symbols in the order shown below.
tickers = """
    AAPL MSFT AMZN NVDA META GOOGL GOOG
    TSLA AVGO COST PEP ADBE CSCO TMUS
    CMCSA NFLX HON TXN QCOM INTC AMD
    GILD INTU AMGN BKNG ADP VRTX REGN
    ISRG ADI PANW MU MDLZ PYPL SBUX
    MELI KLAC SNPS CDNS ASML CHTR LRCX
    MAR WDAY ABNB NXPI ORLY MNST ADSK
    KDP PDD AEP DXCM CTAS ROST BIIB
    IDXX MCHP MRVL CSX CRWD FAST PCAR
    ODFL KHC PAYX VRSK EXC FANG CPRT
    XEL MRNA ANSS CEG AZN DASH TEAM
    ZS DLTR EA FTNT GFS HES ILMN
    LULU ON SIRI WBD WBA BKR DDOG
    ENPH FISV GEHC JD LCID RIVN SWKS
    TTD VRSN ZM
""".split()

# Translate dotted share-class symbols into the dashed spelling that is passed
# to Yahoo Finance (e.g. 'BRK.B' -> 'BRK-B'). Symbols without an entry here
# are used as-is by the download loop.
# NOTE(review): neither key appears in `tickers` as currently defined, so this
# table has no effect on the present run.
ticker_mappings = {
    dotted: dotted.replace('.', '-')
    for dotted in ('BRK.B', 'BF.B')
}

# Track results:
#   'success' -> list of ticker symbols whose CSV was written
#   'fail'    -> list of (ticker, error message) tuples
results = {
    'success': [],
    'fail': []
}

# Make sure the output folder exists; otherwise every to_csv() call below
# would fail and all tickers would be reported as failed.
os.makedirs("../data/raw/yfinance", exist_ok=True)

# Download with progress bar
for ticker in tqdm(tickers, desc="Downloading Stocks"):
    # Get correct ticker symbol for Yahoo Finance
    yf_ticker = ticker_mappings.get(ticker, ticker)

    try:
        # Download data (1 year history)
        df = yf.download(yf_ticker, period="1y", progress=False)

        # An empty frame means no usable data came back; treat it as a failure
        # rather than writing an empty CSV and counting it as a success.
        if df.empty:
            raise ValueError("Empty DataFrame returned")

        # Save to CSV in the yfinance folder, one file per ticker
        df.to_csv(f"../data/raw/yfinance/{ticker}.csv")

    except Exception as e:
        # Deliberately broad: one bad ticker must not abort the whole batch.
        # The error text is kept so the summary can report it.
        results['fail'].append((ticker, str(e)))
    else:
        results['success'].append(ticker)

    # Brief pause after every attempt (including failures) to avoid rate limits
    time.sleep(0.2)

# Report how many tickers were saved and how many failed
succeeded = results['success']
failed = results['fail']
print(f"\n✅ Successfully downloaded {len(succeeded)} stocks")
print(f"❌ Failed to download {len(failed)} stocks")

# List each failed ticker with the error message recorded for it
if failed:
    print("\nFailed downloads:")
    for symbol, reason in failed:
        print(f"- {symbol}: {reason}")

# Quick verification: show up to five entries from the yfinance output folder
sample_files = os.listdir("../data/raw/yfinance")[:5]
print("\nSample downloaded files:", sample_files)






# import pandas as pd
# import yfinance as yf
# from tqdm import tqdm
# import time
# import os
# from datetime import datetime, timedelta
# from kaggle.api.kaggle_api_extended import KaggleApi
# import logging

# # Configure logging
# logging.basicConfig(filename='stock_downloader.log', level=logging.INFO)

# class StockDataDownloader:
#     def __init__(self):
#         self.tickers = [
#     "AAPL", "MSFT", "AMZN", "NVDA", "META", "GOOGL", "GOOG",
#     "TSLA", "AVGO", "COST", "PEP", "ADBE", "CSCO", "TMUS",
#     "CMCSA", "NFLX", "HON", "TXN", "QCOM", "INTC", "AMD",
#     "GILD", "INTU", "AMGN", "BKNG", "ADP", "VRTX", "REGN",
#     "ISRG", "ADI", "PANW", "MU", "MDLZ", "PYPL", "SBUX",
#     "MELI", "KLAC", "SNPS", "CDNS", "ASML", "CHTR", "LRCX",
#     "MAR", "WDAY", "ABNB", "NXPI", "ORLY", "MNST", "ADSK",
#     "KDP", "PDD", "AEP", "DXCM", "CTAS", "ROST", "BIIB",
#     "IDXX", "MCHP", "MRVL", "CSX", "CRWD", "FAST", "PCAR",
#     "ODFL", "KHC", "PAYX", "VRSK", "EXC", "FANG", "CPRT",
#     "XEL", "MRNA", "ANSS", "CEG", "AZN", "DASH", "TEAM",
#     "ZS", "DLTR", "EA", "FTNT", "GFS", "HES", "ILMN",
#     "LULU", "ON", "SIRI", "WBD", "WBA", "BKR", "DDOG",
#     "ENPH", "FISV", "GEHC", "JD", "LCID", "RIVN", "SWKS",
#     "TTD", "VRSN", "ZM"
# ]
#         self.ticker_mappings = {'BRK.B': 'BRK-B', 'BF.B': 'BF-B'}
#         self.data_dir = "../data/raw/yfinance"
#         os.makedirs(self.data_dir, exist_ok=True)

#     def get_kaggle_data(self):
#         """Download historical data from Kaggle as backup"""
#         try:
#             api = KaggleApi()
#             api.authenticate()
#             api.dataset_download_files(
#                 "svaningelgem/nasdaq-100-daily-stock-prices",
#                 path="../data/raw",
#                 unzip=True
#             )
#             logging.info("Kaggle data downloaded successfully")
#         except Exception as e:
#             logging.error(f"Kaggle download failed: {str(e)}")

#     def get_yf_ticker(self, ticker):
#         """Handle special ticker cases"""
#         return self.ticker_mappings.get(ticker, ticker)

#     def download_stock(self, ticker, retries=3):
#         """Robust download with retry logic"""
#         yf_ticker = self.get_yf_ticker(ticker)
        
#         for attempt in range(retries):
#             try:
#                 # Get data with 1 day buffer for timezones
#                 end_date = datetime.today() + timedelta(days=1)
#                 df = yf.download(
#                     yf_ticker,
#                     start="2020-01-01",
#                     end=end_date,
#                     progress=False,
#                     threads=True  # Parallel downloads
#                 )
                
#                 if df.empty:
#                     raise ValueError("Empty DataFrame returned")
                    
#                 # Add metadata
#                 df['ticker'] = ticker
#                 df['data_source'] = 'yfinance'
#                 df['last_updated'] = datetime.now()
                
#                 # Save as parquet (faster, preserves dtypes)
#                 df.to_parquet(f"{self.data_dir}/{ticker}.parquet")
#                 return True
                
#             except Exception as e:
#                 if attempt == retries - 1:
#                     logging.error(f"Failed {ticker} after {retries} attempts: {str(e)}")
#                     return False
#                 time.sleep(2 ** attempt)  # Exponential backoff

#     def run(self):
#         """Main execution with progress tracking"""
#         success = []
#         fail = []
        
#         for ticker in tqdm(self.tickers, desc="Downloading Stocks"):
#             if self.download_stock(ticker):
#                 success.append(ticker)
#             else:
#                 fail.append(ticker)
#             time.sleep(0.5)  # Respect rate limits

#         # Generate report
#         report = {
#             "timestamp": datetime.now(),
#             "success_count": len(success),
#             "failed_tickers": fail,
#             "success_rate": len(success)/len(self.tickers)
#         }
        
#         pd.DataFrame([report]).to_csv("../data/download_report.csv", mode='a')
#         logging.info(f"Download completed. Success: {len(success)}, Failed: {len(fail)}")
#         return report

# if __name__ == "__main__":
#     downloader = StockDataDownloader()
#     downloader.get_kaggle_data()  # Backup source
#     result = downloader.run()
#     print(result)


Dataset URL: https://www.kaggle.com/datasets/svaningelgem/nasdaq-100-daily-stock-prices
Loaded Kaggle dataset with 11135 rows
  ticker        date  open  high   low  close
0   AAPL  1980-12-12  0.13  0.13  0.13   0.13
1   AAPL  1980-12-15  0.12  0.12  0.12   0.12
2   AAPL  1980-12-16  0.11  0.11  0.11   0.11
3   AAPL  1980-12-17  0.12  0.12  0.12   0.12
4   AAPL  1980-12-18  0.12  0.12  0.12   0.12
      ticker        date    open    high     low   close
11130   AAPL  2025-03-24  221.00  221.48  218.58  220.73
11131   AAPL  2025-03-25  220.77  224.10  220.08  223.75
11132   AAPL  2025-03-26  223.51  225.02  220.47  221.53
11133   AAPL  2025-03-27  221.39  224.99  220.60  223.85
11134   AAPL  2025-03-28  221.67  223.81  217.68  217.90


Downloading Stocks: 100%|██████████| 101/101 [00:38<00:00,  2.64it/s]


✅ Successfully downloaded 101 stocks
❌ Failed to download 0 stocks

Sample downloaded files: ['AAPL.csv', 'ABNB.csv', 'ADBE.csv', 'ADI.csv', 'ADP.csv']



