# Loads Top 100 NASDAQ daily stock prices DATASET from Kaggle + yfinance DATASET


In [1]:
# import shutil
# import os 

# # Delete the entire yfinance folder and its contents
# shutil.rmtree("../data/raw/yfinance", ignore_errors=True)

# # Remove Kaggle CSV files but keep the folder
# for file in os.listdir("../data/raw"):
#     if file.endswith(".csv"):
#         os.remove(f"../data/raw/{file}")

# print("Old CSV files deleted.")

import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import yfinance as yf

# api = KaggleApi()
# api.authenticate()

# dataset = "svaningelgem/nasdaq-100-daily-stock-prices"
# api.dataset_download_files(dataset, path="../data/raw", unzip=True)

# downloaded_files = os.listdir("../data/raw")
# csv_file = [f for f in downloaded_files if f.endswith('.csv')][0]

# df_kaggle_data = pd.read_csv(f"../data/raw/{csv_file}")
# print(f"Loaded Kaggle dataset with {len(df_kaggle_data)} rows")
# print(df_kaggle_data.head(5))
# print(df_kaggle_data.tail(5))

from tqdm import tqdm
import time

tickers = [
    "AAPL", "MSFT", "AMZN", "NVDA", "META", "GOOGL", "GOOG",
    "TSLA", "AVGO", "COST", "PEP", "ADBE", "CSCO", "TMUS",
    "CMCSA", "NFLX", "HON", "TXN", "QCOM", "INTC", "AMD",
    "GILD", "INTU", "AMGN", "BKNG", "ADP", "VRTX", "REGN",
    "ISRG", "ADI", "PANW", "MU", "MDLZ", "PYPL", "SBUX",
    "MELI", "KLAC", "SNPS", "CDNS", "ASML", "CHTR", "LRCX",
    "MAR", "WDAY", "ABNB", "NXPI", "ORLY", "MNST", "ADSK",
    "KDP", "PDD", "AEP", "DXCM", "CTAS", "ROST", "BIIB",
    "IDXX", "MCHP", "MRVL", "CSX", "CRWD", "FAST", "PCAR",
    "ODFL", "KHC", "PAYX", "VRSK", "EXC", "FANG", "CPRT",
    "XEL", "MRNA", "ANSS", "CEG", "AZN", "DASH", "TEAM",
    "ZS", "DLTR", "EA", "FTNT", "GFS", "HES", "ILMN",
    "LULU", "ON", "SIRI", "WBD", "WBA", "BKR", "DDOG",
    "ENPH", "FISV", "GEHC", "JD", "LCID", "RIVN", "SWKS",
    "TTD", "VRSN", "ZM"
]

ticker_mappings = {
    'BRK.B': 'BRK-B',
    'BF.B': 'BF-B'
}

results = {
    'success': [],
    'fail': []
}

for ticker in tqdm(tickers, desc="Downloading Stocks"):
    try:
        yf_ticker = ticker_mappings.get(ticker, ticker)
        
        df = yf.download(yf_ticker, period="1y", progress=False)
        
        df.to_csv(f"../data/raw/yfinance/{ticker}.csv")
        results['success'].append(ticker)
        
        time.sleep(0.2)
        
    except Exception as e:
        results['fail'].append((ticker, str(e)))

print(f"\n✅ Successfully downloaded {len(results['success'])} stocks")
print(f"❌ Failed to download {len(results['fail'])} stocks")

if results['fail']:
    print("\nFailed downloads:")
    for ticker, error in results['fail']:
        print(f"- {ticker}: {error}")

print("\nSample downloaded files:", os.listdir("../data/raw/yfinance")[:5])

Downloading Stocks:   0%|          | 0/101 [00:00<?, ?it/s]

YF.download() has changed argument auto_adjust default to True


Downloading Stocks:  91%|█████████ | 92/101 [00:57<00:04,  1.99it/s]
1 Failed download:
['FISV']: YFPricesMissingError('possibly delisted; no price data found  (period=1y) (Yahoo error = "No data found, symbol may be delisted")')
Downloading Stocks: 100%|██████████| 101/101 [01:19<00:00,  1.27it/s]


✅ Successfully downloaded 101 stocks
❌ Failed to download 0 stocks

Sample downloaded files: ['.gitkeep', 'AAPL.csv', 'ABNB.csv', 'ADBE.csv', 'ADI.csv']





### `01_data_importation.ipynb`

#### 📌 Purpose
- Downloads two datasets:
  1. **NASDAQ-100 daily stock prices** (CSV) from Kaggle
  2. **1-year historical data** for 100 NASDAQ-listed stocks using Yahoo Finance (`yfinance`)

---

#### 🔑 Key Components
1. **Kaggle Dataset Download**:
   - Uses `KaggleApi` to fetch [svaningelgem/nasdaq-100-daily-stock-prices](https://www.kaggle.com/datasets/svaningelgem/nasdaq-100-daily-stock-prices)
   - Saves to `../data/raw/` and loads into a pandas DataFrame.

2. **Yahoo Finance Downloader**:
   - Fetches 1-year data for 100 hardcoded tickers (e.g., AAPL, MSFT).
   - Handles ticker name mappings (e.g., `BRK.B` → `BRK-B`).
   - Saves each as a separate CSV in `../data/raw/yfinance/`.

3. **Error Handling**:
   - Tracks successes/failures with a `results` dictionary.
   - Implements a 0.2s delay between requests to avoid rate limits.

---

#### ⚙️ Technical Details
- **Inputs**: 
  - Kaggle dataset name (hardcoded).
  - List of 100 NASDAQ tickers (hardcoded).
- **Outputs**: 
  - Kaggle CSV → Loaded into `df_kaggle_data`.
  - Individual CSV files per ticker in `yfinance/` folder.
- **Dependencies**:
  ```python
  pandas, kaggle_api, yfinance, tqdm, os, time