In [None]:
import pandas as pd
import yfinance as yf
import mplfinance as mpf
import os
from datetime import datetime
import requests
import glob
import re
from tqdm import tqdm


In [None]:
"""
Fetch detailed company information from Yahoo Finance for each ticker file.
Parameters:
    tickers_dir: Directory containing CSV files with ticker symbols.
    output_dir: Directory where the stock info CSVs will be saved.
    limit: 
        - If int, only that many tickers will be processed.
        - If 'all' or a number larger than total, all tickers will be processed.
"""
def fetch_stock_info(tickers_dir="tickers", output_dir="stocks_info", limit="all"):
    """
    Fetch company information from Yahoo Finance for every ticker file.

    Each ``*.csv`` file in ``tickers_dir`` is read, the values of its
    ``Symbol`` column are looked up through ``yf.Ticker(symbol).info`` and the
    collected records are written to ``output_dir`` under the same file name
    with ``_tickers`` replaced by ``_info``.

    Parameters:
        tickers_dir: Directory containing CSV files with ticker symbols.
        output_dir: Directory where the stock info CSVs will be saved
            (created when it does not exist).
        limit:
            - If a positive int smaller than the number of symbols, only the
              first ``limit`` tickers of each file are processed.
            - Any other value ('all', a non-positive number, or a number
              larger than the total) processes all tickers.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the files are processed in the same order on every run.
    for file in sorted(os.listdir(tickers_dir)):
        if not file.endswith(".csv"):
            continue

        input_path = os.path.join(tickers_dir, file)
        output_path = os.path.join(output_dir, file.replace("_tickers", "_info"))

        df = pd.read_csv(input_path)
        if "Symbol" not in df.columns:
            # Report and move on instead of aborting the whole run with a KeyError.
            print(f"No 'Symbol' column found in {input_path}")
            continue

        # Drop missing values first so NaN never turns into the text "nan".
        cleaned = df["Symbol"].dropna().astype(str).str.strip()
        symbols = cleaned[cleaned != ""].unique().tolist()

        # Apply limit: only a positive int below the total truncates the list.
        # A negative value must not reach the slice, where it would cut from the end.
        use_limit = (
            isinstance(limit, int)
            and not isinstance(limit, bool)
            and 0 < limit < len(symbols)
        )
        if use_limit:
            symbols = symbols[:limit]
            print(f"Processing first {limit} tickers from {file}.")
        else:
            print(f"Processing all {len(symbols)} tickers from {file}.")

        print(f"\nFetching data for {len(symbols)} symbols...")

        info_list = []

        for sym in tqdm(symbols, desc=f"{file}"):
            try:
                info = yf.Ticker(sym).info
                if info:
                    info_list.append(info)
            except Exception as e:
                # Best effort: one failing ticker must not stop the batch.
                print(f"Warning: Could not retrieve data for {sym}: {e}")

        if info_list:
            info_df = pd.DataFrame(info_list)
            # utf-8-sig writes a BOM at the start of the file.
            info_df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"Saved {len(info_df)} records to {output_path}")
        else:
            print(f"No data retrieved for {file}.")

In [None]:
# Build the info CSVs for every ticker file, with no cap on the number of tickers.
fetch_stock_info(tickers_dir="tickers", output_dir="stocks_info", limit="all")

In [None]:
# Only the header row is needed to list the column names, so skip parsing the data rows.
info_columns = pd.read_csv("stocks_info/bist100_info.csv", nrows=0).columns
for col in info_columns:
    print(col)

In [None]:
def download_index_prices(index_name, csv_path, limit="all", start_date="auto", end_date="auto"):
    """
    Download price history for the tickers of one index, one CSV per ticker.

    Symbols are read from the ``Symbol`` column of ``csv_path`` and fetched
    with ``yf.download``. Each non-empty result is written to
    ``stocks_price_data/{index_name}_stocks/{symbol}.csv``.

    Parameters:
        index_name: Short index name; used for the output folder and in messages.
        csv_path: CSV file holding a ``Symbol`` column.
        limit:
            - If a positive int smaller than the number of symbols, only the
              first ``limit`` tickers are downloaded.
            - Any other value (e.g. 'all') downloads every ticker.
        start_date: Start date passed to ``yf.download``; "auto" means "1900-01-01".
        end_date: End date passed to ``yf.download``; "auto" means today's date.
            NOTE(review): yfinance is believed to treat ``end`` as exclusive,
            which would leave out today's bar - confirm this is intended.

    Returns:
        None. Files are written to disk and progress is printed. When the
        ``Symbol`` column is missing, a message is printed and nothing is
        downloaded.
    """
    # Output directory
    data_dir = f"stocks_price_data/{index_name}_stocks"
    os.makedirs(data_dir, exist_ok=True)

    # Date control
    if start_date == "auto":
        start_date = "1900-01-01"
    if end_date == "auto":
        end_date = datetime.today().strftime("%Y-%m-%d")

    # Read tickers
    df = pd.read_csv(csv_path)
    if "Symbol" not in df.columns:
        print(f"No 'Symbol' column found in {csv_path}")
        return

    # Drop missing values BEFORE the string conversion: astype(str) would turn
    # NaN into the literal ticker "nan". Blank and repeated symbols are removed
    # too, keeping the first occurrence so the file order is preserved.
    cleaned = df["Symbol"].dropna().astype(str).str.strip()
    symbols = cleaned[cleaned != ""].drop_duplicates().tolist()

    # Limit control
    total = len(symbols)
    if isinstance(limit, int) and 0 < limit < total:
        print(f"Processing first {limit} tickers from {index_name.upper()} (total {total}).")
        symbols = symbols[:limit]
    else:
        print(f"Processing all {total} tickers from {index_name.upper()}.")

    print(f"\nDownloading {len(symbols)} tickers from {index_name.upper()}...")

    for symbol in symbols:
        try:
            df_data = yf.download(symbol, start=start_date, end=end_date)
            if not df_data.empty:
                file_path = os.path.join(data_dir, f"{symbol}.csv")
                df_data.to_csv(file_path)
                print(f"{symbol}: {len(df_data)} rows saved.")
            else:
                print(f"{symbol}: no data found.")
        except Exception as e:
            # Best effort: a single failing ticker must not abort the batch.
            print(f"{symbol} error: {e}")

    print(f"All data saved to '{data_dir}/'\n")

In [None]:
# Ticker list for each index, keyed by the short name used for its output folder.
index_files = {
    "sp500": "tickers/sp500_tickers.csv",
    "dowjones": "tickers/dowjones_tickers.csv",
    "nasdaq100": "tickers/nasdaq100_tickers.csv",
    "nifty50": "tickers/nifty50_tickers.csv",
    "bist100": "tickers/bist100_tickers.csv",
    "sse": "tickers/sse_tickers.csv",
}

# Download prices for every index whose ticker file is present; report the others.
for name, path in index_files.items():
    if not os.path.exists(path):
        print(f"Skipping {name}, file not found: {path}")
        continue
    download_index_prices(name, path, limit="all")

In [None]:
# Peek at one downloaded file to see the layout it was saved with.
aapl_raw_path = os.path.join("stocks_price_data/sp500_stocks", "AAPL.csv")
df = pd.read_csv(aapl_raw_path)
for preview in (df.head(), df.columns):
    print(preview)

In [None]:
"""
    Clean all downloaded stock CSVs inside subfolders (sp500_stocks, dowjones_stocks, etc.)
    Removes header issues and converts columns to standard format.
"""
    
def _looks_like_date(value):
    """Return True when ``value`` parses as a timestamp; junk text and NaN give False."""
    try:
        return not pd.isna(pd.Timestamp(value))
    except (ValueError, TypeError, OverflowError):
        return False


def clean_all_stock_data(base_dir="stocks_price_data"):
    """
    Clean every stock CSV found in the subfolders of ``base_dir`` in place.

    For each file the leading non-data header rows are removed, the columns
    are renamed to Date, Close, High, Low, Open, Volume, the date column is
    converted to datetime, the price/volume columns to numbers, and the file
    is overwritten.

    The function is safe to run more than once: header rows are recognised by
    the first column not holding a date, so a file that was already cleaned
    keeps all of its data rows.

    Parameters:
        base_dir: Directory whose immediate subfolders hold the ``*.csv`` files.

    Returns:
        None. Files are rewritten on disk and progress is printed. Files with
        fewer than three rows are skipped; a failure on one file is printed
        and the remaining files are still processed.
    """
    # Get all subfolders
    subfolders = [entry.path for entry in os.scandir(base_dir) if entry.is_dir()]

    for folder in subfolders:
        csv_files = glob.glob(os.path.join(folder, "*.csv"))
        print(f"\n=== Cleaning {len(csv_files)} files in {os.path.basename(folder)} ===")

        for file in csv_files:
            symbol = os.path.splitext(os.path.basename(file))[0]

            try:
                df_raw = pd.read_csv(file)

                # If file is too short or incomplete, skip
                if len(df_raw) < 3:
                    print(f"{symbol}: not enough data, skipped.")
                    continue

                # Drop only those of the first two rows that are not data rows.
                # Dropping them unconditionally would delete two real rows on
                # every re-run over an already cleaned file.
                header_rows = [
                    idx
                    for idx, value in df_raw.iloc[:2, 0].items()
                    if not _looks_like_date(value)
                ]
                df_clean = df_raw.drop(index=header_rows)

                # Fix column names. This relies on the file having exactly these
                # six columns in this order; any other width raises and is
                # reported by the handler below.
                df_clean.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]

                # Convert date to datetime
                df_clean["Date"] = pd.to_datetime(df_clean["Date"], errors="coerce")

                # Convert numeric columns to float
                for col in ["Open", "High", "Low", "Close", "Volume"]:
                    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

                df_clean = df_clean.reset_index(drop=True)

                # Overwrite the original file
                df_clean.to_csv(file, index=False)

                print(f"{symbol}: {len(df_clean)} rows cleaned and saved.")

            except Exception as e:
                # Best effort: report the file and continue with the next one.
                print(f"{symbol}: error → {e}")

    print("\nAll files cleaned successfully!")

# Clean every file below the price-data folder in place.
clean_all_stock_data(base_dir="stocks_price_data")

In [None]:
# Re-read the same file to check its layout after the cleaning step.
df = pd.read_csv(os.path.join("stocks_price_data/sp500_stocks", "AAPL.csv"))
print(df.head(), df.columns, sep="\n")

In [None]:
def plot_stock(file_path, symbol, period_days=180, currency="USD"):
    """
    Draw a candlestick chart of the most recent rows of one stock CSV.

    Parameters:
        file_path: Directory that contains ``{symbol}.csv`` (with or without a
            trailing slash).
        symbol: Ticker symbol; also the CSV file name without extension.
        period_days: Number of most recent rows to plot (must be >= 1). Each
            row is presumably one trading day - confirm against the data.
        currency: Currency code shown in the y-axis label. Defaults to "USD";
            pass e.g. "TRY" or "INR" for non-US listings.

    Returns:
        None. The chart is drawn with ``mpf.plot``.

    Raises:
        ValueError: If ``period_days`` is smaller than 1 or the file holds no rows.
    """
    if period_days < 1:
        raise ValueError(f"period_days must be at least 1, got {period_days}")

    # os.path.join avoids the doubled slash when file_path already ends with one.
    csv_path = os.path.join(file_path, f"{symbol}.csv")

    # Read CSV with the Date column as the index.
    # NOTE(review): assumes the file also has Open/High/Low/Close columns as
    # candlestick charts need them - verify for files produced elsewhere.
    df = pd.read_csv(csv_path, parse_dates=["Date"]).set_index("Date")

    # Get last period_days rows
    df_tail = df.tail(period_days)
    if df_tail.empty:
        raise ValueError(f"No price rows found in {csv_path}")

    # Create plot; the title reports the number of rows actually shown, which
    # is smaller than period_days when the file is short.
    mpf.plot(
        df_tail,
        type="candle",
        title=f"{symbol.upper()} (Last {len(df_tail)} Days)",
        ylabel=f"Price ({currency})",
        style="yahoo",
        datetime_format="%Y-%m-%d",
        xrotation=15,
        figratio=(16, 9),
        figscale=1.3,
        tight_layout=True
    )


In [None]:
# Example: candlestick chart of the latest 180 rows for AAPL.
plot_stock(file_path="stocks_price_data/sp500_stocks/", symbol="AAPL", period_days=180)