In [None]:
import yfinance as yf
import pandas as pd
from google.colab import drive
import os
from datetime import datetime
import pytz
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Function to load existing tickers from log.csv
def load_existing_tickers(log_file):
    """Read the tracking log and return each ticker's last-updated date.

    The log holds ``ticker,YYYY-MM-DD`` records, normally below a header row.
    Blank lines are ignored. The first non-blank line is treated as a header
    only when it does not parse as a record, so a log that was written without
    a header does not lose its first ticker.

    Args:
        log_file: Path of the log CSV.

    Returns:
        dict mapping ticker -> ``datetime`` parsed with ``%Y-%m-%d``. Empty
        when the file does not exist. If a ticker occurs several times, the
        last occurrence in the file wins.

    Raises:
        ValueError: if any line after the first non-blank one is not a valid
            ``ticker,YYYY-MM-DD`` record.
    """
    existing_tickers = dict()
    if not os.path.exists(log_file):
        return existing_tickers

    with open(log_file, 'r') as file:
        awaiting_first_line = True
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on unpacking.
                continue
            is_first_line = awaiting_first_line
            awaiting_first_line = False

            ticker, _, last_updated = line.partition(',')
            try:
                parsed = datetime.strptime(last_updated.strip(), '%Y-%m-%d')
            except ValueError:
                if is_first_line:
                    # An unparseable first line is the header row.
                    continue
                raise
            existing_tickers[ticker.strip()] = parsed
    return existing_tickers

# Function to update log.csv with new tickers
def update_log(log_file, new_ticker, last_updated):
    """Append one ``ticker,date`` record to the log, creating it if needed.

    When the log is missing or empty, the ``Ticker,LastUpdated`` header is
    written first. Readers of this log skip the first line as a header, so
    without it the first record ever written would be discarded.

    Args:
        log_file: Path of the log CSV.
        new_ticker: Ticker symbol to record.
        last_updated: Date text to store next to the ticker (written as-is).
    """
    needs_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
    with open(log_file, 'a') as file:
        if needs_header:
            file.write("Ticker,LastUpdated\n")
        file.write(f"{new_ticker},{last_updated}\n")

def _is_log_record(fields):
    """Return True when ``fields`` is a ``[ticker, 'YYYY-MM-DD']`` pair."""
    if len(fields) != 2:
        return False
    try:
        datetime.strptime(fields[1], '%Y-%m-%d')
    except ValueError:
        return False
    return True


def remove_duplicates_from_log(log_file):
    """Rewrite the log so each ticker appears once with its latest date.

    Blank lines are ignored. The first non-blank line is dropped as a header
    only if it is not itself a valid record, so a header-less log keeps its
    first ticker. The file is rewritten with a ``Ticker,LastUpdated`` header
    and tickers in order of first appearance. Does nothing when the file does
    not exist.

    Args:
        log_file: Path of the log CSV.

    Raises:
        ValueError: if a line after the first does not have exactly two
            comma-separated fields.
    """
    if not os.path.exists(log_file):
        return

    with open(log_file, 'r') as file:
        records = [line.strip() for line in file if line.strip()]

    log_data = {}
    for position, record in enumerate(records):
        fields = record.split(',')
        if position == 0 and not _is_log_record(fields):
            continue  # header row
        ticker, last_updated = fields
        log_data.setdefault(ticker, []).append(last_updated)

    with open(log_file, 'w') as file:
        file.write("Ticker,LastUpdated\n")
        for ticker, dates in log_data.items():
            # Dates are zero-padded ISO strings, so the lexicographic maximum
            # is also the chronological maximum.
            latest_date = max(dates)
            file.write(f"{ticker},{latest_date}\n")

def convert_timezone(date, time):
    """Combine a date string and a 12-hour clock time into a naive datetime.

    Only the first whitespace-delimited token of ``date`` is used. It is
    joined to ``time`` with a single space and parsed with the format
    ``%Y-%m-%d %I:%M %p``. Despite the name, no timezone is attached or
    converted.

    Args:
        date: Text whose first token is a ``YYYY-MM-DD`` date.
        time: Clock time such as ``"09:30 AM"``.

    Returns:
        The parsed ``datetime`` object.
    """
    day_token = date.split()[0]
    stamp = " ".join((day_token, time))
    return datetime.strptime(stamp, "%Y-%m-%d %I:%M %p")

def _with_session_times(df, exchange_info):
    """Return ``df`` indexed by Date with Date_Open/Date_Close columns added."""
    # Look the session times up once instead of once per row.
    open_time = str(exchange_info["Open"].iloc[0])
    close_time = str(exchange_info["Close"].iloc[0])
    df = df.reset_index()
    df["Date_Open"] = df["Date"].apply(lambda day: convert_timezone(str(day), open_time))
    df["Date_Close"] = df["Date"].apply(lambda day: convert_timezone(str(day), close_time))
    return df.set_index("Date")


def _save_by_year(df, ticker_directory):
    """Write one ``<year>.csv`` per calendar year of ``df`` into ``ticker_directory``."""
    os.makedirs(ticker_directory, exist_ok=True)
    for year, year_data in df.groupby(df.index.year):
        year_data.to_csv(os.path.join(ticker_directory, f"{year}.csv"))


# Function to fetch and store stock price data
def fetch_stock_data(ticker_list, exchange_list, storage_folder, tracking_directory,
                     trading_hours_file="/content/drive/MyDrive/findata/trading_hours.csv"):
    """Download price history per ticker and store it as one CSV per year.

    Tickers absent from ``log.csv`` are downloaded in full (up to today's
    date passed as ``end``). Tickers already logged are re-downloaded from
    1 January of the year they were last updated, overwriting those year
    files. Both paths add ``Date_Open``/``Date_Close`` columns built from the
    exchange's session times, so every year file has the same columns.

    Args:
        ticker_list: Ticker symbols to fetch.
        exchange_list: Exchange symbol for each ticker, paired by position
            with ``ticker_list`` (surplus entries of the longer list are
            ignored by ``zip``).
        storage_folder: Root folder; files go to
            ``<storage_folder>/<first letter>/<ticker>/<year>.csv``.
        tracking_directory: Folder containing ``log.csv``.
        trading_hours_file: CSV with at least ``Symbol``, ``Open`` and
            ``Close`` columns. Defaults to the path previously hard-coded.
    """
    log_file = os.path.join(tracking_directory, 'log.csv')
    existing_tickers = load_existing_tickers(log_file)
    trading_hours_df = pd.read_csv(trading_hours_file)
    today = datetime.today().strftime('%Y-%m-%d')

    for ticker, exchange in zip(ticker_list, exchange_list):
        # Resolve the exchange before downloading so an unknown symbol is
        # reported clearly instead of failing later with an IndexError.
        exchange_info = trading_hours_df[trading_hours_df["Symbol"] == exchange]
        if exchange_info.empty:
            print(f"No trading hours found for exchange {exchange}. Skipping {ticker}.")
            continue

        is_new_ticker = ticker not in existing_tickers
        if is_new_ticker:
            # Fetch data from Yahoo Finance
            df = yf.download(ticker, end=today)
        else:
            print(f"Data for {ticker} already exists. Updating...")
            # Re-fetch from the start of the last-updated year so that year's
            # file is rewritten completely.
            last_updated = existing_tickers[ticker]
            df = yf.download(ticker, start=f"{last_updated.year}-01-01")

        if df.empty:
            # Do not log a ticker as updated when nothing was downloaded.
            print(f"No data returned for {ticker}. Skipping.")
            continue

        df = _with_session_times(df, exchange_info)
        _save_by_year(df, os.path.join(storage_folder, ticker[0], ticker))

        # Update the log file
        update_log(log_file, ticker, today)
        if is_new_ticker:
            print(f"Data for {ticker} saved to findata folder")
        else:
            print(f"Data for {ticker} updated from {last_updated.strftime('%Y-%m-%d')} to today.")
if __name__ == "__main__":
    # Locations on the mounted Drive: per-ticker price files and the tracking log.
    storage_folder = "/content/drive/MyDrive/findata/eoddata"  # Change this to your desired storage folder
    tracking_directory = "/content/drive/MyDrive/findata/log"
    os.makedirs(storage_folder, exist_ok=True)
    os.makedirs(tracking_directory, exist_ok=True)

    # Tickers are hard-coded for now. To read them from a CSV instead, use
    # pd.read_csv(csv_file_path)['Ticker'].tolist() after checking the path exists.
    ticker_list = ["AAPL", "MSFT", "GOOGL"]
    exchange_list = ["NASDAQ"] * len(ticker_list)
    fetch_stock_data(ticker_list, exchange_list, storage_folder, tracking_directory)

    # Collapse the log to one row per ticker once all downloads are recorded.
    log_file = os.path.join(tracking_directory, 'log.csv')
    remove_duplicates_from_log(log_file)


[*********************100%%**********************]  1 of 1 completed
Data for AAPL saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for MSFT saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for GOOGL saved to findata folder


In [None]:
# Load the trading-hours table from Drive, using the first CSV column as the index.
trading_hours_df = pd.read_csv("/content/drive/MyDrive/findata/trading_hours.csv", index_col=0)
def add_pads(x):
    """Zero-pad the hour of a clock string such as ``"9:30 AM"`` to two digits.

    The text before the first ``:`` must be an integer. Values whose hour is
    already two digits (``"09:30 AM"``, ``"10:00 AM"``) are returned
    unchanged, so applying the function repeatedly is safe.

    Args:
        x: Clock time text in ``H:MM ...`` or ``HH:MM ...`` form.

    Returns:
        The same text with the hour left-padded with ``0`` to width two.

    Raises:
        ValueError: if the hour part is not an integer.
    """
    hour = x.split(':')[0]
    int(hour)  # validation only: reject non-numeric hours
    return hour.zfill(2) + x[len(hour):]
# Normalise both session-time columns through add_pads, one column at a time.
for column in ("Open", "Close"):
    trading_hours_df[column] = trading_hours_df[column].apply(add_pads)

In [None]:
# Persist the padded trading-hours table to Drive.
# NOTE(review): this writes to MyDrive/trading_hours.csv, whereas the table is
# read from MyDrive/findata/trading_hours.csv elsewhere in this notebook.
# Confirm whether the findata copy is meant to be updated.
trading_hours_df.to_csv("/content/drive/MyDrive/trading_hours.csv")

In [None]:
trading_hours_df

Unnamed: 0,Symbol,Exchange,Start Day,End Day,Open,Close,Timezone
0,NYSE,New York Stock Exchange,Mon,Fri,09:30 AM,04:00 PM,America/New_York
1,NASDAQ,NASDAQ Stock Exchange,Mon,Fri,09:30 AM,04:00 PM,America/New_York
2,SSE,Shanghai Stock Exchange,Mon,Fri,09:30 AM,03:00 PM,Asia/Shanghai
3,JPX,Tokyo Stock Exchange,Mon,Fri,09:00 AM,03:00 PM,Asia/Tokyo
4,SZSE,Shenzhen Stock Exchange,Mon,Fri,09:30 AM,02:57 PM,Asia/Shanghai
...,...,...,...,...,...,...,...
137,IEX,Investors Exchange,Mon,Fri,09:30 AM,04:00 PM,America/New_York
138,OTC,OTC Markets U.S.,Mon,Fri,09:30 AM,04:00 PM,America/New_York
139,BVCC,Caracas Stock Exchange,Mon,Fri,09:00 AM,01:00 PM,America/Caracas
140,HNX,Hanoi Stock Exchange,Mon,Fri,09:00 AM,02:30 PM,Asia/Ho_Chi_Minh


In [None]:
# Exploratory check: attach the NASDAQ opening time to each AAPL trading day.
df = yf.download("AAPL", end=datetime.today().strftime('%Y-%m-%d'))

def combine_date_and_time(date, time):
    """Return the first token of ``date`` joined to ``time`` as plain text.

    Deliberately not named ``convert_timezone``. The notebook already defines
    a datetime-returning function of that name which ``fetch_stock_data``
    calls, and redefining it here would silently replace it.
    """
    datetime_str = date.split()[0] + " " + time
    return datetime_str

exchange_info = trading_hours_df[trading_hours_df["Symbol"] == "NASDAQ"]
nasdaq_open = str(exchange_info["Open"].iloc[0])  # looked up once, not per row
df = df.reset_index()
df["Date_Open"] = df["Date"].apply(lambda x: combine_date_and_time(str(x), nasdaq_open))

[*********************100%%**********************]  1 of 1 completed


In [None]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Date_Open
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.099450,469033600,1980-12-12 09:30 AM
1,1980-12-15,0.122210,0.122210,0.121652,0.121652,0.094261,175884800,1980-12-15 09:30 AM
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087343,105728000,1980-12-16 09:30 AM
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089504,86441600,1980-12-17 09:30 AM
4,1980-12-18,0.118862,0.119420,0.118862,0.118862,0.092099,73449600,1980-12-18 09:30 AM
...,...,...,...,...,...,...,...,...
10785,2023-09-25,174.199997,176.970001,174.149994,176.080002,176.080002,46172700,2023-09-25 09:30 AM
10786,2023-09-26,174.820007,175.199997,171.660004,171.960007,171.960007,64588900,2023-09-26 09:30 AM
10787,2023-09-27,172.619995,173.039993,169.050003,170.429993,170.429993,66921800,2023-09-27 09:30 AM
10788,2023-09-28,169.339996,172.029999,167.619995,170.690002,170.690002,56294400,2023-09-28 09:30 AM
