# Securities Master Database
The aim of this code is to maintain a database of OHLC data from different sources. Tick-level data will be explored in the future. Data is stored on Google Drive in CSV format.

In [13]:
import yfinance as yf
import pandas as pd
from google.colab import drive
import os
from datetime import datetime
import pytz
# Mount Google Drive at /content/drive; every data path used below
# (/content/drive/MyDrive/findata/...) lives under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [14]:
# Function to load existing tickers from log.csv
def load_existing_tickers(log_file):
    """Read the tracking log and return each ticker's last-updated date.

    Every data line is expected to look like ``<ticker>,<YYYY-MM-DD>``.
    Blank lines are ignored.  The first non-blank line is treated as a
    header (and skipped) only when it does not parse as a data line, so a
    log that was created without a header does not lose its first entry.
    When a ticker appears more than once, the most recent date is kept.

    Args:
        log_file: Path to the CSV tracking log.

    Returns:
        dict mapping ticker -> ``datetime`` of its last update.  Empty when
        ``log_file`` does not exist.

    Raises:
        ValueError: If a line other than the header is malformed.
    """
    if not os.path.exists(log_file):
        return {}

    existing_tickers = {}
    with open(log_file, 'r') as file:
        first_content_line = True
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on unpack

            may_be_header = first_content_line
            first_content_line = False
            try:
                ticker, last_updated = line.split(',')
                updated_on = datetime.strptime(last_updated, '%Y-%m-%d')
            except ValueError:
                if may_be_header:
                    continue  # e.g. "Ticker,LastUpdated"
                raise

            previous = existing_tickers.get(ticker)
            if previous is None or updated_on > previous:
                existing_tickers[ticker] = updated_on
    return existing_tickers  # Return {ticker: last_updated date}

# Function to update log.csv with new tickers and new updated dates
def update_log(log_file, new_ticker, last_updated):
    """Append a ``<ticker>,<date>`` line to the tracking log.

    When the log does not exist yet (or is empty) the ``Ticker,LastUpdated``
    header is written first.  The readers of this log skip a leading header
    line, so a log started without one would lose its first entry.

    Args:
        log_file: Path to the CSV tracking log.
        new_ticker: Ticker symbol to record.
        last_updated: Date of the update, written verbatim
            (callers pass a ``YYYY-MM-DD`` string).
    """
    needs_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
    with open(log_file, 'a') as file:
        if needs_header:
            file.write("Ticker,LastUpdated\n")
        file.write(f"{new_ticker},{last_updated}\n")

# Remove duplicates from log file, as update_log will create duplicate entries
def remove_duplicates_from_log(log_file):
    """Rewrite the tracking log so each ticker appears once, with its latest date.

    Dates are compared as strings, which orders ``YYYY-MM-DD`` values
    chronologically.  Blank lines are ignored.  The first non-blank line is
    dropped as a header only when its last field is not a ``YYYY-MM-DD``
    date, so a log that was started without a header keeps its first entry.
    Tickers are written back in the order they first appear.

    Args:
        log_file: Path to the CSV tracking log.  Nothing happens when the
            file does not exist.

    Raises:
        ValueError: If a data line does not have exactly two comma-separated
            fields.
    """
    if not os.path.exists(log_file):
        return

    latest_by_ticker = {}
    with open(log_file, 'r') as file:
        first_content_line = True
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on unpack

            if first_content_line:
                first_content_line = False
                try:
                    datetime.strptime(line.split(',')[-1], '%Y-%m-%d')
                except ValueError:
                    continue  # header line such as "Ticker,LastUpdated"

            ticker, last_updated = line.split(',')
            if ticker not in latest_by_ticker or last_updated > latest_by_ticker[ticker]:
                latest_by_ticker[ticker] = last_updated

    with open(log_file, 'w') as file:
        file.write("Ticker,LastUpdated\n")
        for ticker, latest_date in latest_by_ticker.items():
            file.write(f"{ticker},{latest_date}\n")

# NOTE: this function looks incomplete - despite its name it performs no
# timezone conversion; it only merges a date and a clock time.
def convert_timezone(date, time):
    """Combine a date string and a 12-hour clock time into a naive datetime.

    Args:
        date: String whose first whitespace-separated token is a
            ``YYYY-MM-DD`` date (anything after it is ignored).
        time: Clock time formatted like ``"9:30 AM"``.

    Returns:
        A ``datetime`` without tzinfo for that date and time.
    """
    day_part = date.split()[0]
    return datetime.strptime(f"{day_part} {time}", "%Y-%m-%d %I:%M %p")

def _add_session_times(df, open_time, close_time):
    """Return ``df`` indexed by Date with Date_Open / Date_Close columns added."""
    df = df.reset_index()
    df["Date_Open"] = df["Date"].apply(lambda day: convert_timezone(str(day), open_time))
    df["Date_Close"] = df["Date"].apply(lambda day: convert_timezone(str(day), close_time))
    return df.set_index("Date")


def _save_by_year(df, ticker_directory):
    """Write one ``<year>.csv`` per calendar year of ``df`` into ``ticker_directory``."""
    os.makedirs(ticker_directory, exist_ok=True)
    for year, year_data in df.groupby(df.index.year):
        year_data.to_csv(os.path.join(ticker_directory, f"{year}.csv"))


# Function to fetch and store stock price data from yahoo finance, and add the open, close time of the security
def fetch_yahoo_data(ticker_list, exchange_list, storage_folder, tracking_directory,
                     trading_hours_file="/content/drive/MyDrive/findata/trading_hours.csv"):
    """Download price data for each ticker and store it as one CSV per year.

    For a ticker that is not in the tracking log, the download is requested
    with only an end date (today).  For a ticker already in the log, the
    download starts on 1 January of the year it was last updated, because
    each yearly CSV is rewritten in full.  In both cases ``Date_Open`` and
    ``Date_Close`` columns are added from the exchange's trading hours, so
    new and refreshed files share the same columns.

    Files are written to ``<storage_folder>/<first letter>/<ticker>/<year>.csv``
    and a ``<ticker>,<today>`` line is appended to
    ``<tracking_directory>/log.csv`` for every ticker that was stored.

    A ticker is skipped (with a printed message, and without touching the
    log) when the download returns no rows or when its exchange has no row
    in the trading-hours table, so one bad ticker does not abort the batch.

    Args:
        ticker_list: Yahoo Finance ticker symbols.
        exchange_list: Exchange symbol for each ticker, in the same order.
        storage_folder: Root folder for the per-ticker CSV files.
        tracking_directory: Folder containing ``log.csv``.
        trading_hours_file: CSV with ``Symbol``, ``Open`` and ``Close``
            columns giving each exchange's session times.
    """
    log_file = os.path.join(tracking_directory, 'log.csv')
    existing_tickers = load_existing_tickers(log_file)
    trading_hours_df = pd.read_csv(trading_hours_file)

    for ticker, exchange in zip(ticker_list, exchange_list):
        last_updated = existing_tickers.get(ticker)
        is_new = last_updated is None

        if is_new:
            # Fetch data from Yahoo Finance
            df = yf.download(ticker, end=datetime.today().strftime('%Y-%m-%d'))
        else:
            print(f"Data for {ticker} already exists. Updating...")
            # Start at the beginning of the last-updated year so that year's
            # file can be overwritten with a complete set of rows.
            df = yf.download(ticker, start=f"{last_updated.year}-01-01")

        if df.empty:
            print(f"No data returned for {ticker}. Skipping.")
            continue

        # Look the exchange up once per ticker rather than once per row.
        exchange_info = trading_hours_df[trading_hours_df["Symbol"] == exchange]
        if exchange_info.empty:
            print(f"No trading hours found for exchange {exchange}. Skipping {ticker}.")
            continue
        open_time = str(exchange_info["Open"].iloc[0])
        close_time = str(exchange_info["Close"].iloc[0])

        df = _add_session_times(df, open_time, close_time)

        # One folder per ticker, grouped under its first letter
        _save_by_year(df, os.path.join(storage_folder, ticker[0], ticker))

        # Update the log file
        update_log(log_file, ticker, datetime.today().strftime('%Y-%m-%d'))
        if is_new:
            print(f"Data for {ticker} saved to findata folder")
        else:
            print(f"Data for {ticker} updated from {last_updated.strftime('%Y-%m-%d')} to today.")



In [15]:
if __name__ == "__main__":
    # Root folder for the per-ticker yearly CSV files.
    storage_folder = "/content/drive/MyDrive/findata/OHLC_yahoo"  # Change this to your desired storage folder
    os.makedirs(storage_folder, exist_ok=True)

    # Create a directory to store the tracking file
    tracking_directory = "/content/drive/MyDrive/findata/log"
    os.makedirs(tracking_directory, exist_ok=True)

    csv_file_path = "/content/drive/MyDrive/findata/ticker_master.csv"

    if not os.path.exists(csv_file_path):
        # Without the master list there is nothing to fetch; stop here
        # instead of falling through to a NameError on ticker_list.
        print("CSV file not found.")
    else:
        # Read tickers from the CSV file; only rows 99..1799 are processed
        # in this run.
        tickers = pd.read_csv(csv_file_path)
        batch = slice(99, 1800)
        ticker_list = tickers['Ticker_yahoo'].tolist()[batch]
        exchange_list = tickers['Exchange'].tolist()[batch]

        fetch_yahoo_data(ticker_list, exchange_list, storage_folder, tracking_directory)

        # Run the function to remove duplicates from the log file
        log_file = os.path.join(tracking_directory, 'log.csv')
        remove_duplicates_from_log(log_file)

[*********************100%%**********************]  1 of 1 completed
Data for AEIS saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AEMD saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AENT saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AENTW saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AEP saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AEY saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AEYE saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AEZS saved to findata folder
[*********************100%%**********************]  1 of 1 completed
Data for AFAR saved to findata folder
[*********************100%%**********