In [1]:
import pandas as pd
from pathlib import Path

In [2]:
DATA_ROOT = Path("data")

TRENDS_DATA = DATA_ROOT / "google_trends"
STOCKS_DATA = DATA_ROOT / "market"

In [4]:
from typing import Optional
import logging

def safe_read_csv(file_path)->Optional[pd.DataFrame]:
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        logging.warning(f"Error reading {file_path}: {e}")
        return None

In [5]:
stocks_dfs = [safe_read_csv(f) for f in STOCKS_DATA.glob("*.csv")]
trends_dfs = [safe_read_csv(f) for f in TRENDS_DATA.glob("*.csv")]

stocks_dfs = [df for df in stocks_dfs if df is not None]
trends_dfs = [df for df in trends_dfs if df is not None]

assert all(df is not None for df in stocks_dfs), "Some stock DataFrames failed to load"
assert all(df is not None for df in trends_dfs), "Some trends DataFrames failed to load"

num_stocks_files = len(list(STOCKS_DATA.glob("*.csv")))
num_trends_files = len(list(TRENDS_DATA.glob("*.csv")))

print(f"Loaded {len(stocks_dfs)}/{num_stocks_files} stock data files.")
print(f"Loaded {len(trends_dfs)}/{num_trends_files} trends data files.")


def parse_stock_field(df:pd.DataFrame, field:str)->Optional[pd.Series]:
    # Check if the field exists in the DataFrame
    if field in df.columns:
        df_ = df[field].iloc[2:]
        df_ = df_.reset_index(drop=True)
        df_.name = df.iloc[1][field]
        return df_
    else:
        return None

# verify that all the DataFrames have the same shape or that
volume_dfs = [parse_stock_field(df, 'Volume') for df in stocks_dfs]
price_dfs = [parse_stock_field(df, 'Close') for df in stocks_dfs]



Loaded 103/103 stock data files.
Loaded 579/699 trends data files.


In [17]:
stocks_shapes = pd.Series({df.iloc[0, 1]: df.shape[0] for df in stocks_dfs})
common_length = int(stocks_shapes.value_counts().index[0])

reg_stocks_dfs = [df for df in stocks_dfs if df.shape[0] == common_length]
len(reg_stocks_dfs)

78

In [19]:
common_length

1258

In [18]:
stocks_shapes.value_counts()

1258    75
1257     2
236      1
1185     1
1256     1
120      1
1245     1
245      1
492      1
1103     1
Name: count, dtype: int64

In [None]:
volume_dfs_shapes={df.}