In [1]:
import pandas as pd
from pathlib import Path

In [8]:
DATA_ROOT = Path("data")

TRENDS_DATA = DATA_ROOT / "google_trends"
STOCKS_DATA = DATA_ROOT / "market"

In [11]:
list(TRENDS_DATA.glob("*.csv"))

[PosixPath('data/google_trends/Dyson_Vacuum_vs_competitor.csv'),
 PosixPath('data/google_trends/Eli_Lilly_stock_price.csv'),
 PosixPath('data/google_trends/Sonos_Speaker_sales_in_Russia.csv'),
 PosixPath('data/google_trends/Galaxy_Buds_sales_in_Red_Sea.csv'),
 PosixPath('data/google_trends/buy_GSK_shares.csv'),
 PosixPath('data/google_trends/new_AWS_release_date.csv'),
 PosixPath('data/google_trends/Pinterest_response_to_Yemen.csv'),
 PosixPath('data/google_trends/war_in_Iran.csv'),
 PosixPath('data/google_trends/gender_equality_stocks.csv'),
 PosixPath('data/google_trends/income_inequality.csv'),
 PosixPath('data/google_trends/impact_of_retail_sales.csv'),
 PosixPath('data/google_trends/Roomba_sales_in_West_Bank.csv'),
 PosixPath('data/google_trends/Walmart_earnings_report.csv'),
 PosixPath('data/google_trends/Elon_Musk_on_cancel_culture.csv'),
 PosixPath('data/google_trends/Unity_layoffs.csv'),
 PosixPath('data/google_trends/Andrew_Tate.csv'),
 PosixPath('data/google_trends/Iraq.csv'

In [20]:
from typing import Optional
import logging

def safe_read_csv(file_path)->Optional[pd.DataFrame]:
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        logging.warning(f"Error reading {file_path}: {e}")
        return None

In [48]:
stocks_dfs = [safe_read_csv(f) for f in STOCKS_DATA.glob("*.csv")]
trends_dfs = [safe_read_csv(f) for f in TRENDS_DATA.glob("*.csv")]

stocks_dfs = [df for df in stocks_dfs if df is not None]
trends_dfs = [df for df in trends_dfs if df is not None]

assert all(df is not None for df in stocks_dfs), "Some stock DataFrames failed to load"
assert all(df is not None for df in trends_dfs), "Some trends DataFrames failed to load"

num_stocks_files = len(list(STOCKS_DATA.glob("*.csv")))
num_trends_files = len(list(TRENDS_DATA.glob("*.csv")))

print(f"Loaded {len(stocks_dfs)}/{num_stocks_files} stock data files.")
print(f"Loaded {len(trends_dfs)}/{num_trends_files} trends data files.")


def parse_stock_field(df:pd.DataFrame, field:str)->Optional[pd.Series]:
    # Check if the field exists in the DataFrame
    if field in df.columns:
        df_ = df[field].iloc[2:]
        df_ = df_.reset_index(drop=True)
        df_.name = df.iloc[1][field]
        return df_
    else:
        return None

# verify that all the DataFrames have the same shape or that
volume_dfs = [parse_stock_field(df, 'Volume') for df in stocks_dfs]
price_dfs = [parse_stock_field(df, 'Close') for df in stocks_dfs]



Loaded 26/26 stock data files.
Loaded 579/699 trends data files.


In [50]:
for df in volume_dfs:
    assert df.shape == volume_dfs[0].shape, f"Volume DataFrames have different shapes: {df.shape} vs {volume_dfs[0].shape} with {df.name}"

AssertionError: Volume DataFrames have different shapes: (1256,) vs (1255,) with nan

In [53]:
stocks_dfs[0]

Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,RSX,RSX,RSX,RSX,RSX
1,Date,,,,,
2,2020-08-21,18.217975616455078,18.23410522466204,18.03248973668729,18.0728122190006,4314134
3,2020-08-24,18.411527633666992,18.492174142338037,18.347010734370997,18.443786852417094,3035633
4,2020-08-25,18.274429321289062,18.35507583124388,18.15346032545896,18.35507583124388,4704345
...,...,...,...,...,...,...
1252,2025-08-14,5.61870002746582,5.61870002746582,5.61870002746582,5.61870002746582,0
1253,2025-08-15,5.61870002746582,5.61870002746582,5.61870002746582,5.61870002746582,0
1254,2025-08-18,5.61870002746582,5.61870002746582,5.61870002746582,5.61870002746582,0
1255,2025-08-19,5.61870002746582,5.61870002746582,5.61870002746582,5.61870002746582,0


In [58]:
stocks_shapes = {df.iloc[0, 1]: df.shape[0] for df in stocks_dfs}

In [60]:
pd.Series(stocks_shapes).value_counts()

1258    11
1257     2
240      1
283      1
245      1
1099     1
Name: count, dtype: int64

In [None]:
volume_dfs_shapes={df.}