In [None]:
from pathlib import Path

import pandas as pd
import polars as pl
import yfinance as yf
from pandas_datareader import data as pdr
# pandas_datareader support was deprecated and later removed from yfinance, so
# only apply the override when the installed yfinance still provides it. The
# cells visible in this notebook call yf.download directly and do not need it.
if hasattr(yf, "pdr_override"):
    yf.pdr_override()

# Get data from Yahoo

In [None]:
# Symbol to fetch; later cells reuse `ticker` to build the output file names.
ticker = 'AMD'

# Request daily bars from 2010-01-01 onward as a pandas DataFrame.
df = yf.download(
    ticker,
    start='2010-01-01',
    interval='1d',
)

# Save to disk

In [None]:
# pandas does not create missing folders, so make sure the output directory
# exists before writing; otherwise a fresh checkout fails on this cell.
csv_path = Path('data/daily') / f'{ticker}.csv'
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path)

In [None]:
# Number of rows in the downloaded frame.
df.shape[0]

## The CSV file is roughly 375 KB for the dates up to August 4, 2023

In [None]:
# Show each file in the data folder with its size in bytes. Pure Python, so the
# cell runs on any OS (the `!dir` shell command only works on Windows).
{p.name: p.stat().st_size for p in sorted(Path('data/daily').iterdir())}

## Instead, save as parquet

In [None]:
# Write the same pandas frame again, this time in Parquet format.
df.to_parquet(path=f'data/daily/{ticker}.parquet')

In [None]:
# Show each file in the data folder with its size in bytes. Pure Python, so the
# cell runs on any OS (the `!dir` shell command only works on Windows).
{p.name: p.stat().st_size for p in sorted(Path('data/daily').iterdir())}

# Let's switch to polars and clean up the columns

In some instances, yfinance can return a float for the volume. This can be a problem if you intend to search multiple files (as we will) and the volume is a float in one file but an int in another. Also, searching multiple files is easier if we add the symbol as a column.

In [None]:
# Convert the pandas frame to polars. reset_index() turns the index into a
# regular column so that "Date" can be referenced below.
df_pl = pl.from_pandas(df.reset_index()).with_columns(
    # keep only the calendar date
    pl.col("Date").cast(pl.Date),
    # force an integer volume so every file ends up with the same dtype
    pl.col("Volume").cast(pl.Int64),
    # tag every row with its symbol for multi-file queries
    pl.lit(ticker).alias("Symbol"),
)

In [None]:
# Preview the first five rows of the cleaned polars frame.
df_pl.head(n=5)

In [None]:
# Write the cleaned polars frame to the same path the pandas cell used for
# this ticker.
df_pl.write_parquet(file=f'data/daily/{ticker}.parquet')

In [None]:
# Show each file in the data folder with its size in bytes. Pure Python, so the
# cell runs on any OS (the `!dir` shell command only works on Windows).
{p.name: p.stat().st_size for p in sorted(Path('data/daily').iterdir())}

## Let's do it again, but as a single chained expression

In [None]:
ticker = 'TLT'
# Download, clean and save in one chain: pandas download -> polars frame ->
# typed columns -> parquet file named after the ticker.
(
    yf.download(ticker, start='2010-01-01', interval='1d', progress=False)
    .reset_index()
    .pipe(pl.from_pandas)
    .with_columns(
        # keep only the calendar date
        pl.col("Date").cast(pl.Date),
        # force an integer volume so every file ends up with the same dtype
        pl.col("Volume").cast(pl.Int64),
        # tag every row with its symbol for multi-file queries
        pl.lit(ticker).alias("Symbol"),
    )
    .write_parquet(f'data/daily/{ticker}.parquet')
)

In [None]:
# Show each file in the data folder with its size in bytes. Pure Python, so the
# cell runs on any OS (the `!dir` shell command only works on Windows).
{p.name: p.stat().st_size for p in sorted(Path('data/daily').iterdir())}

In [None]:
# First and last available date per symbol, read lazily from every parquet
# file in the data folder.
daily_lf = pl.scan_parquet('data/daily/*.parquet')

# polars 0.19 renamed `groupby` to `group_by` and later releases dropped the
# old name; use whichever this installation provides so the cell runs on both.
group_rows = daily_lf.group_by if hasattr(daily_lf, "group_by") else daily_lf.groupby

(group_rows(['Symbol'])
    .agg(
        pl.col("Date").min().alias("First Date"),
        pl.col("Date").max().alias("Last Date")
    )
    # group output order is otherwise unspecified; sort for a stable display
    .sort("Symbol")
    .collect()
)

In [None]:
ticker = 'SNOW'
# Download, clean and save in one chain: pandas download -> polars frame ->
# typed columns -> parquet file named after the ticker.
(
    yf.download(ticker, start='2010-01-01', interval='1d', progress=False)
    .reset_index()
    .pipe(pl.from_pandas)
    .with_columns(
        # keep only the calendar date
        pl.col("Date").cast(pl.Date),
        # force an integer volume so every file ends up with the same dtype
        pl.col("Volume").cast(pl.Int64),
        # tag every row with its symbol for multi-file queries
        pl.lit(ticker).alias("Symbol"),
    )
    .write_parquet(f'data/daily/{ticker}.parquet')
)

In [None]:
# First and last available date per symbol, read lazily from every parquet
# file in the data folder.
daily_lf = pl.scan_parquet('data/daily/*.parquet')

# polars 0.19 renamed `groupby` to `group_by` and later releases dropped the
# old name; use whichever this installation provides so the cell runs on both.
group_rows = daily_lf.group_by if hasattr(daily_lf, "group_by") else daily_lf.groupby

(group_rows(['Symbol'])
    .agg(
        pl.col("Date").min().alias("First Date"),
        pl.col("Date").max().alias("Last Date")
    )
    # group output order is otherwise unspecified; sort for a stable display
    .sort("Symbol")
    .collect()
)