In [76]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 120)
START = "2022-01-01"
END = "2025-12-18"

In [77]:
# --- Yahoo Finance via yfinance: US tickers (real market yh_df) ---
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]
# SPY : S&P 500 index
# QQQ : Nasdaq-100 index
# TLT : U.S. Treasury bonds with 20+ year maturity
# GLD : Physical gold prices
# EEM : MSCI Emerging Markets index

try:
    import yfinance as yf
except Exception as e:
    yf = None
    print("Could not import yfinance:", type(e).__name__, str(e))

if yf is not None:
    try:
        yh_df = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    except Exception as e:
        yh_df = pd.DataFrame()
        print("yfinance download failed:", type(e).__name__, str(e))
else:
    yh_df = pd.DataFrame()

# Convert to long format: date, ticker, close, volume
if isinstance(yh_df, pd.DataFrame) and yh_df.shape[0] > 0:
    if isinstance(yh_df.columns, pd.MultiIndex):
        close = yh_df["Close"].copy()
        vol = yh_df["Volume"].copy()
    else:
        close = yh_df[["Close"]].rename(columns={"Close": tickers[0]})
        vol = yh_df[["Volume"]].rename(columns={"Volume": tickers[0]})

    close.index.name = "date"
    vol.index.name = "date"

    us_close_long = close.reset_index().melt(id_vars="date", var_name="ticker", value_name="close")
    us_vol_long = vol.reset_index().melt(id_vars="date", var_name="ticker", value_name="volume")
    us_mkt = us_close_long.merge(us_vol_long, on=["date","ticker"], how="inner").dropna(subset=["close"])
else:
    us_mkt = pd.DataFrame(columns=["date","ticker","close","volume"])

us_mkt.head(), us_mkt.shape

Could not import yfinance: ModuleNotFoundError No module named 'yfinance'


(Empty DataFrame
 Columns: [date, ticker, close, volume]
 Index: [],
 (0, 4))

### 3.2.2 From NumPy array to Series 

Using `us_mkt`:

1. Filter to `ticker == "SPY"`.
2. Take `close` as a NumPy array.
3. Create a Series indexed by `date` named `SPY_close_series`.
4. Compute the mean/min/max with Series methods.

In [78]:
# 1) Filter to ticker == "SPY"
spy_df = us_mkt[us_mkt["ticker"] == "SPY"].copy()
spy_df["date"] = pd.to_datetime(spy_df["date"])
spy_df = spy_df.sort_values("date") #just to make sure is in order
print(spy_df)

Empty DataFrame
Columns: [date, ticker, close, volume]
Index: []


In [79]:
# 2) Take close as a NumPy array
spy_close_np = spy_df["close"].to_numpy()
print(spy_close_np)

[]


In [80]:
# 3) Create a Series indexed by date named SPY_close_series
SPY_close_series = pd.Series(spy_close_np, index=spy_df["date"], name="SPY_close_series")
print(SPY_close_series)

Series([], Name: SPY_close_series, dtype: object)


In [81]:
# 4) Compute the mean/min/max with Series methods
SPY_close_series.mean(), SPY_close_series.min(), SPY_close_series.max()

(nan, nan, nan)

### 3.2.3 From Dictionary to Series 

Using `us_mkt`:

1. Compute the **last available close** for each ticker in `tickers`.
2. Store it in a dict `{ticker: last_close}`.
3. Convert to a Series and sort descending.

In [82]:
# 3.2.3 From Dictionary to Series

# make sure date is datetime 
us_mkt_sorted = us_mkt.copy()
us_mkt_sorted["date"] = pd.to_datetime(us_mkt_sorted["date"])
us_mkt_sorted = us_mkt_sorted.sort_values("date")
# 1) Compute the last available close for each ticker in tickers
last_close_series = us_mkt_sorted.groupby("ticker")["close"].last().reindex(tickers)
print(last_close_series)

ticker
SPY    NaN
QQQ    NaN
TLT    NaN
GLD    NaN
EEM    NaN
Name: close, dtype: object


In [83]:
# 2) Store it in a dict {ticker: last_close}
last_close_dict = last_close_series.to_dict()
print(last_close_dict)

{'SPY': nan, 'QQQ': nan, 'TLT': nan, 'GLD': nan, 'EEM': nan}


In [84]:
# 3) Convert to a Series and sort descending
pd.Series(last_close_dict).sort_values(ascending=False)

SPY   NaN
QQQ   NaN
TLT   NaN
GLD   NaN
EEM   NaN
dtype: float64

### 3.2.4 Series vs NumPy 

Goal: show why pandas alignment matters.

1. Create two Series indexed by date:
   - df mid-rate from `df`
   - SPY close from `us_mkt`
2. Combine them into a yh_dfFrame (pandas aligns on dates).
3. Separately, build two NumPy arrays by truncating to the same length.
4. In markdown: explain why alignment is safer.


### 3.3.6 Dealing with Nulls 
Using `us_mkt`:

1. Copy `us_mkt` to `us_mkt_nan`.
2. Set 1% of `close` to NaN (fixed random seed).
3. Create:
   - `us_drop`: drop NaNs
   - `us_fill`: fill NaNs with ticker-specific median close
4. Compare shapes.

### 3.3.7 Duplicates 

1. Create `dup_df` by stacking the last 5 rows of `us_mkt` twice.
2. Detect duplicates using `.duplicated()`.
3. Remove them using `.drop_duplicates()`.

In [85]:
# 1. Create dup_df by stacking the last 5 rows of us_mkt twice
dup_df = pd.concat([us_mkt.tail(5), us_mkt.tail(5)], ignore_index=True)
dup_df


Unnamed: 0,date,ticker,close,volume


In [86]:
# 2. Detect duplicates
dup_mask = dup_df.duplicated()
print("Number of duplicated rows:", dup_mask.sum())
dup_df[dup_mask]


Number of duplicated rows: 0


Unnamed: 0,date,ticker,close,volume


In [87]:
# 3. Remove duplicates
dup_df_clean = dup_df.drop_duplicates()
print("Original shape:", dup_df.shape)
print("Clean shape:", dup_df_clean.shape)
dup_df_clean


Original shape: (0, 4)
Clean shape: (0, 4)


Unnamed: 0,date,ticker,close,volume


### 3.3.8 Groupby 


Using `us_mkt`:

1. Group by `ticker` and compute:
   - mean close
   - median close
   - max volume
2. Rename columns clearly.
3. Sort by mean close descending.

In [88]:
# 1. Group by ticker and compute statistics
grp = (
    us_mkt.groupby("ticker")
    .agg(
        mean_close=("close", "mean"),
        median_close=("close", "median"),
        max_volume=("volume", "max")
    )
    .reset_index()
)

grp


Unnamed: 0,ticker,mean_close,median_close,max_volume


In [89]:
# 3. Sort by mean close descending
grp_sorted = grp.sort_values("mean_close", ascending=False)
grp_sorted


Unnamed: 0,ticker,mean_close,median_close,max_volume


### 3.3.9 Reshape 

1. Create a 1-row wide yh_dfFrame with last closes per ticker.
2. Convert it to long format with `melt()` into columns: `ticker`, `last_close`.
3. Pivot `us_mkt` into a wide table: index=`date`, columns=`ticker`, values=`close` (keep first 50 dates).

In [90]:
#Solutions for 3.3.9
# Create a 1-row wide DataFrame with the last close for each ticker
last_closes = (
    us_mkt
    .sort_values("date") # Sort data by date to make sure we keep the most recent observation
    .groupby("ticker", as_index=False) # Group by ticker and take the last available close price
    .last()[["ticker", "close"]] # Keep only ticker and close columns
    .set_index("ticker") 
    .T # Transpose the DataFrame to get a single-row wide format
)
last_closes

ticker
close


In [91]:
# Convert the wide DataFrame to long format
last_closes_long = last_closes.melt( # Convert the wide DataFrame into long format
    var_name="ticker",
    value_name="last_close" # Each row represents the last close of one ticker
)
last_closes_long 

Unnamed: 0,ticker,last_close


In [92]:
# Pivot us_mkt into a wide table (date x ticker) keeping first 50 dates
us_mkt_wide = (
    us_mkt
    .pivot(index="date", columns="ticker", values="close") # Pivot the market data to create a wide table,rows are dates and columns are tickers
    .sort_index() # Sort dates in ascending order
    .head(50) # Keep only the first 50 available dates
)
us_mkt_wide

ticker
date
