In [1]:
import numpy as np
import pandas as pd

# Notebook-wide configuration: pandas display width and the download window.
pd.options.display.max_columns = 120  # column limit pandas uses when rendering wide frames
START = "2022-01-01"  # lower bound passed as `start=` to the price download
END = "2025-12-18"  # upper bound passed as `end=`; recorded outputs stop at 2025-12-17, so it looks exclusive

In [2]:
# --- Yahoo Finance via yfinance: US tickers (real market data) ---
# ETF universe requested from Yahoo Finance:
#   SPY : S&P 500 index
#   QQQ : Nasdaq-100 index
#   TLT : U.S. Treasury bonds with 20+ year maturity
#   GLD : Physical gold prices
#   EEM : MSCI Emerging Markets index
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]

# yfinance is treated as optional: if the import fails, the problem is reported
# and the notebook carries on with an empty frame instead of stopping.
try:
    import yfinance as yf
except Exception as e:
    yf = None
    print("Could not import yfinance:", type(e).__name__, str(e))

# Start from an empty frame; it is only replaced when the download succeeds.
yh_df = pd.DataFrame()
if yf is not None:
    try:
        yh_df = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    except Exception as e:
        print("yfinance download failed:", type(e).__name__, str(e))

# Convert to long format: date, ticker, close, volume
got_rows = isinstance(yh_df, pd.DataFrame) and len(yh_df) > 0
if not got_rows:
    # Nothing downloaded: keep the expected schema so later cells still run.
    us_mkt = pd.DataFrame(columns=["date", "ticker", "close", "volume"])
else:
    if isinstance(yh_df.columns, pd.MultiIndex):
        # Two-level columns: selecting the top-level field leaves one column per ticker.
        close = yh_df["Close"].copy()
        vol = yh_df["Volume"].copy()
    else:
        # Flat columns: label the single Close/Volume column with the first ticker.
        close = yh_df[["Close"]].rename(columns={"Close": tickers[0]})
        vol = yh_df[["Volume"]].rename(columns={"Volume": tickers[0]})

    # Name the index "date" so reset_index() produces a `date` column.
    for wide in (close, vol):
        wide.index.name = "date"

    # Melt each wide table into (date, ticker, value) rows.
    us_close_long = close.reset_index().melt(id_vars="date", var_name="ticker", value_name="close")
    us_vol_long = vol.reset_index().melt(id_vars="date", var_name="ticker", value_name="volume")

    # Join close and volume on (date, ticker); rows without a close price are dropped.
    us_mkt = pd.merge(us_close_long, us_vol_long, on=["date", "ticker"], how="inner")
    us_mkt = us_mkt.dropna(subset=["close"])

us_mkt.head(), us_mkt.shape

(        date ticker      close    volume
 0 2022-01-03    EEM  44.624966  27572700
 1 2022-01-04    EEM  44.470776  24579500
 2 2022-01-05    EEM  43.745163  46425100
 3 2022-01-06    EEM  43.944710  34288700
 4 2022-01-07    EEM  44.343792  32640900,
 (4970, 4))

### 3.2.2 From NumPy array to Series 

Using `us_mkt`:

1. Filter to `ticker == "SPY"`.
2. Take `close` as a NumPy array.
3. Create a Series indexed by `date` named `SPY_close_series`.
4. Compute the mean/min/max with Series methods.

In [11]:
# 1) Filter to ticker == "SPY"
# Select the SPY rows, make sure `date` is a datetime column, then order chronologically.
is_spy = us_mkt["ticker"].eq("SPY")
spy_df = (
    us_mkt.loc[is_spy]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")  # ascending dates, so later positional steps line up
)
print(spy_df)

           date ticker       close     volume
2982 2022-01-03    SPY  451.875122   72668200
2983 2022-01-04    SPY  451.723785   71178700
2984 2022-01-05    SPY  443.049774  104538900
2985 2022-01-06    SPY  442.633545   86858900
2986 2022-01-07    SPY  440.883575   85111600
...         ...    ...         ...        ...
3971 2025-12-11    SPY  687.139526   86173700
3972 2025-12-12    SPY  679.751404  113160300
3973 2025-12-15    SPY  678.724426   90811000
3974 2025-12-16    SPY  676.869934  122030600
3975 2025-12-17    SPY  669.421936  110625200

[994 rows x 4 columns]


In [7]:
# 2) Take close as a NumPy array
# `.to_numpy()` returns only the values; the pandas index is not carried along.
spy_close_np = spy_df.loc[:, "close"].to_numpy()
print(spy_close_np)

[451.87512207 451.7237854  443.04977417 442.63354492 440.88357544
 440.33499146 444.34564209 445.54696655 439.40795898 439.58764648
 431.80270386 427.31903076 422.58950806 414.29379272 416.05316162
 410.97357178 409.94256592 407.91827393 418.04907227 425.57858276
 428.45422363 432.61624146 422.44769287 424.43414307 423.07196045
 426.55288696 432.79602051 425.02041626 416.63961792 415.27758789
 421.97460938 422.44769287 413.42355347 410.74655151 406.33853149
 399.13070679 405.13726807 414.07620239 413.01681519 406.72637939
 414.2086792  412.14651489 408.79797363 396.74694824 393.73898315
 404.29547119 402.4697876  397.35235596 394.44839478 403.12249756
 412.06137085 417.21670532 421.7862854  421.6630249  426.59713745
 421.10314941 427.45101929 429.5385437  432.59384155 437.94546509
 435.24118042 428.54223633 429.75680542 433.43832397 427.96343994
 423.68405151 425.81900024 424.68041992 417.42163086 415.87496948
 420.63827515 415.40054321 415.57131958 422.27975464 421.9666748
 415.656738

In [9]:
# 3) Create a Series indexed by date named SPY_close_series
# The array carries no labels, so the dates from `spy_df` are attached positionally.
spy_dates = pd.Index(spy_df["date"], name="date")
SPY_close_series = pd.Series(data=spy_close_np, index=spy_dates, name="SPY_close_series")
print(SPY_close_series)

date
2022-01-03    451.875122
2022-01-04    451.723785
2022-01-05    443.049774
2022-01-06    442.633545
2022-01-07    440.883575
                 ...    
2025-12-11    687.139526
2025-12-12    679.751404
2025-12-15    678.724426
2025-12-16    676.869934
2025-12-17    669.421936
Name: SPY_close_series, Length: 994, dtype: float64


In [10]:
# 4) Compute the mean/min/max with Series methods
spy_mean = SPY_close_series.mean()
spy_min = SPY_close_series.min()
spy_max = SPY_close_series.max()
# Display the three statistics together as (mean, min, max).
spy_mean, spy_min, spy_max

(np.float64(485.6085843061297),
 np.float64(341.1820068359375),
 np.float64(687.1395263671875))

### 3.2.3 From Dictionary to Series 

Using `us_mkt`:

1. Compute the **last available close** for each ticker in `tickers`.
2. Store it in a dict `{ticker: last_close}`.
3. Convert to a Series and sort descending.

In [13]:
# 3.2.3 From Dictionary to Series

# Work on a chronologically ordered copy whose `date` column is a datetime,
# so that "last" below means "most recent".
us_mkt_sorted = (
    us_mkt
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
)
# 1) Compute the last available close for each ticker in tickers
close_by_ticker = us_mkt_sorted.groupby("ticker")["close"]
# reindex(tickers) puts the result in the order of the `tickers` list.
last_close_series = close_by_ticker.last().reindex(tickers)
print(last_close_series)

ticker
SPY    669.421936
QQQ    599.637390
TLT     87.459633
GLD    399.290009
EEM     52.599998
Name: close, dtype: float64


In [15]:
# 2) Store it in a dict {ticker: last_close}
# Keys are the ticker labels of the Series; values are stored as plain Python floats.
last_close_dict = {ticker: float(price) for ticker, price in last_close_series.items()}
print(last_close_dict)

{'SPY': 669.4219360351562, 'QQQ': 599.6373901367188, 'TLT': 87.45963287353516, 'GLD': 399.2900085449219, 'EEM': 52.599998474121094}


In [16]:
# 3) Convert to a Series and sort descending
# Dict keys become the index labels; the highest last close comes first.
last_close_ranked = pd.Series(last_close_dict).sort_values(ascending=False)
last_close_ranked

SPY    669.421936
QQQ    599.637390
GLD    399.290009
TLT     87.459633
EEM     52.599998
dtype: float64

### 3.2.4 Series vs NumPy 

Goal: show why pandas alignment matters.

1. Create two Series indexed by date:
   - df mid-rate from `df`
   - SPY close from `us_mkt`
2. Combine them into a DataFrame (pandas aligns on dates).
3. Separately, build two NumPy arrays by truncating to the same length.
4. In markdown: explain why alignment is safer.


### 3.3.6 Dealing with Nulls 
Using `us_mkt`:

1. Copy `us_mkt` to `us_mkt_nan`.
2. Set 1% of `close` to NaN (fixed random seed).
3. Create:
   - `us_drop`: drop NaNs
   - `us_fill`: fill NaNs with ticker-specific median close
4. Compare shapes.

### 3.3.7 Duplicates 

1. Create `dup_df` by stacking the last 5 rows of `us_mkt` twice.
2. Detect duplicates using `.duplicated()`.
3. Remove them using `.drop_duplicates()`.

### 3.3.8 Groupby 


Using `us_mkt`:

1. Group by `ticker` and compute:
   - mean close
   - median close
   - max volume
2. Rename columns clearly.
3. Sort by mean close descending.

### 3.3.9 Reshape 

1. Create a 1-row wide DataFrame with last closes per ticker.
2. Convert it to long format with `melt()` into columns: `ticker`, `last_close`.
3. Pivot `us_mkt` into a wide table: index=`date`, columns=`ticker`, values=`close` (keep first 50 dates).

In [13]:
#Solutions for 3.3.9
# Build a 1-row wide DataFrame holding the last close of every ticker
last_close_by_ticker = (
    us_mkt
    .sort_values("date")         # chronological order, so `.last()` picks the most recent close
    .groupby("ticker")["close"]  # one group per ticker, close column only
    .last()
)
# to_frame() gives a single "close" column indexed by ticker; transposing turns
# the tickers into columns and leaves one row labelled "close".
last_closes = last_close_by_ticker.to_frame().T
last_closes

ticker,EEM,GLD,QQQ,SPY,TLT
close,52.599998,399.290009,599.63739,669.421936,87.459633


In [14]:
# Convert the wide DataFrame to long format
# One output row per ticker: the column label goes to `ticker`, the cell value to `last_close`.
last_closes_long = pd.melt(last_closes, var_name="ticker", value_name="last_close")
last_closes_long

Unnamed: 0,ticker,last_close
0,EEM,52.599998
1,GLD,399.290009
2,QQQ,599.63739
3,SPY,669.421936
4,TLT,87.459633


In [16]:
# Pivot us_mkt into a wide table (date x ticker) keeping first 50 dates
# Rows are dates, columns are tickers, cells are close prices.
close_wide = us_mkt.pivot(index="date", columns="ticker", values="close")
# Order the dates ascending, then keep the first 50 rows.
us_mkt_wide = close_wide.sort_index().iloc[:50]
us_mkt_wide

ticker,EEM,GLD,QQQ,SPY,TLT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03,44.624966,168.330002,391.679443,451.875183,125.295334
2022-01-04,44.470772,169.570007,386.599182,451.723846,124.774353
2022-01-05,43.745167,169.059998,374.722412,443.049744,124.097084
2022-01-06,43.944706,166.990005,374.459137,442.633514,124.418388
2022-01-07,44.343792,167.75,370.402649,440.883545,123.523994
2022-01-10,44.343792,168.259995,370.646423,440.33493,123.827965
2022-01-11,45.368717,170.289993,376.214294,444.345612,124.652779
2022-01-12,46.121532,170.740005,377.706238,445.546906,124.175262
2022-01-13,45.468487,170.160004,368.257446,439.407959,125.277977
2022-01-14,45.450352,169.669998,370.548981,439.587738,123.385139
