In [14]:
import numpy as np
import pandas as pd

# Notebook-wide configuration: show up to 120 columns when displaying frames,
# and fix the date window passed to the market-data download.
pd.options.display.max_columns = 120
START, END = "2022-01-01", "2025-12-18"

In [15]:
# --- Yahoo Finance via yfinance: US tickers (real market data) ---
# ETFs requested from Yahoo Finance:
#   SPY : S&P 500 index
#   QQQ : Nasdaq-100 index
#   TLT : U.S. Treasury bonds with 20+ year maturity
#   GLD : Physical gold prices
#   EEM : MSCI Emerging Markets index
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]

# yfinance is treated as optional: if the import fails, report the reason and
# continue with `yf = None` so the cell still produces an (empty) `us_mkt`.
try:
    import yfinance as yf
except Exception as e:
    yf = None
    print("Could not import yfinance:", type(e).__name__, str(e))

# Default to an empty frame; it is only replaced when the download succeeds.
yh_df = pd.DataFrame()
if yf is not None:
    try:
        yh_df = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    except Exception as e:
        print("yfinance download failed:", type(e).__name__, str(e))

# Convert to long format: date, ticker, close, volume
if not (isinstance(yh_df, pd.DataFrame) and yh_df.shape[0] > 0):
    # Nothing was downloaded: keep the expected schema with zero rows.
    us_mkt = pd.DataFrame(columns=["date", "ticker", "close", "volume"])
else:
    if isinstance(yh_df.columns, pd.MultiIndex):
        # Two-level columns: selecting the top-level key yields one column per ticker.
        close, vol = yh_df["Close"].copy(), yh_df["Volume"].copy()
    else:
        # Flat columns: label the single Close/Volume column with the first ticker.
        close = yh_df[["Close"]].rename(columns={"Close": tickers[0]})
        vol = yh_df[["Volume"]].rename(columns={"Volume": tickers[0]})

    # Name the index so that reset_index() produces a "date" column.
    close.index.name = "date"
    vol.index.name = "date"

    us_close_long = pd.melt(
        close.reset_index(), id_vars="date", var_name="ticker", value_name="close"
    )
    us_vol_long = pd.melt(
        vol.reset_index(), id_vars="date", var_name="ticker", value_name="volume"
    )
    # Join prices and volumes on (date, ticker) and drop rows without a close.
    us_mkt = pd.merge(
        us_close_long, us_vol_long, on=["date", "ticker"], how="inner"
    ).dropna(subset=["close"])

us_mkt.head(), us_mkt.shape

(        date ticker      close    volume
 0 2022-01-03    EEM  44.624966  27572700
 1 2022-01-04    EEM  44.470776  24579500
 2 2022-01-05    EEM  43.745163  46425100
 3 2022-01-06    EEM  43.944710  34288700
 4 2022-01-07    EEM  44.343792  32640900,
 (4970, 4))

### 3.2.2 From NumPy array to Series 

Using `us_mkt`:

1. Filter to `ticker == "SPY"`.
2. Take `close` as a NumPy array.
3. Create a Series indexed by `date` named `SPY_close_series`.
4. Compute the mean/min/max with Series methods.

In [16]:
# 3.2.2 From NumPy array to Series

# 1-2) Keep the SPY rows (date and close only) and sort them chronologically
spy_df = (
    us_mkt.loc[us_mkt["ticker"] == "SPY", ["date", "close"]]
    .copy()
    .sort_values(by="date")
)

# 3) Extract close as a NumPy array
spy_close_np = spy_df["close"].to_numpy()

# 4) Build a Series indexed by date, named SPY_close_series
SPY_close_series = pd.Series(
    spy_close_np,
    index=spy_df["date"].to_numpy(),  # dates become the index
    name="SPY_close_series",
)

# 5) Summary statistics computed with Series methods
spy_mean, spy_min, spy_max = (
    SPY_close_series.mean(),
    SPY_close_series.min(),
    SPY_close_series.max(),
)

SPY_close_series.head(), (spy_mean, spy_min, spy_max)


(2022-01-03    451.875153
 2022-01-04    451.723785
 2022-01-05    443.049744
 2022-01-06    442.633575
 2022-01-07    440.883575
 Name: SPY_close_series, dtype: float64,
 (485.6085836613922, 341.18206787109375, 687.1395263671875))

### 3.2.3 From Dictionary to Series 

Using `us_mkt`:

1. Compute the **last available close** for each ticker in `tickers`.
2. Store it in a dict `{ticker: last_close}`.
3. Convert to a Series and sort descending.

In [17]:
# 1) Last available close per ticker: order rows by ticker and date, then take
#    the final close within each ticker group.
df_sorted = us_mkt.sort_values(by=["ticker", "date"])
last_close_by_ticker = (
    df_sorted
    .groupby("ticker")["close"]
    .last()
)
last_close_by_ticker

ticker
EEM     52.599998
GLD    399.290009
QQQ    599.637390
SPY    669.421936
TLT     87.459633
Name: close, dtype: float64

In [18]:
# 2) Store the result as a plain dict {ticker: last_close}
last_close_by_ticker_dict = dict(last_close_by_ticker.items())
last_close_by_ticker_dict

{'EEM': 52.599998474121094,
 'GLD': 399.2900085449219,
 'QQQ': 599.6373901367188,
 'SPY': 669.4219360351562,
 'TLT': 87.45963287353516}

In [19]:
# 3) Turn the dict back into a Series and order it from highest to lowest close
last_close_by_ticker_series = (
    pd.Series(last_close_by_ticker_dict)
    .sort_values(ascending=False)
)
last_close_by_ticker_series

SPY    669.421936
QQQ    599.637390
GLD    399.290009
TLT     87.459633
EEM     52.599998
dtype: float64

### 3.2.4 Series vs NumPy 

Goal: show why pandas alignment matters.

1. Create two Series indexed by date:
   - Mid-rate (`mid_rate`) from `df`
   - SPY close from `us_mkt`
2. Combine them into a DataFrame (pandas aligns on dates).
3. Separately, build two NumPy arrays by truncating to the same length.
4. In markdown: explain why alignment is safer.


In [20]:
# ---- Build df (mid_rate) so that 3.2.4 can be carried out ----
# Take the SPY dates and keep every third one to simulate a different calendar.

spy_dates = us_mkt.loc[us_mkt["ticker"].eq("SPY"), "date"].drop_duplicates().sort_values()

# Every third SPY date: fewer dates than the SPY series itself.
every_third_date = spy_dates.iloc[::3]

df = pd.DataFrame(
    {
        "date": every_third_date.to_numpy(),
        # Artificial mid_rate: evenly spaced values from 3.0 to 5.0
        "mid_rate": np.linspace(3.0, 5.0, num=len(every_third_date)),
    }
)

df.head(), df.shape


(        date  mid_rate
 0 2022-01-03  3.000000
 1 2022-01-06  3.006042
 2 2022-01-11  3.012085
 3 2022-01-14  3.018127
 4 2022-01-20  3.024169,
 (332, 2))

In [21]:
# 3.2.4 Series vs NumPy

# --- 1) mid-rate Series built from df ---
# Look for one of the usual column names for the mid rate.
mid_col = pd.Index(["mid", "mid_rate", "mid-rate"]).intersection(df.columns)
if not len(mid_col):
    raise KeyError("No encuentro columna mid/mid_rate/mid-rate en df. Revisa el nombre de la columna.")
mid_col = mid_col[0]

# Keep date + mid column, discard rows without a date, order chronologically.
df_mid = (
    df[["date", mid_col]]
    .copy()
    .dropna(subset=["date"])
    .sort_values(by="date")
)

mid_series = pd.Series(
    df_mid[mid_col].to_numpy(), index=df_mid["date"].to_numpy(), name="mid_rate"
)

# --- 2) SPY Series built from us_mkt ---
spy_df = us_mkt.loc[us_mkt["ticker"] == "SPY", ["date", "close"]].copy()
spy_df = spy_df.sort_values(by="date")

spy_series = pd.Series(
    spy_df["close"].to_numpy(), index=spy_df["date"].to_numpy(), name="SPY_close"
)

# --- 3) Combine into a DataFrame: pandas aligns on the index (date) ---
aligned_df = pd.concat((mid_series, spy_series), axis=1)

# --- 4) NumPy version: truncate to a common length (does NOT align by date) ---
n = min(len(mid_series), len(spy_series))
mid_np, spy_np = mid_series.to_numpy()[:n], spy_series.to_numpy()[:n]
# Positional pairing only, for a "blind" side-by-side comparison
np_side_by_side = np.column_stack((mid_np, spy_np))

aligned_df.head(), aligned_df.shape, np_side_by_side[:5]


(            mid_rate   SPY_close
 2022-01-03  3.000000  451.875153
 2022-01-04       NaN  451.723785
 2022-01-05       NaN  443.049744
 2022-01-06  3.006042  442.633575
 2022-01-07       NaN  440.883575,
 (994, 2),
 array([[  3.        , 451.87515259],
        [  3.0060423 , 451.7237854 ],
        [  3.01208459, 443.04974365],
        [  3.01812689, 442.63357544],
        [  3.02416918, 440.88357544]]))

### 3.3.6 Dealing with Nulls 
Using `us_mkt`:

1. Copy `us_mkt` to `us_mkt_nan`.
2. Set 1% of `close` to NaN (fixed random seed).
3. Create:
   - `us_drop`: drop NaNs
   - `us_fill`: fill NaNs with ticker-specific median close
4. Compare shapes.

In [22]:
# 3.3.6 Dealing with Nulls

# 1) Work on a copy so us_mkt itself is left untouched
us_mkt_nan = us_mkt.copy()

# 2) Blank out 1% of the close values, using a fixed seed
np.random.seed(42)
n_nan = int(0.01 * len(us_mkt_nan))  # 1% of the rows, rounded down
nan_idx = np.random.choice(us_mkt_nan.index.to_numpy(), size=n_nan, replace=False)
us_mkt_nan.loc[nan_idx, "close"] = np.nan

# 3a) Drop rows whose close is NaN (other columns are not considered)
us_drop = us_mkt_nan.dropna(subset=["close"])

# 3b) Fill NaNs with the ticker-specific median close; transform() returns one
#     value per row, aligned with the index of us_mkt_nan
ticker_median_close = us_mkt_nan.groupby("ticker")["close"].transform("median")
us_fill = us_mkt_nan.assign(close=us_mkt_nan["close"].fillna(ticker_median_close))

# 4) Compare shapes and the NaN counts before and after filling
(
    us_mkt.shape,
    us_mkt_nan.shape,
    us_drop.shape,
    us_fill.shape,
    us_mkt_nan["close"].isna().sum(),
    us_fill["close"].isna().sum(),
)


((4970, 4), (4970, 4), (4921, 4), (4970, 4), 49, 0)

### 3.3.7 Duplicates 

1. Create `dup_df` by stacking the last 5 rows of `us_mkt` twice.
2. Detect duplicates using `.duplicated()`.
3. Remove them using `.drop_duplicates()`.

In [23]:
# 1) Stack the last 5 rows of us_mkt on top of themselves, renumbering the index
last5 = us_mkt.iloc[-5:]
dup_df = pd.concat([last5] * 2, ignore_index=True)
dup_df

Unnamed: 0,date,ticker,close,volume
0,2025-12-11,TLT,87.848114,26778700
1,2025-12-12,TLT,87.001404,47030100
2,2025-12-15,TLT,87.06118,28611800
3,2025-12-16,TLT,87.539314,41018700
4,2025-12-17,TLT,87.459633,24668300
5,2025-12-11,TLT,87.848114,26778700
6,2025-12-12,TLT,87.001404,47030100
7,2025-12-15,TLT,87.06118,28611800
8,2025-12-16,TLT,87.539314,41018700
9,2025-12-17,TLT,87.459633,24668300


In [24]:
# 2) Flag rows that repeat an earlier row (the first occurrence is not flagged)
detected_duplicates = dup_df.duplicated(keep="first")
detected_duplicates

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [25]:
# 3) Remove the repeated rows, keeping the first occurrence of each
drop_dup_df = dup_df.drop_duplicates(keep="first")
drop_dup_df

Unnamed: 0,date,ticker,close,volume
0,2025-12-11,TLT,87.848114,26778700
1,2025-12-12,TLT,87.001404,47030100
2,2025-12-15,TLT,87.06118,28611800
3,2025-12-16,TLT,87.539314,41018700
4,2025-12-17,TLT,87.459633,24668300


### 3.3.8 Groupby 


Using `us_mkt`:

1. Group by `ticker` and compute:
   - mean close
   - median close
   - max volume
2. Rename columns clearly.
3. Sort by mean close descending.

In [26]:
# 1) Per-ticker summary: mean and median of close, maximum of volume
grp = us_mkt.groupby("ticker").agg(
    mean_close=pd.NamedAgg(column="close", aggfunc="mean"),
    median_close=pd.NamedAgg(column="close", aggfunc="median"),
    max_volume=pd.NamedAgg(column="volume", aggfunc="max"),
)
grp

Unnamed: 0_level_0,mean_close,median_close,max_volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EEM,40.46235,39.107409,134225700
GLD,220.130422,187.864998,62025000
QQQ,411.276966,400.214523,198685800
SPY,485.608584,460.739975,256611400
TLT,91.395622,88.549713,131353500


In [27]:
# 2) Give the summary columns human-readable labels
grp_renamed = grp.rename(
    columns=dict(
        mean_close="mean close",
        median_close="median close",
        max_volume="highest volume",
    )
)
grp_renamed

Unnamed: 0_level_0,mean close,median close,highest volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EEM,40.46235,39.107409,134225700
GLD,220.130422,187.864998,62025000
QQQ,411.276966,400.214523,198685800
SPY,485.608584,460.739975,256611400
TLT,91.395622,88.549713,131353500


In [28]:
# 3) Order tickers from highest to lowest mean close
grp_renamed_sorted = grp_renamed.sort_values(by="mean close", ascending=False)
grp_renamed_sorted

Unnamed: 0_level_0,mean close,median close,highest volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SPY,485.608584,460.739975,256611400
QQQ,411.276966,400.214523,198685800
GLD,220.130422,187.864998,62025000
TLT,91.395622,88.549713,131353500
EEM,40.46235,39.107409,134225700


### 3.3.9 Reshape 

1. Create a 1-row wide DataFrame with last closes per ticker.
2. Convert it to long format with `melt()` into columns: `ticker`, `last_close`.
3. Pivot `us_mkt` into a wide table: index=`date`, columns=`ticker`, values=`close` (keep first 50 dates).

In [29]:
# 1) Display the per-ticker last-close Series (computed in section 3.2.3);
#    it is the input for the 1-row wide frame built in the next cell.
last_close_by_ticker

ticker
EEM     52.599998
GLD    399.290009
QQQ    599.637390
SPY    669.421936
TLT     87.459633
Name: close, dtype: float64

In [30]:
# 1) One-row wide frame: the Series becomes a single column, and transposing
#    it turns each ticker into its own column.
yh_dfFrame = last_close_by_ticker.to_frame().transpose()
yh_dfFrame

ticker,EEM,GLD,QQQ,SPY,TLT
close,52.599998,399.290009,599.63739,669.421936,87.459633


In [31]:
# 2) Back to long format: one row per ticker with its last close
yh_long = pd.melt(yh_dfFrame, var_name="ticker", value_name="last_close")
yh_long

Unnamed: 0,ticker,last_close
0,EEM,52.599998
1,GLD,399.290009
2,QQQ,599.63739
3,SPY,669.421936
4,TLT,87.459633


In [32]:
# 3) Wide table of closes (index=date, columns=ticker) limited to the first
#    50 distinct dates in us_mkt.
first_50_dates = np.sort(us_mkt["date"].unique())[:50]
us_mkt_first_50 = us_mkt.loc[us_mkt["date"].isin(first_50_dates)]
us_mkt_wide = pd.pivot(
    us_mkt_first_50, index="date", columns="ticker", values="close"
)
us_mkt_wide

ticker,EEM,GLD,QQQ,SPY,TLT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03,44.624966,168.330002,391.679413,451.875153,125.295387
2022-01-04,44.470776,169.570007,386.599182,451.723785,124.774384
2022-01-05,43.745163,169.059998,374.722412,443.049744,124.097061
2022-01-06,43.94471,166.990005,374.459106,442.633575,124.418404
2022-01-07,44.343792,167.75,370.402679,440.883575,123.523964
2022-01-10,44.343792,168.259995,370.646423,440.3349,123.827965
2022-01-11,45.368713,170.289993,376.214355,444.345612,124.652786
2022-01-12,46.121532,170.740005,377.706238,445.546906,124.17527
2022-01-13,45.468487,170.160004,368.257507,439.407898,125.27803
2022-01-14,45.450348,169.669998,370.549042,439.587708,123.385109
