Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.1.1-py3-none-any.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [9]:
import gym
from gym import spaces
import numpy as np
import pandas as pd

class PairTradingEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, data):
        """
        data: pandas DataFrame
            - 'spread'     : Log price spread
            - 'spread_MA'  : Moving Average
            - 'spread_STD' : STD of spre
            - 'Z_score'    : Z-score (spread - MA) / STD)
            - 'price'      : pair price
        """
        super(PairTradingEnv, self).__init__()
        
        self.data = data.reset_index(drop=True)
        self.n_steps = len(self.data)
        self.current_step = 0

        # action: 0-hold, 1-Long, 2-Short
        self.action_space = spaces.Discrete(3)
        
        # [spread, spread_MA, spread_STD, Z_score, price]
        low = -np.inf * np.ones(5)
        high = np.inf * np.ones(5)
        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
        
        # Current Position: 0-hold, 1-Long, 2-Short
        self.position = 0
        self.entry_price = 0

    def reset(self):
        self.current_step = 0
        self.position = 0
        self.entry_price = 0
        return self._next_observation()
    
    def _next_observation(self):
        obs = self.data.iloc[self.current_step][['spread', 'spread_MA', 'spread_STD', 'Z_score', 'price']].values
        return obs.astype(np.float32)
    
    def step(self, action):
        """
        Process one step. 
        action: int, {0: hold, 1: Long, 2: Short}
        """
        done = False
        reward = 0.0
        info = {}
        
        # Move next step
        self.current_step += 1
        if self.current_step >= self.n_steps - 1:
            done = True
        
        current_price = self.data.iloc[self.current_step]['price']
        
        if action == 1:  # Long position.
            if self.position < 0:
                reward += (self.entry_price - current_price)  # Reward using short.
                self.position = 0

            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
        elif action == 2:  # Short Position.
            if self.position > 0:
                reward += (current_price - self.entry_price)  # Reward using Long.
                self.position = 0
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
        else:  # 0: Hold position
            if self.position != 0:
                if self.position == 1:
                    reward += (current_price - self.entry_price)
                else:
                    reward += (self.entry_price - current_price)
                self.position = 0
                self.entry_price = 0
        
        # Update State
        obs = self._next_observation()
        return obs, reward, done, info
    
    def render(self, mode='human', close=False):
        print(f"Step: {self.current_step}, Position: {self.position}, Price: {self.data.iloc[self.current_step]['price']}")

# Testing.
if __name__ == "__main__":
    np.random.seed(42)
    T = 200
    dummy_data = pd.DataFrame({
        'spread': np.random.normal(0, 1, T),
        'spread_MA': np.random.normal(0, 1, T),
        'spread_STD': np.abs(np.random.normal(0, 1, T)),
        'Z_score': np.random.normal(0, 1, T),
        'price': np.linspace(100, 120, T)  
    })
    
    env = PairTradingEnv(dummy_data)
    
    state = env.reset()
    print("Initial State:", state)
    
    for _ in range(5):
        action = env.action_space.sample() # Action and Policy. 
        state, reward, done, info = env.step(action)
        env.render()
        print("Reward:", reward, "\n")
        if done:
            break


초기 상태: [  0.49671414   0.35778737   1.5944277    0.75698864 100.        ]
Step: 1, Position: 1, Price: 100.10050251256281
Reward: 0.0 

Step: 2, Position: -1, Price: 100.20100502512562
Reward: 0.10050251256281229 

Step: 3, Position: 0, Price: 100.30150753768844
Reward: -0.10050251256281229 

Step: 4, Position: 0, Price: 100.40201005025126
Reward: 0.0 

Step: 5, Position: -1, Price: 100.50251256281408
Reward: 0.0 



In [13]:
raw = yf.download(["YUM", "MCD"], start="2020-01-01", progress=False)
print(raw.columns)   

MultiIndex([( 'Close', 'MCD'),
            ( 'Close', 'YUM'),
            (  'High', 'MCD'),
            (  'High', 'YUM'),
            (   'Low', 'MCD'),
            (   'Low', 'YUM'),
            (  'Open', 'MCD'),
            (  'Open', 'YUM'),
            ('Volume', 'MCD'),
            ('Volume', 'YUM')],
           names=['Price', 'Ticker'])


In [17]:
import yfinance as yf
import numpy as np
import pandas as pd

#  5-years
tickers   = ["YUM", "MCD"]
start_day = "2020-01-01"

raw = yf.download(tickers, start=start_day, progress=False)["Close"]

# log spread = ln(P_YUM) − ln(P_MCD)
log_price = np.log(raw)
spread    = log_price["YUM"] - log_price["MCD"]

# 3. Moving average, STD
win = 30
spread_MA   = spread.rolling(win).mean()
spread_STD  = spread.rolling(win).std(ddof=0)
Z_score     = (spread - spread_MA) / spread_STD

# 4. DataFrame
df = pd.DataFrame({
    "spread"    : spread, 
    "spread_MA" : spread_MA, # Recent Average Why?: Indicates the ‘normal (mean) position’ in a mean-reversion strategy. Simply using the overall historical mean reacts too slowly when the time series shifts, so we use a rolling mean instead.
    "spread_STD": spread_STD, # Volatility (σ) over the same window—how much does it fluctuate. Why?: Provides a scale reference to judge whether Fred’s ±5¢ move is ‘large’ or ‘small
    "Z_score"   : Z_score, # With thresholds like ±2σ, you can easily define Long/Short entry and exit rules.  A deep RL model can also instantly perceive the ‘normalized distance’ using only the Z_score.
    "price"     : spread # Log spread +: P_YUM is relatively more expensive than P_MCD. - P_YUM is relatively cheaper than P_MCD. price = 0.1 => e^0.1 = 1.105 110% more expensive 
}).dropna()          # Remove NAN.

In [23]:
print(df)
print(raw)

              spread  spread_MA  spread_STD   Z_score     price
Date                                                           
2020-02-13 -0.699376  -0.682026    0.015843 -1.095174 -0.699376
2020-02-14 -0.695480  -0.683479    0.014988 -0.800715 -0.695480
2020-02-18 -0.703085  -0.685199    0.014154 -1.263594 -0.703085
2020-02-19 -0.689587  -0.686077    0.013571 -0.258654 -0.689587
2020-02-20 -0.699661  -0.687300    0.013077 -0.945223 -0.699661
...              ...        ...         ...       ...       ...
2025-04-21 -0.787156  -0.703192    0.042818 -1.960943 -0.787156
2025-04-22 -0.788363  -0.706253    0.045435 -1.807192 -0.788363
2025-04-23 -0.778858  -0.709824    0.046772 -1.475970 -0.778858
2025-04-24 -0.762068  -0.713152    0.046819 -1.044789 -0.762068
2025-04-25 -0.765610  -0.716598    0.046749 -1.048417 -0.765610

[1307 rows x 5 columns]
Ticker             MCD         YUM
Date                              
2020-01-02  177.814438   92.652519
2020-01-03  177.185684   92.362297
202

In [31]:
from gymnasium import make

# Class pairTradingEnv using dataFrame = df
env = PairTradingEnv(df)

state = env.reset()
print("Initial State:", state)

done = False
total_reward = 0
while not done:
    action = env.action_space.sample()        # test Random action without policy.
    state, reward, done, _ = env.step(action)
    total_reward += reward
    env.render()

print("Total Reward:", total_reward)


Initial State: [-0.69937634 -0.68202597  0.0158426  -1.0951743  -0.69937634]
Step: 1, Position: 0, Price: -0.6954798793314554
Step: 2, Position: 1, Price: -0.703084717243109
Step: 3, Position: -1, Price: -0.6895872665742564
Step: 4, Position: -1, Price: -0.6996611510495638
Step: 5, Position: 0, Price: -0.7067992529965865
Step: 6, Position: 0, Price: -0.7278549152346905
Step: 7, Position: 0, Price: -0.7443924692543913
Step: 8, Position: -1, Price: -0.7395225910206484
Step: 9, Position: -1, Price: -0.746051417214975
Step: 10, Position: 0, Price: -0.755322385569924
Step: 11, Position: 0, Price: -0.7614836645232304
Step: 12, Position: 0, Price: -0.7596335751390102
Step: 13, Position: 0, Price: -0.7608707947180307
Step: 14, Position: -1, Price: -0.7579317935683978
Step: 15, Position: -1, Price: -0.7775102541384289
Step: 16, Position: -1, Price: -0.7811507002162434
Step: 17, Position: 0, Price: -0.8122709395994274
Step: 18, Position: 1, Price: -0.8217217295871047
Step: 19, Position: 0, Price