In [3]:
import numpy as np
import pandas as pd

# cleaning

In [4]:
# Read the exported Dow Jones history; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv('Dow Jones Industrial Average Historical Data.csv')
# read_csv already returns a DataFrame; this wraps it again under the name `df`,
# which is the frame every later cell works on.
df = pd.DataFrame(data)

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,11/28/2025,47716.42,47482.25,47750.77,47475.61,272.51M,0.61%
1,11/26/2025,47427.12,47196.15,47571.4,47196.15,458.14M,0.67%
2,11/25/2025,47112.45,46482.36,47182.9,46341.35,659.61M,1.43%
3,11/24/2025,46448.27,46351.93,46587.71,46108.01,748.69M,0.44%
4,11/21/2025,46245.41,45808.65,46577.5,45781.58,795.91M,1.08%


In [6]:
# Cast the quote columns and the percentage column to float: go through a
# string view, drop the thousands separator / percent sign, then convert.
for name in ('Price', 'Open', 'High', 'Low'):
    as_text = df[name].astype(str)
    df[name] = as_text.str.replace(',', '').astype(float)
change_text = df['Change %'].astype(str)
df['Change %'] = change_text.str.replace('%', '').astype(float)

In [7]:
# Multipliers for the magnitude suffixes used in the 'Vol.' column (e.g. '272.51M').
_VOLUME_MULTIPLIERS = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _parse_volume(raw):
    """Convert one 'Vol.' cell such as '272.51M' or '1.2B' to a plain float.

    Parameters
    ----------
    raw : object
        Cell value; it is converted with ``str()`` first, so already-numeric
        values and NaN are accepted as well.

    Returns
    -------
    float
        The volume as a number. Missing cells (NaN, an empty string, or a
        lone '-') come back as NaN.

    Raises
    ------
    ValueError
        If the text is not a number with an optional K/M/B suffix.
    """
    text = str(raw).strip()
    if text in ('', '-'):
        return float('nan')
    suffix = text[-1]
    if suffix in _VOLUME_MULTIPLIERS:
        return float(text[:-1]) * _VOLUME_MULTIPLIERS[suffix]
    # No suffix: plain number, or 'nan' for a missing cell (float('nan') is NaN).
    return float(text)


# Series.map passes NaN cells to the helper too, so missing volumes stay NaN.
df['Vol.'] = df['Vol.'].map(_parse_volume)

In [8]:
# Parse the 'Date' column into pandas datetimes so rows can be ordered chronologically.
df['Date'] = pd.to_datetime(df['Date'])

In [9]:
# Order the rows oldest-to-newest and renumber the index 0..n-1.
df = df.sort_values('Date', ignore_index=True)


In [10]:
# Display the cleaned, date-sorted frame (pandas elides the middle rows when rendering).
df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2024-01-02,37715.04,37566.22,37790.08,37495.91,350290000.0,0.07
1,2024-01-03,37430.19,37629.23,37629.23,37401.85,329140000.0,-0.76
2,2024-01-04,37440.34,37425.28,37716.41,37425.28,380220000.0,0.03
3,2024-01-05,37466.11,37455.46,37623.62,37323.82,299490000.0,0.07
4,2024-01-08,37683.01,37327.37,37692.92,37249.24,362200000.0,0.58
...,...,...,...,...,...,...,...
476,2025-11-21,46245.41,45808.65,46577.50,45781.58,795910000.0,1.08
477,2025-11-24,46448.27,46351.93,46587.71,46108.01,748690000.0,0.44
478,2025-11-25,47112.45,46482.36,47182.90,46341.35,659610000.0,1.43
479,2025-11-26,47427.12,47196.15,47571.40,47196.15,458140000.0,0.67


## First, detect the states


In [11]:
# Day-over-day relative change of 'Price'; used to tell bull from bear markets.
# fill_method=None avoids pandas' deprecated implicit forward-fill inside pct_change.
df['returns'] = df['Price'].pct_change(fill_method=None)
# Intraday high-low spread relative to the open; a proxy for volatility.
df['range'] = (df['High'] - df['Low']) / df['Open']
# 'Vol.' contains a missing value. Forward-fill it explicitly (what the old
# pct_change default did silently) so the numbers stay the same without
# triggering the FutureWarning.
df['vol_chg'] = df['Vol.'].ffill().pct_change(fill_method=None)

  df['vol_chg'] = df['Vol.'].pct_change()


In [12]:
# Inspect the return series; the first entry is NaN because it has no previous row.
df['returns']

0           NaN
1     -0.007553
2      0.000271
3      0.000688
4      0.005789
         ...   
476    0.010779
477    0.004387
478    0.014299
479    0.006679
480    0.006100
Name: returns, Length: 481, dtype: float64

In [13]:
# Count missing values per column.
df.isna().sum()

Date        0
Price       0
Open        0
High        0
Low         0
Vol.        1
Change %    0
returns     1
range       0
vol_chg     1
dtype: int64

### How `pct_change` works
#### It measures the relative change between two consecutive rows: (current value - previous value) / previous value

In [14]:
# Replace the NaNs in the three affected columns with 0 in a single call;
# all other columns are left untouched.
df = df.fillna({'vol_chg': 0, 'Vol.': 0, 'returns': 0})

In [15]:
# Discretise 'returns' into three equal-frequency buckets (tertiles):
#   0 = lowest third of returns
#   1 = middle third
#   2 = highest third
return_buckets = pd.qcut(df['returns'], q=3, labels=[0, 1, 2])
df['obs'] = return_buckets.astype(int)

# Plain Python list of bucket ids, used as the HMM observation sequence.
obs_seq = df['obs'].tolist()

print("Observation sequence created. First 10 observations:", obs_seq[:10])

Observation sequence created. First 10 observations: [1, 0, 1, 1, 2, 0, 2, 1, 0, 0]


In [16]:
# Re-count missing values per column after the fills and the new 'obs' column.
df.isna().sum()

Date        0
Price       0
Open        0
High        0
Low         0
Vol.        0
Change %    0
returns     0
range       0
vol_chg     0
obs         0
dtype: int64

## Steps for applying the Baum-Welch algorithm

In [17]:
# (rows, columns) of the working frame.
df.shape

(481, 11)

In [18]:
# Scratch check: draw a 3x3 matrix of uniform random samples and display it.
x = np.random.rand(3,3)
x

array([[0.76720877, 0.91489833, 0.55759692],
       [0.27191474, 0.6674527 , 0.79304765],
       [0.83902394, 0.0822601 , 0.36426235]])

In [19]:
# Row totals of x; keepdims=True keeps the reduced axis, giving a (3, 1) column.
x.sum(axis=1, keepdims=True)

array([[2.23970402],
       [1.73241508],
       [1.28554639]])

In [20]:
# Overwrite exact zeros in x with 1, in place (none of the sampled values is zero here).
x[x == 0] = 1  

In [21]:
# Show x again after the zero replacement.
x

array([[0.76720877, 0.91489833, 0.55759692],
       [0.27191474, 0.6674527 , 0.79304765],
       [0.83902394, 0.0822601 , 0.36426235]])

In [22]:
# Seed NumPy's global RNG so the np.random.rand draws in the helpers below are reproducible.
np.random.seed(42)

def transition_matrix(numstat):
    """Draw a random ``numstat`` x ``numstat`` matrix whose rows each sum to 1.

    Entries come from ``np.random.rand`` (NumPy's global RNG) and every row
    is divided by its own total. A row total of exactly 0 is replaced by 1
    before dividing so the division is always defined.
    """
    weights = np.random.rand(numstat, numstat)
    totals = weights.sum(axis=1, keepdims=True)
    safe_totals = np.where(totals == 0, 1, totals)
    return weights / safe_totals

def emission_matrix(numstat, numobs):
    """Draw a random ``numstat`` x ``numobs`` matrix whose rows each sum to 1.

    Entries come from ``np.random.rand`` (NumPy's global RNG) and every row
    is divided by its own total. A row total of exactly 0 is replaced by 1
    before dividing so the division is always defined.
    """
    weights = np.random.rand(numstat, numobs)
    totals = weights.sum(axis=1, keepdims=True)
    safe_totals = np.where(totals == 0, 1, totals)
    return weights / safe_totals

def initial_distribution(numstat):
    """Draw a random length-``numstat`` vector normalised to sum to 1.

    Entries come from ``np.random.rand`` (NumPy's global RNG). If the total
    is exactly 0 the vector is divided by 1 instead, i.e. returned as drawn.
    """
    draws = np.random.rand(numstat)
    total = draws.sum()
    divisor = 1 if total == 0 else total
    return draws / divisor

In [23]:
# Model size: number of hidden states and number of distinct observation symbols.
numstat = 3
numobs = 3  
# Random starting parameters. The three calls draw from NumPy's global RNG one
# after another, so their order determines the values each one receives.
p = transition_matrix(numstat)
e = emission_matrix(numstat,numobs)
pi = initial_distribution(numstat)

In [24]:
def update(gamma, xi, obs_seq, N, M):
    """Re-estimate the HMM parameters from the posteriors (M-step).

    Parameters
    ----------
    gamma : ndarray, indexed [t, state]
        Per-time-step state posteriors.
    xi : ndarray, indexed [t, state_from, state_to]
        Per-time-step transition posteriors.
    obs_seq : sequence of int
        Observed symbol at each time step.
    N : int
        Number of hidden states.
    M : int
        Number of observation symbols.

    Returns
    -------
    tuple
        ``(p_new, e_new, pi_new)``: transition matrix, emission matrix and
        initial distribution. ``pi_new`` is the first row of ``gamma``.
    """
    symbols = np.array(obs_seq)
    start_probs = gamma[0]

    # Summed transition posteriors divided by the summed occupancy of the
    # source state over every step except the last.
    departures = gamma[:-1].sum(axis=0)
    trans = xi.sum(axis=0) / departures[:, None]

    # Column k gathers the posterior mass of the steps where symbol k was
    # observed; each row is then divided by that state's total occupancy.
    emit = np.zeros((N, M))
    for symbol in range(M):
        emit[:, symbol] = gamma[symbols == symbol].sum(axis=0)
    emit /= gamma.sum(axis=0)[:, None]

    return trans, emit, start_probs

In [25]:
def baum_welch(obs_seq, N, M, max_iter=100, tol=1e-6):
    """Fit a discrete HMM to ``obs_seq`` with the Baum-Welch (EM) algorithm.

    The starting parameters are read from the module-level globals ``p``
    (N x N transition matrix), ``e`` (N x M emission matrix) and ``pi``
    (length-N initial distribution). Those globals are overwritten with the
    re-estimated parameters after every iteration that has not converged.

    The forward and backward recursions are rescaled at every time step, so
    the probabilities do not underflow on long sequences.

    Parameters
    ----------
    obs_seq : sequence of int
        Observation symbols, each in ``range(M)``; at least two are needed.
    N : int
        Number of hidden states.
    M : int
        Number of distinct observation symbols.
    max_iter : int, optional
        Maximum number of EM iterations.
    tol : float, optional
        Stop once the largest absolute change in each of ``p``, ``e`` and
        ``pi`` is below this value.

    Returns
    -------
    tuple
        ``(p, e, pi)`` as held by the globals when the loop ends.

    Raises
    ------
    ValueError
        If ``obs_seq`` has fewer than two observations, or if the sequence
        has zero probability under the current parameters.
    """
    global p, e, pi
    obs = np.asarray(obs_seq, dtype=int)
    T = len(obs)
    if T < 2:
        raise ValueError("obs_seq must contain at least two observations")

    for _ in range(max_iter):
        # Forward pass. Each alpha[t] is renormalised to sum to 1 and the
        # normaliser is kept in scale[t].
        alpha = np.zeros((T, N))
        scale = np.zeros(T)
        alpha[0] = pi * e[:, obs[0]]
        for t in range(T):
            if t > 0:
                # alpha[t-1] @ p sums over the previous state for each target state.
                alpha[t] = e[:, obs[t]] * (alpha[t - 1] @ p)
            scale[t] = alpha[t].sum()
            if not scale[t] > 0:  # also catches NaN
                raise ValueError(
                    "observation sequence has zero probability under the "
                    "current parameters (time step %d)" % t
                )
            alpha[t] /= scale[t]

        # Backward pass, rescaled with the same per-step factors so that
        # alpha[t] * beta[t] is directly the state posterior.
        beta = np.zeros((T, N))
        beta[-1] = 1
        for t in range(T - 2, -1, -1):
            beta[t] = p @ (e[:, obs[t + 1]] * beta[t + 1]) / scale[t + 1]

        # State posteriors; the division only removes rounding drift.
        gamma = alpha * beta
        gamma /= gamma.sum(axis=1, keepdims=True)

        # Transition posteriors for every t at once:
        # xi[t, i, j] is proportional to alpha[t, i] * p[i, j] * e[j, o_{t+1}] * beta[t+1, j].
        weighted_next = e[:, obs[1:]].T * beta[1:]  # indexed [t, j]
        xi = alpha[:-1, :, None] * p[None, :, :] * weighted_next[:, None, :]
        xi /= xi.sum(axis=(1, 2), keepdims=True)

        p_new, e_new, pi_new = update(gamma, xi, obs_seq, N, M)

        # Converged: keep the current parameters and stop.
        if (np.max(np.abs(p - p_new)) < tol and
            np.max(np.abs(e - e_new)) < tol and
            np.max(np.abs(pi - pi_new)) < tol):
            break

        p, e, pi = p_new, e_new, pi_new

    return p, e, pi


In [27]:
# Run Baum-Welch
# baum_welch reads its starting parameters from, and writes its estimates back
# to, the global p, e, and pi. The state / symbol counts passed here are the
# same numstat and numobs that were used to build those starting matrices.
p_final, e_final, pi_final = baum_welch(obs_seq, numstat, numobs, max_iter=100)

# Display Results
print("-" * 30)
print("Final Transition Matrix (P):\n", np.round(p_final, 4))
print("\nFinal Emission Matrix (E):\n", np.round(e_final, 4))
print("\nFinal Initial Distribution (Pi):\n", np.round(pi_final, 4))
print("-" * 30)

------------------------------
Final Transition Matrix (P):
 [[0.2601 0.3671 0.3728]
 [0.6471 0.0571 0.2958]
 [0.058  0.601  0.341 ]]

Final Emission Matrix (E):
 [[0.3593 0.0318 0.6089]
 [0.536  0.1171 0.3469]
 [0.1021 0.8378 0.0601]]

Final Initial Distribution (Pi):
 [0. 0. 1.]
------------------------------
