In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the label table; the first CSV column is used as the DataFrame index.
data = pd.read_csv(filepath_or_buffer='./clean_data/labels.csv', index_col=0)

In [30]:
class BayesianLabelPredictor:
    """
    Transition-count model over a sequence of daily labels.

    The model keeps an ``n x n`` matrix of (smoothed) transition counts, where
    ``n`` is the number of distinct labels in the initial dataset. Row ``i`` /
    column ``j`` holds the count of "label i today -> label j tomorrow".
    Rows and columns follow the sorted order of ``unique_labels``.

    Only labels present in the initial dataset are known to the model; any
    other label passed later raises ``IndexError``.
    """

    def __init__(self, dataset, smoothing=1):
        """
        Initialize the class with a dataset of daily labels and optional smoothing.
        :param dataset: A non-empty list or 1-D numpy array of daily labels
                        (integers or categorical), oldest first
        :param smoothing: Pseudo-count added to every cell of the matrix
                          (Laplace smoothing, default = 1)
        :raises IndexError: If ``dataset`` is empty
        """
        self.dataset = np.array(dataset)
        if self.dataset.size == 0:
            # Same exception type the bare ``dataset[-1]`` lookup would raise,
            # but with a message that says what is actually wrong.
            raise IndexError("dataset must contain at least one label")

        self.unique_labels = np.unique(self.dataset)
        self.n = len(self.unique_labels)
        self.smoothing = smoothing
        # The most recent label is the starting point for the next update.
        self.today_label = self.dataset[-1]

        # Built once so each label -> matrix position lookup is a dict access
        # rather than a linear scan of ``unique_labels``.
        self._label_to_index = {
            label: position for position, label in enumerate(self.unique_labels)
        }

        # Calculate the prior matrix (today's label on rows, tomorrow's on
        # columns) from the dataset.
        self.calculate_prior()

    def _index_of(self, label):
        """
        Return the row/column position of ``label`` in the matrix.
        :param label: A label that was present in the initial dataset
        :raises IndexError: If ``label`` is not one of ``unique_labels``
        """
        try:
            index = self._label_to_index.get(label)
        except TypeError:
            # Unhashable values can never be a known label.
            index = None
        if index is None:
            raise IndexError(
                "label {!r} was not present in the initial dataset; "
                "known labels: {}".format(label, self.unique_labels.tolist())
            )
        return index

    def calculate_prior(self):
        """
        Calculate the initial prior matrix from the dataset.

        The matrix is rebuilt from scratch: every cell starts at ``smoothing``
        and each pair of consecutive labels in ``dataset`` adds 1 to its
        (today, tomorrow) cell. Any counts added through
        ``update_with_new_observation`` are therefore discarded.
        """
        # Use Laplace smoothing
        counts = np.ones((self.n, self.n)) * self.smoothing

        # Matrix position of every label in the sequence, in order.
        codes = np.array(
            [self._index_of(label) for label in self.dataset], dtype=np.intp
        )

        # Consecutive pairs (codes[i], codes[i + 1]) are the observed transitions.
        # np.add.at accumulates repeated pairs; a plain ``counts[rows, cols] += 1``
        # would count each distinct pair only once.
        np.add.at(counts, (codes[:-1], codes[1:]), 1)

        self.prior_matrix = counts

    def update_with_new_observation(self, tomorrow_label):
        """
        Record the transition from the current label to ``tomorrow_label``.

        After the count is updated, ``tomorrow_label`` becomes the current
        label, and the returned vector is the normalized row of the matrix for
        that label, i.e. the estimated distribution of the label that follows
        ``tomorrow_label``.
        :param tomorrow_label: Tomorrow's observed label
        :return: 1-D array of length ``n`` (ordered like ``unique_labels``)
        :raises IndexError: If ``tomorrow_label`` is not a known label; in that
                            case the model state is left unchanged
        """
        # Resolve both positions before mutating anything, so an unknown label
        # cannot leave the model half-updated.
        today_index = self._index_of(self.today_label)
        tomorrow_index = self._index_of(tomorrow_label)

        # Update the count for this transition in the prior matrix
        self.prior_matrix[today_index, tomorrow_index] += 1
        self.today_label = tomorrow_label

        # Distribution of the next label given the label just observed
        next_counts = self.prior_matrix[tomorrow_index]
        return next_counts / next_counts.sum()

    def get_prior_matrix(self):
        """
        Get the current state of the prior matrix.
        :return: The current prior matrix (2D array of smoothed counts, not
                 normalized probabilities)
        """
        return self.prior_matrix

In [62]:
  
def main(data, knowledge_before_date, smoothing_alpha=2, label_column='Group'):
    """
    Fit a BayesianLabelPredictor on early rows, then update it row by row.

    Rows whose index is ``< knowledge_before_date`` build the initial
    transition counts; rows whose index is ``>= knowledge_before_date`` are
    then fed to the predictor one at a time, in their existing order.

    :param data: DataFrame whose index can be compared with
                 ``knowledge_before_date`` and which has a ``label_column``
                 column. NOTE(review): presumably the index holds ISO-formatted
                 date strings sorted ascending - confirm against the CSV.
    :param knowledge_before_date: Cut-off separating prior rows from update rows
    :param smoothing_alpha: Smoothing pseudo-count passed to the predictor
    :param label_column: Name of the column holding the labels (default 'Group')
    :return: DataFrame indexed like the update rows. The row at index ``d``
             is the vector returned by ``update_with_new_observation`` for the
             label observed at ``d``, i.e. the predictor's distribution for the
             label that follows it. Columns are positions 0..n-1 in the
             predictor's sorted ``unique_labels``, not the label values.
    :raises IndexError: If no rows precede the cut-off, or an update row holds
                        a label that never appears before the cut-off
    """
    # split dataset into prior, and update
    prior_labels = data.loc[data.index < knowledge_before_date, label_column].values
    observed = data.loc[data.index >= knowledge_before_date, label_column]

    # initialize Bayesian prior; the constructor already computes the prior
    # matrix, so no separate calculate_prior() call is needed.
    predictor = BayesianLabelPredictor(prior_labels, smoothing=smoothing_alpha)

    # One probability vector per update row, collected in order.
    next_label_probabilities = [
        predictor.update_with_new_observation(label) for label in observed.values
    ]
    return pd.DataFrame(next_label_probabilities, index=observed.index)
    

In [64]:
# Rows before 2002-01-01 seed the predictor; later rows update it one at a time.
main(data, knowledge_before_date='2002-01-01', smoothing_alpha=2)

Unnamed: 0_level_0,0,1,2,3,4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-01-02,0.268293,0.024390,0.356098,0.292683,0.058537
2002-01-03,0.330000,0.010000,0.270000,0.265000,0.125000
2002-01-04,0.266990,0.024272,0.354369,0.296117,0.058252
2002-01-07,0.270531,0.024155,0.352657,0.294686,0.057971
2002-01-08,0.274038,0.024038,0.350962,0.293269,0.057692
...,...,...,...,...,...
2024-07-24,0.396680,0.033195,0.268050,0.239004,0.063071
2024-07-25,0.453111,0.025416,0.233129,0.222612,0.065732
2024-07-26,0.706106,0.010312,0.139213,0.133786,0.010583
2024-07-29,0.706186,0.010309,0.139175,0.133749,0.010581
