# T1 

### Linear Regression

In [None]:
class DebugMarkovChain(MarkovChain):
    """Markov chain built from an existing chain, with sub-state bookkeeping.

    Reuses the ``states`` and ``transition_matrix`` of another chain object,
    attaches a ``DebugStateClassifier`` and adds helpers that record and
    normalise transitions between a meta-state and its sub-states.
    """

    def __init__(self, markov_chain_obj, states_dict):
        """Initialise from an existing chain.

        Args:
            markov_chain_obj: Object exposing ``states`` and
                ``transition_matrix`` attributes.
            states_dict: Mapping ``pair -> dict``. This class reads the
                ``'activation_model'`` and ``'DB_scan'`` entries of each value.
        """
        super().__init__(markov_chain_obj.states, states_dict)
        # Plain assignment: the matrix object is shared with markov_chain_obj,
        # not duplicated.
        self.transition_matrix = markov_chain_obj.transition_matrix
        self.classifier = DebugStateClassifier(states_dict)
        self.states_dict = states_dict

    def current_state(self, pair, current_sample):
        """Return the cluster label nearest to ``current_sample`` for ``pair``.

        The sample is passed through the pair's activation model and matched
        to the closest core sample of the pair's fitted clustering.

        Args:
            pair: Key into ``self.states_dict``.
            current_sample: Input accepted by the activation model's
                ``predict`` (assumed to hold at least one row; only the first
                row's label is returned).

        Returns:
            The ``labels_`` entry of the closest core sample.
        """
        entry = self.states_dict[pair]
        current_activations = entry['activation_model'].predict(current_sample)

        # Assumes a fitted scikit-learn DBSCAN — TODO confirm.
        db = entry['DB_scan']
        closest_core, _ = pairwise_distances_argmin_min(current_activations, db.components_)

        # components_ contains only the core samples, while labels_ has one
        # entry per fitted sample. Translate the core-sample position back to
        # its position in the fitted data before looking up the label.
        sample_index = db.core_sample_indices_[closest_core[0]]
        return db.labels_[sample_index]

    def substate_create_new_probability_matrix(self, original_matrix):
        """Convert per-pair transition counts into transition probabilities.

        Args:
            original_matrix: ``{pair: {from_state: {to_state: count}}}``.

        Returns:
            A new dict of the same shape whose counts are divided by the row
            total. Rows whose ``from_state`` is not an ``int``, ``np.int64``
            or ``str`` are omitted. A row whose counts sum to zero gets a
            probability of 0.0 for every target instead of raising
            ``ZeroDivisionError``.
        """
        new_prob_matrix = {}
        for pair, transitions in original_matrix.items():
            pair_prob_matrix = {}
            for from_state, to_states in transitions.items():
                # Drop this filter to include every kind of state key.
                if not isinstance(from_state, (int, np.int64, str)):
                    continue
                total_transitions = sum(to_states.values())
                if total_transitions == 0:
                    pair_prob_matrix[from_state] = {to_state: 0.0 for to_state in to_states}
                else:
                    pair_prob_matrix[from_state] = {
                        to_state: count / total_transitions
                        for to_state, count in to_states.items()
                    }
            new_prob_matrix[pair] = pair_prob_matrix
        return new_prob_matrix

    def substate_update_transition_matrix(self, original_matrix, pair, sub_states, meta_state, next_meta_state=None):
        """Record one observed sub-state sequence in ``original_matrix[pair]``.

        Adds one count for each of: meta-state -> first sub-state, every
        consecutive sub-state pair, and last sub-state -> ``next_meta_state``
        (or back to ``meta_state`` when no next meta-state is given).
        Sub-state keys are strings of the form ``"<meta_state>-<sub_state>"``.

        Args:
            original_matrix: ``{pair: {from_state: {to_state: count}}}``;
                updated in place.
            pair: Key whose transition table is updated (created if missing).
            sub_states: Non-empty sequence of values convertible to ``int``.
            meta_state: Value convertible to ``int``.
            next_meta_state: Optional state entered after the last sub-state.

        Raises:
            IndexError: If ``sub_states`` is empty.
        """
        sub_states = [int(s) for s in sub_states]
        meta_state = int(meta_state)  # plain int so dict keys stay uniform
        labels = [f"{meta_state}-{s}" for s in sub_states]

        new_transitions = defaultdict(lambda: defaultdict(int))

        # Entry into the sequence is counted exactly once (it used to be
        # incremented in two separate places, which double-weighted it).
        new_transitions[meta_state][labels[0]] += 1
        print(f'Added count of {new_transitions[meta_state]}')

        # Transitions between consecutive sub-states.
        for from_state, to_state in zip(labels, labels[1:]):
            new_transitions[from_state][to_state] += 1

        # Leaving the sequence: go to the next meta-state when provided,
        # otherwise fall back to the originating meta-state.
        exit_state = meta_state if next_meta_state is None else next_meta_state
        new_transitions[labels[-1]][exit_state] += 1

        # Merge the new counts into the table for this pair.
        pair_matrix = original_matrix.setdefault(pair, {})
        for from_state, to_states in new_transitions.items():
            row = pair_matrix.setdefault(from_state, {})
            for to_state, count in to_states.items():
                row[to_state] = row.get(to_state, 0) + count
class DebugStateClassifier:
    """Nearest-neighbour state lookup in activation space, one model per pair."""

    def __init__(self, states_dict):
        """Fit a 1-nearest-neighbour index for every pair.

        Args:
            states_dict: Mapping ``pair -> dict`` with at least the keys
                ``"activation_model"`` (object with ``predict``) and
                ``"data"`` (input for that model; presumably the original
                feature rows of the pair — confirm against the caller).
                ``classify_sample`` additionally reads ``"states"``.
        """
        self.states_dict = states_dict
        # One NearestNeighbors model per pair, fitted on the pair's activations.
        self.nn_models = {}
        for pair, values in self.states_dict.items():
            activations = values["activation_model"].predict(values["data"])
            self.nn_models[pair] = NearestNeighbors(n_neighbors=1).fit(activations)

    def classify_sample(self, sample, pair):
        """Return the state of the fitted sample closest to ``sample``.

        Args:
            sample: Input accepted by the pair's activation model.
            pair: Key into ``self.states_dict`` / ``self.nn_models``.

        Returns:
            ``states[i]`` where ``i`` is the index of the nearest fitted
            activation for the first row of ``sample``. This assumes
            ``"states"`` is aligned row-for-row with ``"data"`` — TODO confirm.
        """
        activation = self.states_dict[pair]["activation_model"].predict(sample)
        _, index = self.nn_models[pair].kneighbors(activation)
        states = self.states_dict[pair]["states"]
        return states[index[0][0]]

In [None]:

# Define the objective function to minimize (MSE)
def objective(params):
    """Return the mean squared error of the linear predictor for ``params``.

    Reads the module-level ``X_test`` and ``y_test``: the prediction is
    ``np.dot(X_test, params)`` and the error is measured against ``y_test``.
    """
    residuals = np.dot(X_test, params) - y_test
    return np.mean(residuals ** 2)

predictions = {}


def _linear_mse(params, features, targets):
    """Mean squared error of the linear predictor ``features · params``."""
    return np.mean((np.dot(features, params) - targets) ** 2)


for pair, features_df in features_dict.items():

    # Prepare the data
    X = features_df.values  # Feature values
    # Target starts at row 59 — presumably to line up with features built on a
    # 60-row look-back window; TODO confirm against the feature construction.
    y = coint_dict[pair].values[59:]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Baseline: ordinary least squares fitted on the training split
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    baseline_mse = mean_squared_error(y_test, y_pred)

    # Start the search from an all-ones parameter vector
    initial_params = np.ones(X_train.shape[1])

    # Basin-hopping global search. The parameters are optimised on the TRAINING
    # split only; optimising them against the test split (as before) and then
    # scoring on that same split makes the reported error meaningless.
    result = opt.basinhopping(
        _linear_mse,
        initial_params,
        niter=100,
        stepsize=0.5,
        minimizer_kwargs={"args": (X_train, y_train)},
    )

    # Get the optimized parameters
    optimized_params = result.x

    # Predict the held-out rows with the optimized parameters
    y_pred_optimized = np.dot(X_test, optimized_params)

    # NOTE(review): this keeps the prediction for the first row of the shuffled
    # test split, not for the most recent observation — confirm that is intended.
    predictions[pair] = y_pred_optimized[0]

    # Evaluate the optimized parameters on data they were not fitted to
    mse = mean_squared_error(y_test, y_pred_optimized)
    r_squared_optimized = r2_score(y_test, y_pred_optimized)
    print(f"Mean Squared Error for {pair}: {mse}")

print(predictions)

In [None]:
import datetime

# Define the objective function to minimize (MSE)
def objective(params):
    """Mean squared error between ``np.dot(X_test, params)`` and ``y_test``.

    ``X_test`` and ``y_test`` are looked up as module-level names at call time.
    """
    squared_errors = (np.dot(X_test, params) - y_test) ** 2
    return np.mean(squared_errors)

# Per-pair results, keyed by pair name.
predictions_today = {}
predictions_tomorrow = {}
change_in_predictions = {}

# Attach the local timezone: on a naive datetime the %Z directive renders as an
# empty string, leaving the timestamp without a zone and with a trailing space.
current_datetime = datetime.datetime.now().astimezone()
current_date_str = current_datetime.strftime('%Y-%m-%d %H:%M:%S %Z')

def standard_linear_regress():
    """Fit a linear regression per pair and print its latest predictions.

    For every ``(pair, features_df)`` in the module-level ``features_dict``:
    trains ``LinearRegression`` on a random 80% split of all feature rows
    except the last two (targets taken from ``coint_dict[pair]`` rows
    ``59:-2``), then predicts from the second-to-last row ("today"), the last
    row ("tomorrow") and the third-to-last row (used for the change figure).
    Results are written into the module-level ``predictions_today``,
    ``predictions_tomorrow`` and ``change_in_predictions`` dicts, after which
    a report is printed for every pair present in ``predictions_today``.
    """
    print('Beginning Normal Standard Linear Regression')
    print('')

    for pair, features_df in features_dict.items():
        # Hold back the final two rows: they are the inputs for the
        # "today" and "tomorrow" predictions below.
        X = features_df.values[:-2]
        y = coint_dict[pair].values[59:-2]

        # Only the training portion of the split is used here.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

        model = LinearRegression()
        model.fit(X_train, y_train)

        def predict_row(position):
            # Single-row prediction for the feature row at ``position``.
            return model.predict([features_df.iloc[position, :].values])[0]

        prediction_today = predict_row(-2)
        prediction_tomorrow = predict_row(-1)

        predictions_today[pair] = prediction_today
        predictions_tomorrow[pair] = prediction_tomorrow
        change_in_predictions[pair] = prediction_today - predict_row(-3)

    print(f"Time is {current_date_str}.")
    for pair in predictions_today:
        latest_spread = compute_spread(pair).iloc[-1]
        current_price = round(latest_spread, 5)
        print(f"For {pair}:")
        print(f"Today's prediction: {predictions_today[pair]:.5f}. Current price: {current_price}")
        print(f"Tomorrow's prediction: {predictions_tomorrow[pair]:.5f}")
        print(f"Change in prediction compared to yesterday: {change_in_predictions[pair]:.5f}")
        print("-----")


standard_linear_regress()

### ARIMA

In [None]:
#TESTING
# One-step-ahead rolling forecasts, kept per pair so that results are not
# thrown away when the loop moves on to the next pair.
rolling_forecasts = {}

for pair, data in coint_dict.items():
    print(f"Running through {pair}")
    split = int(len(data) * 0.80)
    train_set, test_set = data[:split], data[split:]

    # Fit once on the training window; its parameters warm-start every refit below.
    model = ARIMA(train_set, order=(2, 1, 2))
    model_fit_0 = model.fit()

    # Plain list of observed values that grows as the window rolls forward
    past = train_set.tolist()

    # Forecasts for this pair
    predictions = []

    # Keep only the first 50 test observations; refitting on the whole test
    # set works too but takes considerably longer.
    test_set = test_set[:50]

    # Iterate over the values themselves. Indexing the sliced series with
    # test_set[i] looks labels up (the slice's labels start at `split`, not 0)
    # and relies on a deprecated positional fallback for non-integer indexes.
    for actual in test_set.tolist():
        # Refit on everything observed so far
        model = ARIMA(past, order=(2, 1, 2))
        model_fit = model.fit(start_params=model_fit_0.params)
        # One-step-ahead forecast
        forecast_results = model_fit.forecast()
        pred = forecast_results[0]
        predictions.append(pred)
        # Reveal the true value before forecasting the next step
        past.append(actual)

    rolling_forecasts[pair] = predictions

NameError: name 'coint_dict' is not defined

In [None]:
# AR AND MA
from statsmodels.graphics.tsaplots import plot_pacf
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names. Use the new name when this
# installation provides it and fall back to the legacy name otherwise.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# Tune the AR order: for each pair, fit ARIMA(p, 1, 0) for p = 1..5 on the
# first 80% of the series and plot the AIC and BIC of every fit.
for pair, data in coint_dict.items():
    print(f"Running through {pair}")
    split = int(len(data)*0.80)
    train_set, test_set = data[:split], data[split:]

    # Candidate AR orders: 1, 2, 3, 4, 5
    p = range(1, 6)

    # One fitted model per candidate order
    fitted_models = [ARIMA(train_set, order=(ar_order, 1, 0)).fit() for ar_order in p]

    # Information criteria, in the same order as p
    aic_p = [fitted.aic for fitted in fitted_models]
    bic_p = [fitted.bic for fitted in fitted_models]

    # AIC (red) and BIC against the AR order
    plt.figure(figsize=(15, 7))
    plt.plot(p, aic_p, color='red', label='AIC score')
    plt.plot(p, bic_p, label='BIC score')
    plt.title('Tuning AR term')
    plt.xlabel('p (AR term)')
    plt.ylabel('AIC/BIC score')
    plt.legend()
    plt.show()

    
# Tune the MA order: for each pair, fit ARIMA(0, 1, q) for q = 1..5 on the
# first 80% of the series and plot the AIC and BIC of every fit.
for pair, data in coint_dict.items():
    print(f"Running through {pair}")
    split = int(len(data)*0.80)
    train_set, test_set = data[:split], data[split:]

    # Candidate MA orders
    q = range(1, 6)

    # Collect (AIC, BIC) for every candidate order
    scores = []
    for ma_order in q:
        fitted = ARIMA(train_set, order=(0, 1, ma_order)).fit()
        scores.append((fitted.aic, fitted.bic))

    # Split the score pairs into the two series that get plotted
    aic_q = [aic for aic, _ in scores]
    bic_q = [bic for _, bic in scores]

    # AIC (red) and BIC against the MA order
    plt.figure(figsize=(15, 7))
    plt.plot(q, aic_q, color='red', label='AIC score')
    plt.plot(q, bic_q, label='BIC score')
    plt.title('Tuning MA term')
    plt.xlabel('q (MA term)')
    plt.ylabel('AIC/BIC score')
    plt.legend()
    plt.show()