In [None]:
from calibrate.cluster_model import ClusterFactory, ClusteringOptimizer
from calibrate.cointegration import PriceData, CointegrrationTest
from calibrate.OU_calibration import OUCalibration
from calibrate.baskets import Basket
from strategy.simple_implementation import OUTrader
import matplotlib

2010-01-01


In [2]:
# Compare clustering algorithms: run the optimizer's parameter search for each
# candidate on the same sector and date window, and remember the best
# (parameters, score) pair per algorithm so a later cell can pick the winner.

# Universe and calibration window (dates are written as YYYYMMDD integers).
sector = "Information Technology"
start_date = 20100101
end_date = 20170101

candidate_methods = ("kmeans", "gmm", "hierarchical", "spectral")

results = {}
for method in candidate_methods:
    print(f"\n--- Testing {method.upper()} ---")

    # A fresh model per algorithm, created without explicit hyper-parameters;
    # the optimizer is responsible for searching over them.
    model = ClusterFactory.create(method, sector, start_date, end_date)
    optimizer = ClusteringOptimizer(model)

    # search() returns three values; only the first two are kept in `results`.
    best_params, best_score, all_results = optimizer.search()

    print(f"Best_parameters: {best_params}")
    print(f"custom score: {best_score}")

    results[method] = (best_params, best_score)







--- Testing KMEANS ---
Best_parameters: {'n_clusters': 10}
custom score: 0.8903151363655653

--- Testing GMM ---
Best_parameters: {'n_components': 10, 'covariance_type': 'full'}
custom score: 0.9957682241937804

--- Testing HIERARCHICAL ---
Best_parameters: {'n_clusters': 9, 'linkage': 'average'}
custom score: 0.8989953733213722

--- Testing SPECTRAL ---
Best_parameters: {'n_clusters': 10, 'affinity': 'nearest_neighbors'}
custom score: 0.6654449149537476


In [3]:
# Choose the algorithm whose search produced the highest score, refit it with
# the winning hyper-parameters, and load prices for the resulting clusters.

# `results` maps method name -> (best_params, best_score); rank by the score.
best_method = max(results, key=lambda name: results[name][1])
best_params, best_score = results[best_method]

print(f"\nBest method: {best_method.upper()} with score {best_score}")

# Rebuild the winning model, this time passing its tuned parameters explicitly.
model = ClusterFactory.create(
    best_method,
    sector,
    start_date,
    end_date,
    **best_params,
)

model.fit()
model.show_baskets()
model.evaluate()

# Cluster assignments feed both the price download and the cointegration tests.
clustered_data = model.get_clustered_data()
price_data = PriceData.get_time_series(clustered_data, start_date, end_date)


Best method: GMM with score 0.9957682241937804

Cluster 0 (7 items):
['AKAM', 'AMAT', 'AVGO', 'EA', 'JNPR', 'LRCX', 'CRM']

Cluster 1 (14 items):
['ACN', 'GOOGL', 'GOOG', 'APH', 'ANSS', 'CSCO', 'IT', 'GPN', 'IBM', 'MCHP', 'MSFT', 'QCOM', 'TEL', 'VRSN']

Cluster 2 (2 items):
['AAPL', 'NVDA']

Cluster 3 (2 items):
['AMD', 'MU']

Cluster 4 (5 items):
['ADSK', 'FFIV', 'HPE', 'STX', 'WDC']

Cluster 5 (6 items):
['ADP', 'FIS', 'INTU', 'MSI', 'PAYX', 'SNPS']

Cluster 6 (2 items):
['QRVO', 'SWKS']

Cluster 7 (7 items):
['CDNS', 'GLW', 'DXC', 'HPQ', 'KLAC', 'NTAP', 'XRX']

Cluster 8 (3 items):
['CSRA', 'NFLX', 'PYPL']

Cluster 9 (10 items):
['ADBE', 'ADI', 'CTSH', 'EBAY', 'INTC', 'MA', 'ORCL', 'TXN', 'V', 'WU']


  prices = yf.download(batch, start=start_date, end=end_date)['Close']
[*********************100%***********************]  20 of 20 completed
  prices = yf.download(batch, start=start_date, end=end_date)['Close']
[*********************100%***********************]  20 of 20 completed

1 Failed download:
['JNPR']: YFPricesMissingError('possibly delisted; no price data found  (1d 2010-01-01 -> 2017-01-01) (Yahoo error = "No data found, symbol may be delisted")')
  prices = yf.download(batch, start=start_date, end=end_date)['Close']
[*********************100%***********************]  18 of 18 completed


In [4]:
# Run the cointegration tests on every cluster and print each outcome.
#
# The outcomes are stored as `cointegration_results` rather than `results`:
# `results` already holds the clustering search summary
# ({method: (best_params, best_score)}) that the method-selection cell reads,
# and overwriting it here would make that cell fail when re-run.
cointegration_test = CointegrrationTest(price_data, clustered_data)
cointegration_results = cointegration_test.run_cointegration_tests()

print("\n--- Cointegration Test Results ---")
for cluster, result in cointegration_results.items():
    print(f"Cluster: {cluster}")
    print(f"  Type: {result.type}")
    print(f"  Cointegrated: {result.cointegrated}")

    # The fields reported depend on which test was applied to the cluster.
    if result.type == 'Engle-Granger':
        print(f"  P-value: {result.p_value}")
        print(f"  Alpha: {result.alpha}")
        print(f"  Beta: {result.beta}")
        print(f"  Ticker 1: {result.ticker_1}")
        print(f"  Ticker 2: {result.ticker_2}")
    elif result.type == 'Johansen':
        print(f"  Rank: {result.rank}")
        print(f"  Chosen Weights: {result.chosen_weights}")
        print(f"  Ticker List: {result.ticker_list}")

  result = coint_johansen(series, det_order=0, k_ar_diff=1)


Running cointegration tests...
series1: 1762, series2: 1762
series1: 1762, series2: 1762
series1: 504, series2: 504

--- Cointegration Test Results ---
Cluster: 1
  Type: Johansen
  Cointegrated: True
  Rank: 2
  Chosen Weights: [ 1.          4.05831598 -4.27325294 -0.62854084 -0.64122908  0.41614516
 -0.42322398 -0.4785672  -1.15296206  0.0338679  -0.0457668   1.10905596
 -0.03383882  0.74498771]
  Ticker List: ['ACN', 'GOOGL', 'GOOG', 'APH', 'ANSS', 'CSCO', 'IT', 'GPN', 'IBM', 'MCHP', 'MSFT', 'QCOM', 'TEL', 'VRSN']
Cluster: 9
  Type: Johansen
  Cointegrated: False
  Rank: None
  Chosen Weights: None
  Ticker List: ['ADBE', 'ADI', 'CTSH', 'EBAY', 'INTC', 'MA', 'ORCL', 'TXN', 'V', 'WU']
Cluster: 3
  Type: Engle-Granger
  Cointegrated: False
  P-value: 0.7994944493392497
  Alpha: 0.9922153679162885
  Beta: -0.32042697415794075
  Ticker 1: AMD
  Ticker 2: MU
Cluster: 0
  Type: Johansen
  Cointegrated: False
  Rank: None
  Chosen Weights: None
  Ticker List: ['AKAM', 'AMAT', 'AVGO', 'EA',

In [5]:
# Keep only the clusters that passed the cointegration test and summarise them.
# NOTE: the variable name is misspelled ("clustrs") but later cells read it
# under this exact name, so it is kept as-is here.
cointegrated_clustrs = cointegration_test.get_cointegrated_clusters()

print("\n--- Cointegrated Clusters ---")
for cluster in cointegrated_clustrs:
    print(f"Cluster: {cluster}")

    # Look the entry up once instead of re-indexing on every line.
    outcome = cointegrated_clustrs[cluster]
    test_kind = outcome.type
    print(f"  Cointegration Type: {test_kind}")

    if test_kind == 'Engle-Granger':
        # Pairwise test: report the regression coefficients and the two legs.
        print(f"  P-value: {outcome.p_value}")
        print(f"  Alpha: {outcome.alpha}")
        print(f"  Beta: {outcome.beta}")
        print(f"  Ticker 1: {outcome.ticker_1}")
        print(f"  Ticker 2: {outcome.ticker_2}")
    elif test_kind == 'Johansen':
        # Multi-asset test: report the rank, the weight vector and the tickers.
        print(f"  Rank: {outcome.rank}")
        print(f"  Chosen Weights: {outcome.chosen_weights}")
        print(f"  Ticker List: {', '.join(outcome.ticker_list)}")

   

Running cointegration tests...
series1: 1762, series2: 1762


  result = coint_johansen(series, det_order=0, k_ar_diff=1)


series1: 1762, series2: 1762
series1: 504, series2: 504

--- Cointegrated Clusters ---
Cluster: 1
  Cointegration Type: Johansen
  Rank: 2
  Chosen Weights: [ 1.          4.05831598 -4.27325294 -0.62854084 -0.64122908  0.41614516
 -0.42322398 -0.4785672  -1.15296206  0.0338679  -0.0457668   1.10905596
 -0.03383882  0.74498771]
  Ticker List: ACN, GOOGL, GOOG, APH, ANSS, CSCO, IT, GPN, IBM, MCHP, MSFT, QCOM, TEL, VRSN
Cluster: 7
  Cointegration Type: Johansen
  Rank: 1
  Chosen Weights: [  1.         157.75303688 -41.60020207 -47.14473777 -26.10948774
 -12.39030415 -23.9982515 ]
  Ticker List: CDNS, GLW, DXC, HPQ, KLAC, NTAP, XRX
Cluster: 8
  Cointegration Type: Johansen
  Rank: 1
  Chosen Weights: [ 1.         -0.68700063  0.14611871]
  Ticker List: CSRA, NFLX, PYPL
Cluster: 6
  Cointegration Type: Engle-Granger
  P-value: 0.19850443490326009
  Alpha: -0.13187555959590375
  Beta: 1.0405672675935893
  Ticker 1: QRVO
  Ticker 2: SWKS


In [6]:
# Calibrate an Ornstein-Uhlenbeck (OU) process to the spread of each
# cointegrated cluster and print the fitted parameters.

# Step 1: build the spread for each cointegrated cluster
oucal = OUCalibration(price_data, cointegrated_clustrs)
spreads = oucal.form_spread()

# Step 2: fit OU parameters for each spread
ou_params = oucal.fit_ou_parameters(spreads)

# Step 3: report the fit, one cluster per pair of lines
for cluster in ou_params:
    print(f"Cluster {cluster}:")
    result = ou_params[cluster]
    theta = result.theta
    mu = result.mu
    sigma = result.sigma
    half_life = result.half_life
    print(f" theta={theta:.4f}, mean={mu:.4f}, vol={sigma:.4f}, half-life={half_life:.2f}")


Cluster 1:
 theta=0.0797, mean=-1.2634, vol=0.0264, half-life=8.70
Cluster 7:
 theta=0.0253, mean=-15.9782, vol=2.4449, half-life=27.43
Cluster 8:
 theta=0.0931, mean=0.6055, vol=0.0282, half-life=7.44
Cluster 6:
 theta=0.0216, mean=-0.4666, vol=0.0231, half-life=32.08


In [7]:
# Build one Basket per calibrated cluster by pairing its cointegration result
# with its fitted OU parameters; the list is handed to the trader afterwards.
basket_list = []
for cluster in ou_params:
    cluster_id = cluster
    print(f"Cluster id: {cluster_id}")

    # Both mappings are keyed by the same cluster identifier.
    params, cointegration_result = ou_params[cluster], cointegrated_clustrs[cluster]

    basket = Basket(
        cluster_id,
        cointegration_result,
        params,
    )
    basket_list.append(basket)

Cluster id: 1
Cluster id: 7
Cluster id: 8
Cluster id: 6


In [None]:
# Backtest the OU trading strategy on the calibrated baskets.
#
# The backtest window (2018-01-01 to 2025-07-01, written as YYYYMMDD integers)
# gets its own names instead of reusing `start_date` / `end_date`. Those names
# hold the 2010-2017 window that the clustering and price-loading cells read;
# overwriting them here would make a re-run of those cells silently use the
# backtest period instead.
backtest_start_date = 20180101
backtest_end_date = 20250701

trader = OUTrader(backtest_start_date, backtest_end_date, basket_list)

trading_results = trader.run()

  prices = yf.download(batch, start=start_date, end=end_date)['Close']
[*********************100%***********************]  20 of 20 completed
  prices = yf.download(batch, start=start_date, end=end_date)['Close']
[*********************100%***********************]  6 of 6 completed

{'portfolio_pnl': Date
2018-01-02    0.000000e+00
2018-01-03   -3.054602e+07
2018-01-04   -1.610334e+08
2018-01-05   -3.578829e+08
2018-01-08   -1.220091e+07
                  ...     
2025-06-24   -2.467128e+09
2025-06-25   -2.679877e+08
2025-06-26   -5.078687e+08
2025-06-27    9.038422e+08
2025-06-30   -1.195928e+08
Length: 1883, dtype: float64, 'portfolio_cum_pnl': Date
2018-01-02    0.000000e+00
2018-01-03   -3.054602e+07
2018-01-04   -1.915794e+08
2018-01-05   -5.494623e+08
2018-01-08   -5.616632e+08
                  ...     
2025-06-24   -3.572732e+10
2025-06-25   -3.599531e+10
2025-06-26   -3.650318e+10
2025-06-27   -3.559934e+10
2025-06-30   -3.571893e+10
Length: 1883, dtype: float64, 'basket_results': {np.int64(1): {'spread': Date
2018-01-02    -52.294584
2018-01-03    -56.346042
2018-01-04    -58.701253
2018-01-05    -58.383274
2018-01-08    -58.961077
                 ...    
2025-06-24   -175.668346
2025-06-25   -183.760601
2025-06-26   -183.315173
2025-06-27   -176.139778


