In [1]:
import numpy as np
import pandas as pd

"""read data from daily_return_500_2020_2022.csv""" 
df = pd.read_csv('daily_return_500_2020_2022.csv', index_col=0)

In [2]:
"""e(x,y) is a function that x, y are pandas series and return the square root of the culmulation difference of the two series"""
stock_names = df.columns.to_numpy()
stock_names = stock_names[1:]
stock_index = np.arange(0, len(stock_names))
numpy_stock_return = df.iloc[:, 1:].to_numpy()
numpy_stock_return = numpy_stock_return.T
"""now, the numpy_stock_return is 496 * 754, each row is a stock's return in the past two years"""
"""define a function e(x,y) where x, y are stock_index"""
def e(x, y):
    log_return_x_array = np.log(1 + numpy_stock_return[x])
    log_return_y_array = np.log(1 + numpy_stock_return[y])
    log_return_x = np.sum(log_return_x_array)
    log_return_y = np.sum(log_return_y_array)
    return_x = np.exp(log_return_x)
    return_y = np.exp(log_return_y)

    return np.sqrt((return_x - return_y) ** 2)

"""define a function t(x,y) where x, y are stock names that return the modified correlation between x and y"""
def t(x, y):
    numerator = np.dot(numpy_stock_return[x], numpy_stock_return[y])
    var_x = np.dot(numpy_stock_return[x], numpy_stock_return[x])
    var_y = np.dot(numpy_stock_return[y], numpy_stock_return[y])

    return numerator / np.sqrt(var_x * var_y)


"""define a function sim(x,y) to reflect the similarities between x and y. w is a hyperparameter"""
def sim(x, y, w):
    a = w / (1 + e(x,y))
    b = (1 - w) * t(x,y)
    return a + b



In [3]:
"""implement a simulated annealing algorithm to find the best clustering"""
"""first, we need to define a function to calculate the cost of a clustering"""
w = 0.5 #you can change the value of w, which is usually between 0.4 and 0.6
k = 0.0001

def number_of_clusters(clustering):
    cluster_numbers = 0
    visited_cluster = []
    for element in clustering:
        if element not in visited_cluster:
            visited_cluster.append(element)
            cluster_numbers = cluster_numbers + 1
    return cluster_numbers

def cost(clustering): 
    """clustering is a list of the same length of stocks, and each element is the cluster that the stock belongs to"""
    cost = 0
    cluster_numbers = number_of_clusters(clustering)

    for i in range(0, len(clustering)):
        for j in range(0, len(clustering)):
            if clustering[i] == clustering[j]:
                cost = cost + sim(i, j, w)
    
    new_cost = cluster_numbers - (k / (cluster_numbers - 1)) * cost
    return new_cost 


"""define a function to generate a random clustering"""
def random_clustering():
    n = 25 # n is the number of clusters, you can change the value of n
    clustering = np.zeros(len(stock_names))
    for i in range(0, len(stock_names)):
        clustering[i] = np.random.randint(0, n) 
    return clustering

"""define a function to generate a neighbor of a clustering"""
def neighbor(clustering):
    n = 25 # n is the number of clusters, you can change the value of n, but should be the same as n in the previous function
    neighbor = clustering.copy()
    index = np.random.randint(0, len(stock_names))
    neighbor[index] = np.random.randint(0, n)
    return neighbor, index

"""define a function to calculate the probability of accepting a neighbor"""
def probability(cost, neighbor_cost, temperature):
    if neighbor_cost < cost:
        return 1
    else:
        return np.exp((cost - neighbor_cost) / temperature)

"""define a function to update the temperature"""
def update_temperature(temperature):
    alpha = 0.99 # alpha is a hyperparameter, you can change the value of alpha
    return temperature * alpha

"""define a function to run the simulated annealing algorithm"""
def simulated_annealing():
    temperature = 100 # temperature is a hyperparameter, you can change the value of temperature
    clustering = random_clustering()
    cost_clustering = cost(clustering)
    T_final = 0.1 # T_final is a hyperparameter, you can change the value of T_final
    while temperature > T_final:
        neighbor_clustering, index = neighbor(clustering)
        change = 0
        cluster_numbers = number_of_clusters(clustering)
        for i in range(0, len(stock_names)):
            if i != index and clustering[i] == clustering[index]:
                change = change + sim(i, index, w) * (k / (cluster_numbers - 1))
            if i != index and neighbor_clustering[i] == neighbor_clustering[index]:
                change = change - sim(i, index, w) * (k / (cluster_numbers - 1))
        neighbor_cost_clustering = cost_clustering + change
        if probability(cost_clustering, neighbor_cost_clustering, temperature) > np.random.random():
            clustering = neighbor_clustering
            cost_clustering = neighbor_cost_clustering
        temperature = update_temperature(temperature)
    return clustering


In [4]:
"""run the simulated annealing algorithm 100 times and find the best clustering"""
best_clustering = random_clustering()
best_cost = cost(best_clustering)
for i in range(0, 100):
    clustering = simulated_annealing()
    if cost(clustering) > best_cost:
        best_clustering = clustering
        best_cost = cost(clustering)

"""print the best clustering"""
print(best_clustering)

"""print the number of clusters"""
print(number_of_clusters(best_clustering))

[24.  2.  4.  2.  5.  1. 20.  8.  9.  6.  9.  0. 16. 16. 10. 14. 13.  7.
  0. 21.  6.  4. 16. 20.  8. 22.  0.  3. 12. 14.  9.  0. 10.  2.  3.  7.
  2. 10. 18. 13. 19.  2.  3.  2.  9. 23.  9. 11.  1. 23.  4. 21. 10.  9.
 10.  5. 23.  8.  9. 17. 24. 22. 23.  1. 19.  3. 22.  9. 10. 24.  1.  2.
 22.  8. 15. 13. 22. 14. 11. 12.  0.  3.  6.  8.  5. 10.  7.  4.  2.  9.
 15.  6.  9. 19.  6.  4. 10. 13.  1. 15. 11. 12.  9. 17. 19. 19. 11. 10.
 15.  8.  3.  6. 20.  3. 18. 22.  7. 12. 17. 24. 14.  0. 18. 17. 20. 12.
 21. 20.  8.  3.  7. 11. 24. 14. 18.  9.  0. 21. 18.  5. 16. 23.  9. 14.
  6.  0.  3.  8. 22. 13. 13.  5. 15. 11. 12. 13.  3. 17. 21.  0.  5.  2.
  7. 19. 23.  1.  5. 12. 12. 21.  3.  3. 12. 21. 23. 13.  1.  7. 13.  8.
  8.  0. 16. 20.  8.  1. 12.  0. 15. 19. 14. 16. 11. 10.  5.  7.  3. 21.
 24. 14. 10.  6. 19. 11.  2. 21. 15. 22. 14. 13. 13. 22. 22. 17. 16.  7.
  5. 24. 12.  4. 14. 13. 16.  4. 12.  1. 15.  6.  6.  5. 14. 17.  5. 15.
 19. 15.  5.  9. 16. 17.  0. 18. 24. 17.  6.  2. 22

In [5]:

from collections import defaultdict
table = defaultdict(list)
for i in range(0, len(best_clustering)):
    cluster = best_clustering[i]
    table[cluster].append(stock_names[i])

for key, value in table.items():
    print(key, value)


24.0 ['A', 'BIIB', 'BSX', 'CVS', 'DISH', 'GLW', 'HOLX', 'ISRG', 'KEY', 'MAA', 'MPC', 'PFE', 'PKG', 'POOL', 'SWKS', 'SYK', 'VRSK', 'WELL', 'WHR', 'XYL']
2.0 ['AAL', 'AAPL', 'AMT', 'ANSS', 'APH', 'ARE', 'BXP', 'CHRW', 'ES', 'GPN', 'IVZ', 'JNPR', 'KMI', 'LKQ', 'MET', 'MLM', 'PCG', 'RMD', 'TSN', 'WMB', 'XRAY']
4.0 ['AAP', 'AKAM', 'AXP', 'CHD', 'CMCSA', 'HPE', 'HST', 'JBHT', 'NRG', 'NUE', 'PCAR', 'PEG', 'PKI', 'PM', 'ROL', 'SBAC', 'SPG', 'UNH', 'VFC', 'VZ']
5.0 ['ABBV', 'BAX', 'CE', 'DTE', 'EL', 'EQT', 'EVRG', 'GEN', 'HLT', 'IDXX', 'ILMN', 'INVH', 'MAS', 'MO', 'MTB', 'PAYC', 'WST']
1.0 ['ABC', 'AVY', 'BKNG', 'BWA', 'CMI', 'ETSY', 'FDS', 'FOX', 'HUM', 'LEN', 'MAR', 'MDT', 'MTCH', 'NEE', 'NWSA', 'PEAK', 'PSX', 'REGN', 'RF', 'RSG', 'SJM', 'TRMB', 'VLO', 'WYNN', 'YUM']
20.0 ['ABT', 'ALGN', 'CSGP', 'DD', 'DG', 'FLT', 'LLY', 'MOH', 'NOC', 'O', 'PTC', 'SHW', 'UNP', 'VTR', 'WY']
8.0 ['ACGL', 'ALK', 'BBY', 'CAG', 'CDW', 'CRL', 'DGX', 'ECL', 'FFIV', 'FIS', 'FMC', 'KDP', 'MTD', 'NCLH', 'ODFL', 'PARA',

In [8]:
"""randomly picking a number from list"""
import random
def random_pick(some_list, probabilities):
    x = random.uniform(0, 1)
    cumulative_probability = 0.0
    for item, item_probability in zip(some_list, probabilities):
        cumulative_probability += item_probability
        if x < cumulative_probability: break
    return item

ticker = []
for key in table.keys():
    tick = random_pick(table[key], [1/len(table[key])]*len(table[key]))
    ticker.append(tick)

print(ticker)



['A', 'JNPR', 'JBHT', 'PAYC', 'ABC', 'CSGP', 'CAG', 'IP', 'XOM', 'FISV', 'STZ', 'CF', 'AMD', 'LOW', 'DHR', 'AIZ', 'HD', 'EOG', 'F', 'CTAS', 'VRSN', 'DVA', 'TGT', 'URI', 'VICI']


In [9]:
"""use yfinance to get 1 minute data for stock tickers for the past 30 days"""
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
tickers = ticker
start = datetime.datetime(2020, 1, 1)
end = datetime.datetime(2022, 12, 30)
data = yf.download(tickers, start, end)

[*********************100%***********************]  25 of 25 completed


In [10]:
"""let the first row be the column names"""
data = data.stack(level=0).rename_axis(['Date', 'Ticker']).reset_index(level=1)


In [11]:
"""only keep rows with column names in the ticker list is close"""
data = data[data['Ticker'] == 'Close']
"""store the data in csv file"""
data.to_csv('2_min_interval.csv')


