<a href="https://colab.research.google.com/github/biomathematicus/2025SpringMAT5153/blob/main/Assignment_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import numpy as np
import pandas as pd
# Ticker symbol -> company name for the stocks studied in this notebook.
symbol_dict = {
    "TOT": "Total",
    "XOM": "Exxon",
    "CVX": "Chevron",
    "COP": "ConocoPhillips",
    "VLO": "Valero Energy",
}
# Split the dictionary into two parallel arrays ordered by ticker symbol:
# sorting the (symbol, name) pairs and transposing the resulting 2-column
# array yields one row of symbols and one row of names.
ordered_pairs = sorted(symbol_dict.items())
symbols, names = np.array(ordered_pairs).T
# Container for the per-symbol quote tables (filled by the optional fetch code).
quotes = []
#
################### To fetch individual CSV files from the internet
# for symbol in symbols:
#     print("Fetching quote history for %r" % symbol, file=sys.stderr)
#     url = (
#         "https://raw.githubusercontent.com/scikit-learn/examples-data/"
#         "master/financial-data/{}.csv"
#     )
#     quotes.append(pd.read_csv(url.format(symbol)))
#
#################### To fetch individual CSV files from local folder
# for symbol in symbols:
#     print("Fetching quote history for %r" % symbol, file=sys.stderr)
#     direct = "examples-data-master/financial-data/{}.csv"
#     quotes.append(pd.read_csv(direct.format(symbol)))
####################
# close_prices = np.vstack([q["close"] for q in quotes])
# open_prices = np.vstack([q["open"] for q in quotes])
# # The daily variations of the quotes are what carry the most information
# variation = close_prices - open_prices
#################### To merge separate CSV files into 1 and write to disk
# data1 = pd.DataFrame(variation[0],columns=[symbols[0]])
# merged_df = data1
# for k in range(1,len(symbols)):
#     ToBe_merged = pd.DataFrame(variation[k],columns=[symbols[k]])
#     merged_df = merged_df.merge(ToBe_merged,left_index=True,right_index=True)
# merged_df.to_csv('StockVar_Data.csv',index=False)
####################

In [None]:
# The CSV file has stock symbols as its column headers, which is natural given
# how the original separate CSV files were merged into one. Pandas can
# easily extract rows or columns; mathematically it is simply a transpose.
# HOWEVER, the scikit-learn routines used for clustering later
# REQUIRE a specific form or they will "HANG": they expect each ROW
# to correspond to one stock symbol.
# NOTE(review): the later cells index `names` with the cluster labels, which
# assumes the CSV columns are in the same order as `symbols` - confirm.
#
# Method 1: read in the CSV and take the transpose of the numeric array.
df = pd.read_csv('StockVar_Data.csv')
variation = np.array(df).T
variation.shape
###############
# Method 2: read in the CSV as above, use Pandas to transpose the dataframe,
# then write it out as a CSV file. This causes its own set of issues: column 0
# now holds the stock symbols and you must drop it. Here is the code.
#
# df_transpose = df.transpose()
# df_transpose.to_csv('StockVar_Data_Adj.csv')
#
# df = pd.read_csv('StockVar_Data_Adj.csv')
# # Convert the dataframe into a numpy array.
# df_filtered = df.iloc[:,1:]
# variation = np.array(df_filtered)
#

In [None]:
from sklearn import covariance

# Grid of 10 log-spaced regularisation strengths (10**-1.5 ... 10**1) handed
# to the cross-validated graphical lasso estimator.
alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# Standardize the time series: working with correlations rather than
# covariances is more efficient for structure recovery.
# X is a transposed copy of `variation`, so the division below does not
# modify `variation` itself.
X = variation.copy().T
column_std = X.std(axis=0)
X /= column_std
edge_model.fit(X)

In [None]:
from sklearn import cluster

# Group the stocks by running affinity propagation on the covariance matrix
# estimated above; the first return value is discarded.
_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
# Largest label value; the loop below walks labels 0..n_labels.
n_labels = labels.max()

for cluster_id in range(n_labels + 1):
    members = names[labels == cluster_id]
    print(f"Cluster {cluster_id + 1}: {', '.join(members)}")

In [None]:
# Show the raw label array returned by affinity propagation (one entry per
# element of `names`, since `names` is indexed with it above).
print(labels)

In [None]:
from sklearn import manifold

# Find a 2-D layout for the nodes (one node per column of X) with locally
# linear embedding.
# LLE rejects n_neighbors >= number of samples, so a fixed value of 6 raises
# ValueError for small symbol sets such as the five defined in this notebook.
# Keep 6 as the preferred value but clamp it to what the data allows.
n_nodes = X.shape[1]
n_neighbors = max(1, min(6, n_nodes - 1))

node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver="dense", n_neighbors=n_neighbors
)

# Fit on X.T (one row per node) and transpose the result so that
# embedding[0] holds the first coordinate of every node and embedding[1]
# the second.
embedding = node_position_model.fit_transform(X.T).T

In [None]:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

# One borderless figure whose axes fill the whole canvas.
plt.figure(1, facecolor="w", figsize=(10, 8))
plt.clf()
ax = plt.axes([0.0, 0.0, 1.0, 1.0])
plt.axis("off")

# Plot the graph of partial correlations.
# The precision matrix is rescaled by 1/sqrt(diagonal) along both rows and
# columns; only the strict upper triangle is kept so each pair appears once,
# and links with absolute value <= 0.02 are dropped.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

# Plot the nodes using the coordinates of our embedding
plt.scatter(
    embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [
    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
]
values = np.abs(partial_correlations[non_zero])
# With only a handful of stocks every link may fall below the threshold;
# values.max() raises on an empty array, so only build the edge collection
# when there is at least one edge to draw.
if values.size:
    lc = LineCollection(
        segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
    )
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

# n_labels is 0 when all stocks land in a single cluster; dividing the label
# by it would give 0/0 (NaN) for the box colour, so never scale by less than 1.
color_scale = float(max(n_labels, 1))

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
    # Offsets from this node to every node; the node's own entry is set to 1
    # so it is not picked as its own nearest neighbour by the argmin below.
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Horizontal offset to the node closest in y, and vertical offset to the
    # node closest in x; their signs decide on which side the text is placed.
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = "left"
        x = x + 0.002
    else:
        horizontalalignment = "right"
        x = x - 0.002
    if this_dy > 0:
        verticalalignment = "bottom"
        y = y + 0.002
    else:
        verticalalignment = "top"
        y = y - 0.002
    plt.text(
        x,
        y,
        name,
        size=10,
        horizontalalignment=horizontalalignment,
        verticalalignment=verticalalignment,
        bbox=dict(
            facecolor="w",
            edgecolor=plt.cm.nipy_spectral(label / color_scale),
            alpha=0.6,
        ),
    )

# Pad the axis limits by a fraction of the data range so labels near the
# border stay inside the figure.
plt.xlim(
    embedding[0].min() - 0.15 * np.ptp(embedding[0]),
    embedding[0].max() + 0.10 * np.ptp(embedding[0]),
)
plt.ylim(
    embedding[1].min() - 0.03 * np.ptp(embedding[1]),
    embedding[1].max() + 0.03 * np.ptp(embedding[1]),
)

plt.show()

In [None]:
# Pull the columns for the companies of interest out of the original dataframe.
my_names = ['Total', 'Exxon', 'Chevron', 'ConocoPhillips', 'Valero Energy']
# for key,value in symbol_dict.items():
#     print(f"{key}: {value}")
# `symbols` and `names` are parallel arrays, so walk them in lockstep and keep
# the ticker of every company listed in `my_names`.
my_symbols = [
    ticker for ticker, company in zip(symbols, names) if company in my_names
]
print(my_symbols)

dfmy = df[my_symbols]
dfmy.head()

In [None]:
# Quick overview: draw every column of `dfmy` via the DataFrame's own plot method.
dfmy.plot()

In [None]:

# For more refinement, call matplotlib directly instead of DataFrame.plot().
import matplotlib.pyplot as plt

# Plot the TOT column on its own.
f = dfmy.loc[:, 'TOT']
plt.plot(f)
# Now check out the plot examples on the https://matplotlib.org/ webpages.
# Being able to present strong visualizations of your results at
# conferences and in articles/reports is essential.

In [None]:
import matplotlib.pyplot as plt

# Plot the XOM column on its own.
f = dfmy.loc[:, 'XOM']
plt.plot(f)

In [None]:
import matplotlib.pyplot as plt

# Plot the CVX column on its own.
f = dfmy.loc[:, 'CVX']
plt.plot(f)

In [None]:
import matplotlib.pyplot as plt

# Plot the COP column on its own.
f = dfmy.loc[:, 'COP']
plt.plot(f)

In [None]:
# Plot the VLO column on its own.
# The module name must be spelled `matplotlib`; the misspelled import raised
# ModuleNotFoundError before anything was drawn.
from matplotlib import pyplot as plt
f = dfmy['VLO']
plt.plot(f)