# Analysis Notebook - 2
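Computes evaluation metrics on the results of a recorded run: per-population feature variance, categorical diversity, and mean pairwise (phylogenetic) distance in the evolutionary graph.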

In [79]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
from PIL import Image
import imageio
import pickle
import pandas as pd
import os
import networkx as nx
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## Load data

In [80]:
# Timestamp identifying the recorded run to analyse; it is interpolated into
# every "output/<timestamp>/..." path that this notebook reads or writes.
record_timestamp = "20241104_211417"

record_timestamp

'20241104_211417'

In [81]:
# Load the pickled feature table and keep its raw values as an ndarray.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("gaussian_features_norm.pkl", "rb") as features_file:
    df_features = pickle.load(features_file)

# Plain array of the feature values, used for clustering / t-SNE further down.
nd_features = df_features.values

nd_features.shape

(10000, 6)

In [82]:
# Load the per-individual population records of the selected run.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(
    f"output/{record_timestamp}/{record_timestamp}_analysis_population.pkl",
    "rb",
) as population_file:
    df_iterations = pickle.load(population_file)

df_iterations

Unnamed: 0,p1,p2,p3,p4,p5,p6,score,pop,sample_id,id,iteration,path_id,projection_method,corpus_method
0,0.143024,0.519569,0.263512,0.474443,0.220302,0.007967,-1,2,188,200,0,0,tsne,read
1,0.030941,0.598050,0.097447,0.397638,0.158384,0.004574,-1,2,182,201,0,0,tsne,read
2,0.058838,0.698988,0.179665,0.461806,0.290309,0.010648,-1,2,212,202,0,0,tsne,read
3,0.233742,0.554347,0.162495,0.520112,0.238920,0.004570,-1,2,214,203,0,0,tsne,read
4,0.233742,0.554347,0.162495,0.520112,0.238920,0.004570,-1,2,214,204,0,0,tsne,read
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10789,0.076811,0.520883,0.120557,0.434862,0.145791,0.003104,-1,51,195,10989,49,0,tsne,read
10790,0.073570,0.635564,0.371130,0.598242,0.234610,0.033081,-1,51,181,10990,49,0,tsne,read
10791,0.172462,0.497888,0.308367,0.461209,0.206948,0.013699,-1,51,130,10991,49,0,tsne,read
10792,0.107398,0.493049,0.356465,0.604528,0.192708,0.010578,-1,51,208,10992,49,0,tsne,read


## Population Diversity

In [83]:
# Population diversity: for every population, take the variance of each of
# the six feature columns (pandas default, i.e. sample variance with ddof=1)
# and average those six variances into a single number.
# NOTE(review): despite its name, `all_pops_cat` holds variances here; the
# same name is reused for category diversity in a later cell.
feature_columns = ["p1", "p2", "p3", "p4", "p5", "p6"]
pop_ids = df_iterations["pop"]

all_pops_cat = [
    np.mean(df_iterations.loc[pop_ids == pop, feature_columns].var(axis=0))
    for pop in pop_ids.unique()
]

np.array(all_pops_cat).shape

(50,)

In [84]:
# Plot the mean feature variance per population and save it next to the
# other outputs of the run. The x axis is the position in `all_pops_cat`,
# not the population id itself.
ax = plt.gca()
ax.plot(all_pops_cat)
ax.set_xlabel("Population")
ax.set_ylabel("Variance")
ax.set_title("Variance of each population")
ax.set_ylim(0, 0.2)
plt.savefig(f"output/{record_timestamp}/population_diversity.png")
plt.show()

## Categorical Diversity

In [85]:
# Load the sample_id -> filename table and strip the file extension so the
# names can later be matched against the metadata `fname` column.
with open("filenames.pkl", "rb") as filenames_file:
    df_filenames = pickle.load(filenames_file)

df_filenames["filename"] = df_filenames["filename"].map(
    lambda name: os.path.splitext(name)[0]
)

df_filenames.head()

Unnamed: 0,filename,sample_id
0,349217,0
1,399910,1
2,355149,2
3,35437,3
4,235105,4


In [86]:
# Load the metadata file

# The CSV location is machine-specific. It can be overridden with the
# FSD50K_COLLECTION_CSV environment variable; the original absolute path is
# kept as the default so existing setups behave exactly as before.
FSD50K_COLLECTION_CSV = os.environ.get(
    "FSD50K_COLLECTION_CSV",
    r"D:\datasets\FSD50K\FSD50K.metadata\collection\collection_dev.csv",
)

df_metacoll = pd.read_csv(FSD50K_COLLECTION_CSV)
# `mids` is stored as a comma-separated string; turn it into a list per row.
df_metacoll.mids = df_metacoll.mids.str.split(",")
# Cast to str so `fname` can be joined against the string filenames.
df_metacoll.fname = df_metacoll.fname.astype(str)

df_metacoll.head()

Unnamed: 0,fname,labels,mids
0,64760,Electric_guitar,[/m/02sgy]
1,16399,Electric_guitar,[/m/02sgy]
2,16401,Electric_guitar,[/m/02sgy]
3,16402,Electric_guitar,[/m/02sgy]
4,16404,Electric_guitar,[/m/02sgy]


In [87]:
# Attach the filename of each individual's sample (left join keeps every
# individual, matched or not).
df_iter_w_filename = pd.merge(
    df_iterations, df_filenames, how="left", on="sample_id"
)
# NOTE: astype(str) turns a missing filename into the literal string "nan".
df_iter_w_filename["filename"] = df_iter_w_filename["filename"].astype(str)

df_iter_w_filename.head()

Unnamed: 0,p1,p2,p3,p4,p5,p6,score,pop,sample_id,id,iteration,path_id,projection_method,corpus_method,filename
0,0.143024,0.519569,0.263512,0.474443,0.220302,0.007967,-1,2,188,200,0,0,tsne,read,372979
1,0.030941,0.59805,0.097447,0.397638,0.158384,0.004574,-1,2,182,201,0,0,tsne,read,176466
2,0.058838,0.698988,0.179665,0.461806,0.290309,0.010648,-1,2,212,202,0,0,tsne,read,64298
3,0.233742,0.554347,0.162495,0.520112,0.23892,0.00457,-1,2,214,203,0,0,tsne,read,316689
4,0.233742,0.554347,0.162495,0.520112,0.23892,0.00457,-1,2,214,204,0,0,tsne,read,316689


In [88]:
# Look up the metadata row for every individual via its filename
# (left join: individuals without metadata keep NaN in the metadata columns).
df_ontology_lookup = pd.merge(
    df_iter_w_filename,
    df_metacoll,
    how="left",
    left_on="filename",
    right_on="fname",
)
# Keep only the first entry of each `mids` list as the individual's category.
df_ontology_lookup["mids_first"] = df_ontology_lookup["mids"].str[0]

df_ontology_lookup.head()

Unnamed: 0,p1,p2,p3,p4,p5,p6,score,pop,sample_id,id,iteration,path_id,projection_method,corpus_method,filename,fname,labels,mids,mids_first
0,0.143024,0.519569,0.263512,0.474443,0.220302,0.007967,-1,2,188,200,0,0,tsne,read,372979,372979,Clarinet,[/m/01wy6],/m/01wy6
1,0.030941,0.59805,0.097447,0.397638,0.158384,0.004574,-1,2,182,201,0,0,tsne,read,176466,176466,Piano,[/m/05r5c],/m/05r5c
2,0.058838,0.698988,0.179665,0.461806,0.290309,0.010648,-1,2,212,202,0,0,tsne,read,64298,64298,Keyboard_(musical),[/m/05148p4],/m/05148p4
3,0.233742,0.554347,0.162495,0.520112,0.23892,0.00457,-1,2,214,203,0,0,tsne,read,316689,316689,Train_horn,[/m/0284vy3],/m/0284vy3
4,0.233742,0.554347,0.162495,0.520112,0.23892,0.00457,-1,2,214,204,0,0,tsne,read,316689,316689,Train_horn,[/m/0284vy3],/m/0284vy3


In [89]:
# Categorical diversity: for every population, the number of distinct first
# category ids divided by the number of individuals in that population.
all_pops_cat = [
    len(set(cats)) / len(cats)
    for cats in (
        df_ontology_lookup.loc[df_ontology_lookup["pop"] == pop, "mids_first"]
        for pop in df_ontology_lookup["pop"].unique()
    )
]

np.array(all_pops_cat).shape

(50,)

In [90]:
# Plot the category diversity ratio per population and save the figure.

plt.clf()  # start from an empty current figure
ax = plt.gca()
ax.plot(all_pops_cat)
ax.set_xlabel("Population")
ax.set_ylabel("Mean Category Diversity")
ax.set_title("Mean category diversity for each population")
ax.set_ylim(0, 1)
plt.savefig(f"output/{record_timestamp}/category_diversity.png")
plt.show()

## Phylogenetic
Graph operations

In [91]:
# Load the pickled evolutionary graph of the selected run.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.

with open(
    f"output/{record_timestamp}/{record_timestamp}_analysis_evo_graph.gpickle",
    "rb",
) as graph_file:
    G = pickle.load(graph_file)

G.number_of_nodes()

10994

In [92]:
# Get a copy of the graph so the edits below leave the loaded graph G untouched

G_plot = G.copy()


G_plot.number_of_nodes()

10994

In [93]:
# make bidirectional
# by adding the reverse edges
#
# The reversed edge list is built completely before any edge is added:
# G_plot.edges() is a live view, and a graph should not be modified while one
# of its views is being iterated.
reverse_edges = [(v, u) for u, v in G_plot.edges()]
G_plot.add_edges_from(reverse_edges)

G_plot.number_of_edges()

43052

## Visualize the Graph

In [94]:
# Interactive visualization: from this cell on, matplotlib uses the Qt
# backend instead of the notebook's default one.
%matplotlib qt

In [95]:
# Copy the bidirectional graph; the copy is what gets drawn below, and it can
# optionally be thinned out for testing.

G_filtered = G_plot.copy()

# Optional filter (disabled): uncomment to drop every node whose "pop"
# attribute is below 147 before drawing.
# G_filtered.remove_nodes_from(
#     [node[0] for node in G_plot.nodes(data=True) if node[1]["pop"] < 147]
# )

In [76]:
# Draw the graph multipartite

# Multipartite layout: nodes are grouped into layers by their "pop" attribute.
pos = nx.multipartite_layout(G_filtered, subset_key="pop", align="horizontal")

# draw the graph
plt.clf()
nx.draw(
    G_filtered,
    pos,
    # labels=nx.get_node_attributes(G, "label"),
    # with_labels=True,
    node_size=75,
    font_size=10,
    font_weight="bold",
    arrowsize=8,
    width=1,
    alpha=0.75,
)

# Add the labels by hand (instead of with_labels=True) so they can be rotated.
# NOTE: this creates one text artist per node and is slow on the full graph.
for node in G_filtered.nodes():
    x, y = pos[node]
    label = node  # G.nodes[node]["sample_id"]  # [:20]
    plt.text(x, y, label, fontsize=12, rotation=45, ha="left")

plt.title("Evolutionary Graph")
# Save BEFORE showing: once the shown figure has been closed, savefig would
# write an empty image. This also matches the order used by the other plots.
plt.savefig(f"output/{record_timestamp}/evolutionary_graph.png")
plt.show()

KeyboardInterrupt: 

# Pairwise Distances

In [96]:
# Mean pairwise distance (MPD) per population.
#
# For every population, the members of that population are treated as leaves
# and the shortest-path length between every unordered pair of leaves is
# measured in the sub-graph made of all nodes whose "pop" is <= the current
# one. The accumulated distances are divided by the number of leaf pairs.
all_mpd_pops = []

# Leaf pairs without any connecting path, collected over ALL populations as
# (pop, node_a, node_b) so the summary at the end of the cell covers the whole
# run rather than only the last population.
no_path = []

# Group the node ids by population in a single pass over the graph
# (insertion order keeps the graph's node order inside each population).
nodes_by_pop = {}
for node_id, attrs in G_plot.nodes(data=True):
    nodes_by_pop.setdefault(attrs["pop"], []).append(node_id)

all_pops_unique = pd.Series(list(nodes_by_pop)).sort_values().unique()

# Populations are visited in ascending order, so the "pop <= current" node set
# can simply grow from one iteration to the next.
nodes_upto_pop = []
for pop in all_pops_unique:
    leaf_ids = nodes_by_pop[pop]
    n_leaves = len(leaf_ids)
    nodes_upto_pop.extend(leaf_ids)
    G_upto_pop = G_plot.subgraph(nodes_upto_pop)

    # One breadth-first search per leaf yields its distance to every reachable
    # node, which replaces one shortest-path search per leaf pair.
    pairwise_distances = {}
    for i, source in enumerate(leaf_ids[:-1]):
        reachable = nx.single_source_shortest_path_length(G_upto_pop, source)
        for j in range(i + 1, n_leaves):
            dist = reachable.get(leaf_ids[j])
            if dist is None:
                no_path.append((pop, source, leaf_ids[j]))
            elif dist > 2:
                # NOTE(review): pairs at distance <= 2 add nothing to the sum
                # but still count in the denominator, and 1 is added to the
                # others; kept as is. Confirm against the intended MPD
                # definition.
                pairwise_distances[(i, j)] = dist + 1

    pairwise_distances_values = np.array(list(pairwise_distances.values()))

    n_ordered_pairs = n_leaves * (n_leaves - 1)
    if n_ordered_pairs:
        mpd_pop = pairwise_distances_values.sum() * 2 / n_ordered_pairs
    else:
        # A population with fewer than two members has no pairs at all.
        mpd_pop = np.float64("nan")
    all_mpd_pops.append(mpd_pop)
    print(
        f"Population {str(pop).zfill(3)} Mean Pairwise Distances: {mpd_pop.round(2)} ",
        end="\r",
    )

np.array(all_mpd_pops).shape, np.array(no_path).shape

Population 051 Mean Pairwise Distances: 6.93  

((51,), (0,))

In [97]:
# Plot the mean pairwise distance of every population and save the figure.

plt.clf()  # start from an empty current figure
ax = plt.gca()
ax.plot(all_mpd_pops)
ax.set_xlabel("Population")
ax.set_ylabel("Mean Pairwise Distance")
ax.set_title("Mean Pairwise Distances for Each Population")
plt.savefig(f"output/{record_timestamp}/mean_pairwise_distance.png")
plt.show()

# Other

In [396]:
#
# #
# #
# #
# #
#

In [None]:
######## comment out

# Feature columns (p1..p6) of the individuals belonging to the first
# iteration only (the slice [:1] limits the loop to one pass).
feature_columns = ["p1", "p2", "p3", "p4", "p5", "p6"]
for iteration in df_iterations.iteration.unique()[:1]:
    in_iteration = df_iterations.iteration == iteration
    df_epoch = df_iterations.loc[in_iteration, feature_columns]

df_epoch

In [None]:
# Scaling is currently disabled; the features are clustered as loaded.
# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(nd_features)

# K-means with 5 clusters and a fixed seed so the labels are reproducible.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(nd_features)
labels = kmeans.labels_

# Store the cluster label next to the features and persist the table.
# NOTE: this adds a "cluster" column to df_features in place.
df_features["cluster"] = labels
with open("features_clustered_TEST.pkl", "wb") as clustered_file:
    pickle.dump(df_features, clustered_file)
###

# print(df_features.head())
# print(df_features.describe())
print(df_features["cluster"].value_counts())

In [None]:
#### comment out

# For every individual of df_epoch, find the row index of the first feature
# vector in nd_features that is exactly equal to it; individuals without an
# exact match are skipped.
visited_indexes = []
for individual in df_epoch.values:
    first_match = next(
        (
            row
            for row, candidate in enumerate(nd_features)
            if np.array_equal(individual, candidate)
        ),
        None,
    )
    if first_match is not None:
        visited_indexes.append(first_match)
visited_indexes = np.array(visited_indexes)

visited_indexes

In [9]:
# VISUALIZE THE CLUSTERS (2D) with t-SNE
#
# Projects the feature matrix to 2D once, then writes one PNG per iteration
# (first 10 iterations) with the individuals of that iteration highlighted.
# Requires `labels` from the K-means cell.
tsne = TSNE(n_components=2, random_state=0)
features_data_tsne = tsne.fit_transform(nd_features)
df_tsne = pd.DataFrame(features_data_tsne, columns=["x", "y"])
# The cluster labels do not depend on the iteration, so attach them once.
df_tsne["cluster"] = labels

output_dir = f"output/{record_timestamp}/clusters"
os.makedirs(output_dir, exist_ok=True)

figure_filenames = []
for i, iteration in enumerate(df_iterations.iteration.unique()[:10]):
    df_epoch = df_iterations.loc[
        df_iterations.iteration == iteration,
        ["p1", "p2", "p3", "p4", "p5", "p6"],
    ]

    # Find visited indexes: for each individual, the first row of nd_features
    # that equals it exactly (vectorised row comparison); individuals without
    # an exact match are skipped.
    visited_indexes = []
    for individual in df_epoch.values:
        matches = np.flatnonzero((nd_features == individual).all(axis=1))
        if matches.size:
            visited_indexes.append(matches[0])
    # An integer dtype keeps the array usable with .iloc even when it is empty.
    visited_indexes = np.array(visited_indexes, dtype=np.intp)

    fig, ax = plt.subplots(figsize=(10, 10))
    # All samples, coloured by cluster.
    ax.scatter(
        df_tsne["x"], df_tsne["y"], s=1, c=df_tsne["cluster"], cmap="viridis"
    )
    # Samples visited in this iteration, drawn on top in red.
    ax.scatter(
        df_tsne["x"].iloc[visited_indexes],
        df_tsne["y"].iloc[visited_indexes],
        c="red",
        s=3,
        marker="^",
    )

    output_file = os.path.join(output_dir, f"generation_{i}.png")
    fig.savefig(output_file)
    # Close the figure so the loop does not pile up open figures.
    plt.close(fig)
    figure_filenames.append(output_file)

"end"

In [8]:
# make a gif

# Load each saved frame into memory. The file is opened in a `with` block and
# copied so its handle is released right away; otherwise the handles stay
# open, which also prevents the PNGs from being deleted on Windows.
images = []
for filename in figure_filenames:
    with Image.open(filename) as frame:
        images.append(frame.copy())

# Save the images as a GIF
# NOTE(review): the unit of `duration` (seconds vs. milliseconds) depends on
# the installed imageio version; confirm the resulting frame rate.
output_filename = os.path.join(output_dir, "evolution_anim.gif")
imageio.mimsave(output_filename, images, duration=0.1)

# Remove the individual figure files
# for filename in figure_filenames:
#     os.remove(filename)

In [13]:
# VISUALIZE THE CLUSTERS (3D) with t-SNE
# Requires `labels` from the K-means cell.
tsne = TSNE(n_components=3, random_state=0)
features_data_tsne = tsne.fit_transform(nd_features)

df_tsne = pd.DataFrame(features_data_tsne, columns=["x", "y", "z"])
df_tsne["cluster"] = labels

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    df_tsne["x"],
    df_tsne["y"],
    df_tsne["z"],
    c=df_tsne["cluster"],
    cmap="viridis",
)
# Save through the figure object BEFORE showing it: once the shown figure has
# been closed, a later savefig would write an empty image.
fig.savefig("clusters_3d_TEST.png")
plt.show()