In [9]:
import pandas as pd
from umap import umap_
import plotly.express as px
from pathlib import Path
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown

In [None]:



# Folder holding the CSVs: next to this file when run as a script, otherwise
# the current working directory (e.g. a notebook kernel, where `__file__` is
# normally undefined).
HERE = Path(__file__).parent.resolve() if "__file__" in globals() else Path().resolve()

# Both files are read without a header row; the first column becomes the index.
# `.assign` attaches the group label without mutating the freshly read frame.
df_turkic = pd.read_csv(HERE / "turkicDefAncientScaledG25.csv",
                        index_col=0, header=None).assign(label='Turkic')
df_all    = pd.read_csv(HERE / "allAncientScaledG25.csv",
                        index_col=0, header=None).assign(label='Other')

df = pd.concat([df_turkic, df_all], ignore_index=True)

# De-duplicate on the coordinate columns only. Including `label` in the
# comparison (the previous behaviour) meant a sample present in both files was
# never recognised as a duplicate, because its two rows differ in label; it
# then entered X twice, once as 'Turkic' and once as 'Other'.
# keep='first' retains the 'Turkic' row because df_turkic is concatenated first.
feature_cols = [col for col in df.columns if col != 'label']
df = (df.drop_duplicates(subset=feature_cols, keep='first')
        .reset_index(drop=True))

# Feature matrix and the parallel label vector used by the plotting cells.
X = df[feature_cols].values
y = df['label'].values



In [None]:
def run_umap(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=0):
    """Embed the module-level ``X`` in 2-D with UMAP and show a labelled scatter.

    Parameters
    ----------
    n_neighbors : int
        Forwarded to ``umap_.UMAP(n_neighbors=...)``.
    min_dist : float
        Forwarded to ``umap_.UMAP(min_dist=...)``.
    metric : str
        Forwarded to ``umap_.UMAP(metric=...)``.
    random_state : int or None
        Seed forwarded to ``umap_.UMAP(random_state=...)`` so that the same
        slider settings reproduce the same embedding.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``; nothing is returned.
    """
    reducer = umap_.UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        # Previously accepted by this function but never forwarded, so the
        # seed had no effect on the embedding.
        random_state=random_state,
    )
    embedding = reducer.fit_transform(X)
    emb_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
    emb_df['label'] = y  # y is row-aligned with X

    fig = px.scatter(
        emb_df,
        x="UMAP1",
        y="UMAP2",
        color="label",
        color_discrete_map={"Turkic": "red", "Other": "blue"},
        category_orders={"label": ["Other", "Turkic"]},  # <- Turkic plotted last ➜ on top
        title=f"UMAP | n_neighbors={n_neighbors}, min_dist={min_dist}, metric={metric}",
    )
    fig.update_traces(marker=dict(size=6, opacity=0.8))
    fig.show()



# Controls for the three UMAP settings exposed in the UI; each key matches the
# run_umap keyword argument it drives.
umap_controls = {
    "n_neighbors": IntSlider(min=5, max=100, step=1, value=15),
    "min_dist": FloatSlider(min=0.0, max=1.0, step=0.05, value=0.1),
    "metric": Dropdown(value='euclidean',
                       options=['euclidean', 'cosine', 'manhattan']),
}

# Trailing semicolon keeps the cell from echoing interact's return value.
interact(run_umap, **umap_controls);


interactive(children=(IntSlider(value=15, description='n_neighbors', min=5), FloatSlider(value=0.1, descriptio…

In [None]:

import numpy as np, pandas as pd, umap, plotly.express as px
from pathlib import Path
import imageio.v2 as iio          # imageio-ffmpeg backend

# The CSVs are looked up in the current working directory.
HERE = Path().resolve()

# Both files are read without a header row; the first column becomes the index.
# `.assign` attaches the group label without mutating the freshly read frame.
df_turkic = pd.read_csv(HERE / "turkicDefAncientScaledG25.csv",
                        index_col=0, header=None).assign(label="Turkic")
df_all    = pd.read_csv(HERE / "allAncientScaledG25.csv",
                        index_col=0, header=None).assign(label="Other")

df = pd.concat([df_turkic, df_all])

# De-duplicate on the coordinate columns only. Comparing `label` as well (the
# previous behaviour) could never match a sample present in both files, since
# its two rows carry different labels; such a sample was embedded twice.
# keep="first" retains the "Turkic" row because df_turkic is concatenated first.
feature_cols = [col for col in df.columns if col != "label"]
df = (df.drop_duplicates(subset=feature_cols, keep="first")
        .reset_index(drop=True))

# float32 feature matrix (numeric columns only) and the row-aligned labels.
X = df.select_dtypes("number").to_numpy(dtype=np.float32)
y = df["label"].values

# One DataFrame per UMAP fit; appended to by add_frames().
records = []

def add_frames(param, values, **umap_args):
    """Fit one 2-D UMAP per value of ``param`` and append each result to ``records``.

    Parameters
    ----------
    param : str
        Name of the ``umap_.UMAP`` keyword argument being swept.
    values : iterable
        Values to try for ``param``; one embedding is fitted per value.
    **umap_args
        Extra keyword arguments held constant for the whole sweep and passed
        to ``umap_.UMAP`` unchanged.

    Each appended frame has columns ``UMAP1``, ``UMAP2``, ``label`` (from the
    module-level ``y``) and ``frame``. The ``frame`` label is ``"{param}={v}"``
    followed, when ``umap_args`` is non-empty, by the fixed arguments in
    parentheses. Without that suffix, two sweeps of the same parameter under
    different fixed settings produced identical labels and were merged into
    the same animation frame.
    """
    fixed = ", ".join(f"{name}={val}" for name, val in umap_args.items())
    suffix = f" ({fixed})" if fixed else ""

    for v in values:
        reducer = umap_.UMAP(n_components=2, **{param: v}, **umap_args)
        coords = reducer.fit_transform(X)
        frame_df = pd.DataFrame(coords, columns=["UMAP1", "UMAP2"])
        frame_df["label"] = y  # y is row-aligned with X
        frame_df["frame"] = f"{param}={v}{suffix}"
        records.append(frame_df)

# Imported here because only the still-frame export below needs it.
import plotly.graph_objects as go

# --- Parameter sweeps --------------------------------------------------------
# Each call appends one embedding per swept value to the module-level `records`.
add_frames("n_neighbors", range(2, 101), min_dist=0.1, metric="euclidean")

add_frames("min_dist", np.linspace(0, 1, 41), n_neighbors=10,  metric="euclidean")

add_frames("min_dist", np.linspace(0, 1, 41), n_neighbors=100, metric="euclidean")

all_frames = pd.concat(records, ignore_index=True)

# Every embedding has one row per sample, in the same order as `y`, so the row
# position modulo the sample count identifies the sample. The concatenated
# index used before was unique per row, so no point was ever matched between
# frames and the animation had no object constancy.
point_ids = np.arange(len(all_frames)) % len(y)

MARKER_STYLE = dict(size=6, opacity=0.85)

# --- Interactive animation ---------------------------------------------------
fig = px.scatter(
    all_frames, x="UMAP1", y="UMAP2",
    animation_frame="frame", animation_group=point_ids,
    color="label",
    color_discrete_map={"Turkic": "red", "Other": "blue"},
    category_orders={"label": ["Other", "Turkic"]},   # Turkic drawn last, i.e. on top
    title="UMAP parameter sweep (euclidean metric)",
)
fig.update_traces(marker=MARKER_STYLE)
fig.update_layout(width=800, height=600)
fig.show()

# --- Export every frame as a PNG ---------------------------------------------
out_dir = HERE / "umap_frames"
out_dir.mkdir(exist_ok=True)
png_paths = []

base_title = fig.layout.title.text
# Plain-dict copy of the layout so each still can be edited without touching `fig`.
base_layout = fig.layout.to_plotly_json()

for i, frame in enumerate(fig.frames):
    # write_image renders a figure's `data`, not its `frames`. The previous
    # loop only replaced `fig.frames`, so every PNG showed the same picture.
    # Build a static figure from this frame's traces instead.
    still = go.Figure(data=frame.data, layout=base_layout)
    # update_traces on `fig` does not reach the frame traces; restyle them here.
    still.update_traces(marker=MARKER_STYLE)
    # Animation controls are meaningless in a still image; direct assignment
    # is used because update_layout merges into existing array properties.
    still.layout.sliders = []
    still.layout.updatemenus = []
    still.layout.title.text = f"{base_title} | {frame.name}"

    png = out_dir / f"frame_{i:04d}.png"
    still.write_image(png, engine="kaleido", scale=2)
    png_paths.append(png)

# --- Assemble GIF and MP4 ----------------------------------------------------
# Decode the PNGs once and reuse them for both outputs.
images = [iio.imread(p) for p in png_paths]

gif_path = HERE / "umap_sweep.gif"
iio.mimsave(gif_path, images, duration=0.08)

mp4_path = HERE / "umap_sweep.mp4"
iio.mimwrite(mp4_path, images,
             fps=12, codec="libx264", quality=None, pixelformat="yuv420p")

print("Saved:", gif_path, mp4_path)



Spectral initialisation failed! The eigenvector solver
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


Spectral initialisation failed! The eigenvector solver
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


Spectral initialisation failed! The eigenvector solver
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!

