In [9]:
import pandas as pd
from umap import umap_
import plotly.express as px
from pathlib import Path
import numpy as np
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown

In [10]:



# Directory holding the CSVs: next to this file when `__file__` is defined,
# otherwise the current working directory (e.g. in an interactive session).
HERE = Path(__file__).parent.resolve() if "__file__" in globals() else Path().resolve()

# Both files are read without a header row; the first column becomes the index
# and is used further down as the sample name.
df_turkic = pd.read_csv(HERE / "turkicDefAncientScaledG25.csv",
                        index_col=0, header=None)
df_all    = pd.read_csv(HERE / "allAncientScaledG25.csv",
                        index_col=0, header=None)

df_turkic['label'] = 'Turkic'
df_all['label']    = 'Other'

# Turkic rows are concatenated first so that keep='first' below retains the
# 'Turkic' label whenever the same sample name occurs in both files.
df = pd.concat([df_turkic, df_all])

# drop_duplicates() alone compares column values only, and the 'label' column
# differs between the two frames, so a sample listed in both files would
# survive twice with contradictory labels. De-duplicate on the index (sample
# name) first, then drop any remaining rows whose values are fully identical.
df = df[~df.index.duplicated(keep='first')].drop_duplicates()
df['name'] = df.index

# Feature matrix (every column except the two bookkeeping ones) and string labels.
X = df.drop(columns=['label', 'name']).values
y = df['label'].values



Different Models

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score

# Binary target: 1 for rows labelled 'Turkic', 0 for every other row.
y_binary = (df['label'].values == 'Turkic').astype(int)

# Stratified 80/20 split so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, stratify=y_binary, test_size=0.2, random_state=42
)

# The positive class is weighted 4x the negative class (0.8 vs 0.2).
clf = LogisticRegression(class_weight={0: 0.2, 1: 0.8}, max_iter=1000, random_state=32)
clf.fit(X_train, y_train)

# Held-out evaluation: `probs` was previously computed and then discarded,
# leaving the classifier without any reported quality measure.
probs = clf.predict_proba(X_test)[:, 1]
if len(np.unique(y_test)) > 1:
    print(f"Held-out ROC AUC: {roc_auc_score(y_test, probs):.4f}")
else:
    # ROC AUC is undefined when the test split contains a single class.
    print("Held-out ROC AUC: undefined (test split contains a single class)")
print(f"Held-out accuracy: {accuracy_score(y_test, clf.predict(X_test)):.4f}")

# Score every row, including the rows the classifier was trained on, and rank
# them by predicted probability of the positive class.
df['scorePU'] = clf.predict_proba(X)[:, 1]

df_rankedPU = df.sort_values(by='scorePU', ascending=False)


In [12]:
from sklearn.preprocessing import normalize

# Centroid of the rows flagged as Turkic, and its unit-length version.
turkic_vec = np.mean(X[y_binary == 1], axis=0)
turkic_vec_norm = turkic_vec / np.linalg.norm(turkic_vec)

# Row-normalise the feature matrix (sklearn `normalize`, default settings) so
# that the dot product with the unit centroid is a cosine similarity.
X_norm = normalize(X)
cos_sim = np.dot(X_norm, turkic_vec_norm)

# Store the similarity as a score column and keep a ranked copy, best first.
df['scoreCOS'] = cos_sim
df_rankedCOS = df.sort_values('scoreCOS', ascending=False)


In [13]:
from sklearn.cluster import KMeans

# Partition all rows into 10 clusters and record each row's cluster id.
kmeans = KMeans(n_clusters=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Per-cluster score = fraction of that cluster's rows labelled 'Turkic'.
cluster_scores = df['label'].eq('Turkic').groupby(df['cluster']).mean()

# Every row inherits the score of the cluster it belongs to.
df['scoreKM'] = df['cluster'].map(cluster_scores)

df_rankedKM = df.sort_values('scoreKM', ascending=False)






In [14]:
import torch
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Define embedding model
class EmbedNet(nn.Module):
    """Two-layer MLP mapping feature vectors to an embedding space.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> emb_dim).

    Args:
        input_dim: Number of input features per sample.
        emb_dim: Size of the output embedding (default 32).
        hidden_dim: Width of the hidden layer (default 64, the value that was
            previously hard-coded).
    """

    def __init__(self, input_dim, emb_dim=32, hidden_dim=64):
        super().__init__()
        # Kept as an unnamed Sequential under the attribute `net` so parameter
        # names (net.0.*, net.2.*) stay the same as before.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, emb_dim)
        )

    def forward(self, x):
        """Return the embedding of `x`; the last dimension of `x` must be `input_dim`."""
        return self.net(x)

# Build triplet loader (no hard mining, but more samples)
def get_triplets(X, y, n=2048):
    """Sample `n` random (anchor, positive, negative) triplets.

    Anchors and positives are drawn independently, with replacement, from the
    rows where ``y == 1``; negatives are drawn with replacement from the rows
    where ``y == 0``. No hard mining is performed, and an anchor may coincide
    with its positive. Sampling uses NumPy's global RNG (``np.random``).

    Args:
        X: Array-like of samples, indexed along the first axis.
        y: Array-like of labels aligned with `X` (1 = positive, 0 = negative).
        n: Number of triplets to draw. ``n=0`` returns three empty tensors.

    Returns:
        Tuple ``(anchors, positives, negatives)`` of float32 tensors, each with
        `n` entries along the first axis.

    Raises:
        ValueError: If there is no row with ``y == 1`` or no row with ``y == 0``.
    """
    features = np.asarray(X)
    labels = np.asarray(y)
    pos = features[labels == 1]
    neg = features[labels == 0]
    if len(pos) == 0 or len(neg) == 0:
        raise ValueError(
            "get_triplets needs at least one positive (y == 1) and one negative (y == 0) sample"
        )

    # Draw all indices at once instead of building one tensor per row in a
    # Python loop; this also makes n == 0 a valid request.
    anchor_idx = np.random.randint(len(pos), size=n)
    positive_idx = np.random.randint(len(pos), size=n)
    negative_idx = np.random.randint(len(neg), size=n)

    return (
        torch.tensor(pos[anchor_idx], dtype=torch.float32),
        torch.tensor(pos[positive_idx], dtype=torch.float32),
        torch.tensor(neg[negative_idx], dtype=torch.float32),
    )

# Training
# Seed both RNGs so the embedding (and therefore scoreNN) is repeatable across
# runs: triplet sampling draws from NumPy's global RNG and the layer weights
# are initialised from torch's.
np.random.seed(42)
torch.manual_seed(42)

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y_binary)
model = EmbedNet(X.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.TripletMarginLoss(margin=1.0)

# One optimisation step per epoch, each on a freshly sampled batch of 512 triplets.
model.train()
for epoch in range(300):
    a, p, n = get_triplets(X, y_binary, n=512)
    optimizer.zero_grad()
    loss = criterion(model(a), model(p), model(n))
    loss.backward()
    optimizer.step()
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Embed every row without tracking gradients.
model.eval()
with torch.no_grad():
    emb_all = model(X_tensor).numpy()

# Normalise the embeddings row-wise, then score each row by cosine similarity
# to the element-wise median of the Turkic embeddings.
emb_all = normalize(emb_all)
emb_turkic = emb_all[y_binary == 1]
proto = np.median(emb_turkic, axis=0).reshape(1, -1)
sims = cosine_similarity(emb_all, proto).flatten()

df['scoreNN'] = sims
df_rankedNN = df.sort_values(by='scoreNN', ascending=False)


Epoch 0, Loss: 0.9408
Epoch 50, Loss: 0.5547
Epoch 100, Loss: 0.2487
Epoch 150, Loss: 0.2273
Epoch 200, Loss: 0.1727
Epoch 250, Loss: 0.1769


Visualization

In [None]:
def run_umap(n_neighbors=15, min_dist=0.1, metric='euclidean', score_type='scorePU'):
    """Project `X` to 2-D with UMAP and show a scatter plot coloured by a score.

    Args:
        n_neighbors: Passed to UMAP as `n_neighbors`.
        min_dist: Passed to UMAP as `min_dist`.
        metric: Passed to UMAP as `metric`.
        score_type: Name of the `df` column used to colour the points.
    """
    coords = umap_.UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=42,
        n_jobs=1,
    ).fit_transform(X)

    # Attach the label, chosen score and sample name to the 2-D coordinates.
    plot_df = pd.DataFrame(coords, columns=['UMAP1', 'UMAP2']).assign(
        label=y,
        score=df[score_type].values,
        name=df['name'].values,
    )

    plot_title = f"UMAP | Colored by {score_type} | n_neighbors={n_neighbors}, min_dist={min_dist}, metric={metric}"
    fig = px.scatter(
        plot_df,
        x="UMAP1",
        y="UMAP2",
        color="score",
        hover_data=["name", "label"],
        color_continuous_scale="RdBu",
        title=plot_title,
    )
    fig.update_traces(marker={'size': 6, 'opacity': 0.8})
    fig.show()

# Widgets bound to the parameters of run_umap, in the order they are displayed.
umap_controls = dict(
    n_neighbors=IntSlider(value=15, min=5, max=100, step=1),
    min_dist=FloatSlider(value=0.1, min=0.0, max=1.0, step=0.05),
    metric=Dropdown(options=['euclidean', 'cosine', 'manhattan'], value='euclidean'),
    score_type=Dropdown(options=['scorePU', 'scoreKM', 'scoreNN', 'scoreCOS'], value='scorePU'),
)
# Trailing semicolon suppresses the cell's return-value echo.
interact(run_umap, **umap_controls);


interactive(children=(IntSlider(value=15, description='n_neighbors', min=5), FloatSlider(value=0.1, descriptio…

In [18]:
from sklearn.manifold import TSNE

def run_tsne(perplexity=30, learning_rate=200, score_type='scorePU'):
    """Project `X` to 2-D with t-SNE and show a scatter plot coloured by a score.

    Args:
        perplexity: Passed to TSNE as `perplexity`.
        learning_rate: Passed to TSNE as `learning_rate`.
        score_type: Name of the `df` column used to colour the points.
    """
    coords = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        random_state=42,
    ).fit_transform(X)

    # Attach the label, chosen score and sample name to the 2-D coordinates.
    plot_df = pd.DataFrame(coords, columns=['TSNE1', 'TSNE2']).assign(
        label=y,
        score=df[score_type].values,
        name=df['name'].values,
    )

    plot_title = f"t-SNE | Colored by {score_type} | perplexity={perplexity}, learning_rate={learning_rate}"
    fig = px.scatter(
        plot_df,
        x="TSNE1",
        y="TSNE2",
        color="score",
        hover_data=["name", "label"],
        color_continuous_scale="RdBu",
        title=plot_title,
    )
    fig.update_traces(marker={'size': 6, 'opacity': 0.8})
    fig.show()

# Widgets bound to the parameters of run_tsne, in the order they are displayed.
tsne_controls = dict(
    perplexity=IntSlider(value=30, min=5, max=50, step=1),
    learning_rate=IntSlider(value=200, min=10, max=1000, step=10),
    score_type=Dropdown(options=['scorePU', 'scoreKM', 'scoreNN', 'scoreCOS'], value='scorePU'),
)
# Trailing semicolon suppresses the cell's return-value echo.
interact(run_tsne, **tsne_controls);


interactive(children=(IntSlider(value=30, description='perplexity', max=50, min=5), IntSlider(value=200, descr…