In [27]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier  
import keras_tuner as kt




df_turkic = pd.read_csv('~/turkicPCA/turkicDefAncientScaledG25.csv', header=None)
df_all = pd.read_csv('~/turkicPCA/allAncientScaledG25.csv', header=None)


df_turkic['label'] = 1
df_all['label'] = 0


df_combined = pd.concat([df_turkic, df_all]).drop_duplicates().reset_index(drop=True)



In [31]:


X = df_combined.iloc[:, :-1]
y = df_combined['label']

X_numeric = X.iloc[:, 1:]


X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test))

In [18]:
def build_model(hp):
    model = tf.keras.Sequential()
    
    model.add(tf.keras.layers.Dense(units=hp.Int('units', min_value=32, max_value=512, step=32), activation='relu'))
    
    model.add(tf.keras.layers.Dense(100, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [19]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of hyperparameter combinations to try
    executions_per_trial=3,  # Number of models to train per combination
    directory='my_dir',
    project_name='hyperparam_tuning'
)

tuner.search(X_train_scaled, y_train, epochs=5, validation_data=(X_test_scaled, y_test))

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best units: {best_hps.get('units')}, Best learning rate: {best_hps.get('learning_rate')}")

model = tuner.hypermodel.build(best_hps)

history = model.fit(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test))



Reloading Tuner from my_dir/hyperparam_tuning/tuner0.json
Best units: 128, Best learning rate: 0.0001
Epoch 1/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 615us/step - accuracy: 0.8232 - loss: 0.4864 - val_accuracy: 0.9809 - val_loss: 0.1446
Epoch 2/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362us/step - accuracy: 0.9861 - loss: 0.1147 - val_accuracy: 0.9809 - val_loss: 0.0884
Epoch 3/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357us/step - accuracy: 0.9850 - loss: 0.0764 - val_accuracy: 0.9809 - val_loss: 0.0755
Epoch 4/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step - accuracy: 0.9875 - loss: 0.0559 - val_accuracy: 0.9809 - val_loss: 0.0694
Epoch 5/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378us/step - accuracy: 0.9897 - loss: 0.0446 - val_accuracy: 0.9809 - val_loss: 0.0650
Epoch 6/50
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

In [22]:
# Evaluate on test data
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_acc}')


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239us/step - accuracy: 0.9781 - loss: 0.0734
Test Accuracy: 0.9809393882751465


In [23]:
import datetime


log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


model.fit(train_dataset.batch(32),
          epochs=10,
          validation_data=test_dataset.batch(32),
          callbacks=[tensorboard_callback])


Epoch 1/10


[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step - accuracy: 0.9863 - loss: 0.0341 - val_accuracy: 0.9809 - val_loss: 0.0629
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 462us/step - accuracy: 0.9863 - loss: 0.0339 - val_accuracy: 0.9809 - val_loss: 0.0633
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424us/step - accuracy: 0.9863 - loss: 0.0338 - val_accuracy: 0.9809 - val_loss: 0.0636
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 427us/step - accuracy: 0.9863 - loss: 0.0337 - val_accuracy: 0.9809 - val_loss: 0.0639
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453us/step - accuracy: 0.9863 - loss: 0.0335 - val_accuracy: 0.9809 - val_loss: 0.0642
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461us/step - accuracy: 0.9863 - loss: 0.0334 - val_accuracy: 0.9809 - val_loss: 0.0645
Epoch 7/10
[1m184/184[0m 

<keras.src.callbacks.history.History at 0x348b91b20>

In [24]:

def predict_turkic_and_save_csv(csv_file, model, scaler, output_file, threshold=0.3):
    df = pd.read_csv(csv_file, header=None)
    X_new = df.iloc[:, 1:]  # Exclude non-numeric data


    X_new_scaled = scaler.transform(X_new)


    probabilities = model.predict(X_new_scaled)


    predictions_labels = ["Turkic" if prob > 0.09624789 else "Not Turkic" for prob in probabilities]


    df['Turkic_Probability'] = probabilities
    df['Prediction'] = predictions_labels


    df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")


input_csv = '~/turkicPCA/allModernScaledG25.csv'
output_csv = '~/turkicPCA/turkicPredictAllModernScaledG25.csv'
predict_turkic_and_save_csv(input_csv, model, scaler, output_csv)
input_csv = '~/turkicPCA/allAncientScaledG25.csv'
output_csv = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'
predict_turkic_and_save_csv(input_csv, model, scaler, output_csv)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step
Predictions saved to ~/turkicPCA/turkicPredictAllModernScaledG25.csv
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177us/step
Predictions saved to ~/turkicPCA/turkicPredictAllAncientScaledG25.csv


In [26]:
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

ancient_file_path = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'
modern_file_path = '~/turkicPCA/turkicModernScaledG25.csv'

df_ancient = pd.read_csv(ancient_file_path)
df_modern = pd.read_csv(modern_file_path)

df_modern['Prediction'] = 'Modern Turkic'

df = pd.concat([df_ancient, df_modern])

features = df.columns[1:25]

tsne = TSNE(n_components=2, random_state=3125)
tsne_results = tsne.fit_transform(df[features])

df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]

def determine_color(row):
    if row['Prediction'] == 'Modern Turkic':
        return 'yellow'
    elif row['Turkic_Probability'] < 0.04:
        return 'black'
    else:
        max_prob = df.loc[df['Prediction'] == 'Not Turkic', 'Turkic_Probability'].max()
        min_prob = df.loc[df['Prediction'] == 'Not Turkic', 'Turkic_Probability'].min()
        norm_prob = (row['Turkic_Probability'] - min_prob) / (max_prob - min_prob)
        red_intensity = int(255 * norm_prob)
        return f'rgb({red_intensity},0,0)'

df['color'] = df.apply(determine_color, axis=1)

hover_name = df.columns[0]

fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two',
                 color='color', color_discrete_map="identity",
                 hover_name=hover_name)

fig.show()


In [13]:
import os

output_file_path = os.path.expanduser('~/turkicPCA/turkic_tsne_plot.html')

fig.write_html(output_file_path)

print(f"Plot saved to {output_file_path}")


Plot saved to /Users/asami/turkicPCA/turkic_tsne_plot.html
