In [43]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd


df_turkic = pd.read_csv('~/turkicPCA/turkicDefAncientScaledG25.csv', header=None)
df_all = pd.read_csv('~/turkicPCA/allAncientScaledG25.csv', header=None)


df_turkic['label'] = 1
df_all['label'] = 0


df_combined = pd.concat([df_turkic, df_all]).drop_duplicates().reset_index(drop=True)


X = df_combined.iloc[:, :-1]
y = df_combined['label']

X_numeric = X.iloc[:, 1:]


X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test))

In [44]:

def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


model = create_model()


In [45]:
import datetime


log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


model.fit(train_dataset.batch(32),
          epochs=10,
          validation_data=test_dataset.batch(32),
          callbacks=[tensorboard_callback])


Epoch 1/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 644us/step - accuracy: 0.9779 - loss: 0.1425 - val_accuracy: 0.9809 - val_loss: 0.0635
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417us/step - accuracy: 0.9863 - loss: 0.0486 - val_accuracy: 0.9809 - val_loss: 0.0623
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step - accuracy: 0.9863 - loss: 0.0451 - val_accuracy: 0.9809 - val_loss: 0.0639
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409us/step - accuracy: 0.9863 - loss: 0.0431 - val_accuracy: 0.9809 - val_loss: 0.0657
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404us/step - accuracy: 0.9863 - loss: 0.0415 - val_accuracy: 0.9809 - val_loss: 0.0673
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step - accuracy: 0.9863 - loss: 0.0404 - val_accuracy: 0.9809 - val_loss: 0.0689
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x3119d0e30>

In [46]:

def predict_turkic_and_save_csv(csv_file, model, scaler, output_file, threshold=0.3):
    df = pd.read_csv(csv_file, header=None)
    X_new = df.iloc[:, 1:]  # Exclude non-numeric data


    X_new_scaled = scaler.transform(X_new)


    probabilities = model.predict(X_new_scaled)


    predictions_labels = ["Turkic" if prob > 0.09624789 else "Not Turkic" for prob in probabilities]


    df['Turkic_Probability'] = probabilities
    df['Prediction'] = predictions_labels


    df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")


input_csv = '~/turkicPCA/allModernScaledG25.csv'
output_csv = '~/turkicPCA/turkicPredictAllModernScaledG25.csv'
predict_turkic_and_save_csv(input_csv, model, scaler, output_csv)
input_csv = '~/turkicPCA/allAncientScaledG25.csv'
output_csv = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'
predict_turkic_and_save_csv(input_csv, model, scaler, output_csv)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 584us/step
Predictions saved to ~/turkicPCA/turkicPredictAllModernScaledG25.csv
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step
Predictions saved to ~/turkicPCA/turkicPredictAllAncientScaledG25.csv


In [1]:
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px

csv_file_path = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'

df = pd.read_csv(csv_file_path)

features = df.columns[1:25]  

tsne = TSNE(n_components=2, random_state=35)
tsne_results = tsne.fit_transform(df[features])

df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]

hover_name = df.columns[0]

color_map = {'Turkic': 'red', 'Not Turkic': 'blue'}  

fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color='Prediction',
                 color_discrete_map=color_map, hover_name=hover_name)

fig.show()


In [5]:
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px

# File paths
ancient_file_path = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'
modern_file_path = '~/turkicPCA/turkicModernScaledG25.csv'

# Read the datasets
df_ancient = pd.read_csv(ancient_file_path)
df_modern = pd.read_csv(modern_file_path)

# Add a new column to mark the 'Prediction' for modern data
df_modern['Prediction'] = 'Modern Turkic'

# Concatenate ancient and modern dataframes
df = pd.concat([df_ancient, df_modern])

# Select the features for t-SNE
features = df.columns[1:25]

# Run t-SNE
tsne = TSNE(n_components=2, random_state=35)
tsne_results = tsne.fit_transform(df[features])

# Add the t-SNE results back to the dataframe
df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]

# Use the first column of the dataframe as the hover name assuming it contains the sample names
hover_name = df.columns[0]

# Define the color map
color_map = {'Turkic': 'red', 'Not Turkic': 'blue', 'Modern Turkic': 'yellow'}

# Create the plot
fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color='Prediction',
                 color_discrete_map=color_map, hover_name=hover_name)

# Show the plot
fig.show()


In [8]:
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

# File paths
ancient_file_path = '~/turkicPCA/turkicPredictAllAncientScaledG25.csv'
modern_file_path = '~/turkicPCA/turkicModernScaledG25.csv'

# Read the datasets
df_ancient = pd.read_csv(ancient_file_path)
df_modern = pd.read_csv(modern_file_path)

# Add a new column to mark the 'Prediction' for modern data
df_modern['Prediction'] = 'Modern Turkic'

# Concatenate ancient and modern dataframes
df = pd.concat([df_ancient, df_modern])

# Select the features for t-SNE
features = df.columns[1:25]

# Run t-SNE
tsne = TSNE(n_components=2, random_state=35)
tsne_results = tsne.fit_transform(df[features])

# Add the t-SNE results back to the dataframe
df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]

# Define the color gradient function
def determine_color(row):
    if row['Prediction'] == 'Modern Turkic':
        return 'yellow'
    elif row['Turkic_Probability'] < 0.04:
        return 'black'
    else:
        max_prob = df.loc[df['Prediction'] == 'Not Turkic', 'Turkic_Probability'].max()
        min_prob = df.loc[df['Prediction'] == 'Not Turkic', 'Turkic_Probability'].min()
        norm_prob = (row['Turkic_Probability'] - min_prob) / (max_prob - min_prob)
        red_intensity = int(255 * norm_prob)
        return f'rgb({red_intensity},0,0)'

# Apply the color determination function
df['color'] = df.apply(determine_color, axis=1)

# Use the first column of the dataframe as the hover name assuming it contains the sample names
hover_name = df.columns[0]

# Create the plot
fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two',
                 color='color', color_discrete_map="identity",
                 hover_name=hover_name)

# Show the plot
fig.show()
