<a href="https://colab.research.google.com/github/Yusuprozimemet/GPS-Uyghur/blob/main/Saka.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import io
import csv
from math import sqrt

def calculate_saka_similarity(sample, khotanese, weights):
    # Components we're considering for similarity
    all_components = ['Early_Bronze_Age_Europe_Indoeuropean', 'Zagros_Neolithic_Farmers', 'Siberian_Hunter_Gatherer', 'Caucasus_Hunter_Gatherer']
    shared_components = [comp for comp in all_components if comp in sample and comp in khotanese]

    # Calculate Euclidean distance for shared components, weighted by importance
    distance = sum(weights[comp] * (sample[comp] - khotanese[comp])**2 for comp in shared_components)
    euclidean_distance = sqrt(distance)

    # Normalize based on maximum possible distance for these components (if they were completely different)
    max_distance = sqrt(sum(weights[comp]**2 * 100**2 for comp in shared_components))
    similarity_score = 1 - (euclidean_distance / max_distance)

    # Adjust for non-shared components by reducing the score based on their presence in the sample
    non_shared = set(sample.keys()) - set(shared_components)
    adjustment_factor = 1 - (sum(sample[comp] for comp in non_shared) / 100)  # Assuming total is 100%
    adjusted_similarity = similarity_score * adjustment_factor

    return adjusted_similarity

def main(data):
    csv_reader = csv.DictReader(io.StringIO(data))

    # Extract Khotanese sample as reference
    khotanese = None
    for row in csv_reader:
        if row['Target'] == 'Khotanese':
            khotanese = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}
            break

    if khotanese is None:
        raise ValueError("Khotanese sample not found in data.")

    # Weights for components (example values; adjust based on domain knowledge)
    weights = {
        'Early_Bronze_Age_Europe_Indoeuropean': 1.0,
        'Zagros_Neolithic_Farmers': 1.0,
        'Siberian_Hunter_Gatherer': 1.0,
        'Caucasus_Hunter_Gatherer': 1.0
    }

    # Reset csv_reader to start from the beginning
    csv_reader = csv.DictReader(io.StringIO(data))

    # Calculate Saka components for each sample
    for row in csv_reader:
        if row['Target'] != 'Khotanese':  # Skip Khotanese itself
            sample = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}
            saka_score = calculate_saka_similarity(sample, khotanese, weights)
            print(f"{row['Target']}: Saka Similarity Score = {saka_score:.2%}")

if __name__ == "__main__":
    data="""Target,Distance,Early_Bronze_Age_Europe_Indoeuropean,Zagros_Neolithic_Farmers,Siberian_Hunter_Gatherer,Caucasus_Hunter_Gatherer,Russia_MA1_HG.SG_ANE,Amerindian,Anatolia_Neolithic_Farmers,Western_European_Hunter_Gatherer,North_Africa_Epipaleolithic,Levant_Epipaleolithic,Yoruba
yusupancentry,0.09729617,29.0,11.0,24.8,3.8,0.0,1.6,16.8,4.2,0.0,8.2,0.6
Khotanese,0.03092311,48.4,25.2,17.8,5.2,2.2,1.2,0.0,0.0,0.0,0.0,0.0
Uygur:Representative_Sample,0.07603779,5.0,18.6,54.0,6.2,0.0,0.0,12.8,3.2,0.2,0.0,0.0
Average,0.06808569,27.5,18.3,32.2,5.1,0.7,0.9,9.9,2.5,0.1,2.7,0.2
"""
    main(data)

yusupancentry: Saka Similarity Score = 60.00%
Uygur:Representative_Sample: Saka Similarity Score = 59.96%
Average: Saka Similarity Score = 72.08%


In [43]:
import io
import csv
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values())).reshape(1, -1)
    v2 = np.array(list(vector2.values())).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

def euclidean_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.euclidean(v1, v2) / np.sqrt(len(v1) * 100**2)  # Normalize to [0, 1]

def manhattan_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.cityblock(v1, v2) / (len(v1) * 100)  # Normalize to [0, 1]

def chebyshev_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.chebyshev(v1, v2) / 100  # Normalize to [0, 1]

def jaccard_similarity_func(vector1, vector2):
    v1 = set(vector1.keys())
    v2 = set(vector2.keys())
    return len(v1 & v2) / len(v1 | v2)  # Intersection over Union

def bray_curtis_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.braycurtis(v1, v2)  # Convert dissimilarity to similarity

def canberra_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.canberra(v1, v2) / len(v1)  # Normalize to [0, 1]

def correlation_coefficient_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return np.corrcoef(v1, v2)[0, 1]  # Pearson's r

def minkowski_distance_func(vector1, vector2, p=3):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.minkowski(v1, v2, p) / (len(v1) * 100**(1/p))  # Normalize to [0, 1]

def hamming_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - (np.count_nonzero(v1 - v2) / len(v1))  # Proportion of differing elements

def main(data):
    csv_reader = csv.DictReader(io.StringIO(data))

    # Extract Ancient Israelite and average Ashkenazi samples as references
    ancient_israelite = None
    ashkenazi_samples = []
    for row in csv_reader:
        if row['Target'] == 'Ancient_Israelite':
            ancient_israelite = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}
        elif 'Ashkenazi' in row['Target']:
            ashkenazi_samples.append({k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']})

    if ancient_israelite is None or not ashkenazi_samples:
        raise ValueError("Ancient Israelite or Ashkenazi samples not found in data.")

    # Calculate average Ashkenazi profile
    avg_ashkenazi = {}
    for key in ashkenazi_samples[0].keys():
        avg_ashkenazi[key] = sum(sample[key] for sample in ashkenazi_samples) / len(ashkenazi_samples)

    # Methods for similarity calculation
    methods = {
        'Cosine Similarity': cosine_similarity_func,
        'Euclidean Distance': euclidean_distance_func,
        'Manhattan Distance': manhattan_distance_func,
        'Chebyshev Distance': chebyshev_distance_func,
        'Jaccard Similarity': jaccard_similarity_func,
        'Bray-Curtis Similarity': bray_curtis_similarity_func,
        'Canberra Distance': canberra_distance_func,
        'Correlation Coefficient': correlation_coefficient_func,
        'Minkowski Distance': minkowski_distance_func,
        'Hamming Distance': hamming_distance_func,
    }

    # Reset csv_reader to start from the beginning
    csv_reader = csv.DictReader(io.StringIO(data))

    # Calculate similarities for each sample
    for row in csv_reader:
        if row['Target'] != 'Ancient_Israelite' and 'Ashkenazi' not in row['Target']:  # Skip reference samples
            sample = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}

            print(f"{row['Target']}:")
            for method_name, method in methods.items():
                israelite_sim = method(sample, ancient_israelite)
                ashkenazi_sim = method(sample, avg_ashkenazi)
                print(f"  {method_name}:")
                print(f"    Ancient Israelite = {israelite_sim:.2%}")
                print(f"    Ashkenazi = {ashkenazi_sim:.2%}")

if __name__ == "__main__":
    data = """Target,Distance,Anatolia_Neolithic_Farmers,Caucasus_Hunter_Gatherer,Eastern_European_Hunter_Gatherer,Natufian_Hunter_Gatherer,North_African_Neolithic_Farmer,Siberian_Hunter_Gatherer_ANE,Zagros_Neolithic_Farmers
Ancient_Israelite,0.02534301,49.8,11.6,0.0,22.2,0.0,0.0,16.4
yusupancentry,0.10122330,26.0,9.6,18.2,6.0,1.0,26.4,12.8
Ashkenazi_Austria:Austrian_Jew_ashkenazy2w,0.03792004,50.4,11.4,11.2,14.2,0.0,1.2,11.6
Ashkenazi_Austria:Austrian_Jew_ashkenazy3w,0.03413616,44.2,11.6,12.8,17.6,0.0,0.6,13.2
Ashkenazi_Belarussia:Ashk_BY_BY_1,0.03536681,50.6,10.2,14.6,9.4,0.0,1.6,13.6
Ashkenazi_Belarussia:Ashk_BY_BY_10,0.03466870,49.4,12.8,15.0,11.4,0.0,2.0,9.4
Ashkenazi_Belarussia:Ashk_BY_BY_8,0.03860332,47.8,16.4,12.2,11.8,1.4,1.8,8.6
Ashkenazi_Belarussia:Ashk_BY_BY_9,0.03848156,43.4,9.8,14.6,18.2,0.0,1.6,12.4
Ashkenazi_France:ashkenazy4w,0.02649095,48.6,11.4,10.8,16.0,0.0,0.6,12.6
Ashkenazi_France:FranceJewF38,0.04022400,50.4,9.0,11.2,14.8,0.0,0.0,14.6
Ashkenazi_France:GRC12118117,0.02783109,51.4,10.2,10.6,15.6,0.6,0.0,11.6
Ashkenazi_Germany:Ashk_DE_DE_1,0.02414584,49.4,10.4,9.0,17.4,0.0,0.0,13.8
Ashkenazi_Germany:Ashk_DE_DE_10,0.03561125,50.6,10.2,11.6,13.6,1.0,0.8,12.2
Ashkenazi_Germany:Ashk_DE_DE_9,0.03922654,48.2,15.6,9.6,16.2,0.0,0.2,10.2
Ashkenazi_Latvia:Latvian_Jew_ashkenazy3e,0.03567490,48.0,12.8,11.4,13.6,0.2,0.2,13.8
Ashkenazi_Latvia:Latvian_Jew_ashkenazy5e,0.04101438,46.4,11.8,10.8,15.6,0.0,2.2,13.2
Ashkenazi_Lithuania:Ashk_LT_LT_1,0.03465352,45.8,10.0,15.0,12.6,0.8,2.0,13.8
Ashkenazi_Lithuania:Ashk_LT_LT_8,0.03487516,43.8,12.0,12.4,17.0,0.0,0.8,14.0
Ashkenazi_Lithuania:Ashk_LT_LT_9,0.03562080,48.6,12.2,12.0,12.6,0.0,1.6,13.0
Ashkenazi_Poland:Ashk_PL_PL_1,0.03490842,46.4,14.4,13.4,13.4,0.0,2.4,10.0
Ashkenazi_Poland:Ashk_PL_PL_10,0.03568202,49.0,7.2,13.0,13.0,0.0,0.6,17.2
Ashkenazi_Russia:Ashk_RU_RU_1,0.03826277,47.0,12.4,11.6,13.8,0.0,1.8,13.4
Ashkenazi_Russia:Ashk_RU_RU_10,0.03458646,44.4,14.4,14.8,13.0,0.0,2.0,11.4
Ashkenazi_Russia:Ashk_RU_RU_11,0.03047460,52.2,11.6,12.8,9.8,1.8,2.0,9.8
Ashkenazi_Russia:Ashk_RU_RU_2,0.03698448,47.6,12.6,15.0,11.4,0.0,1.0,12.4
Ashkenazi_Ukraine:Ashk_UA_UA_8,0.04348505,48.0,14.0,14.8,10.2,1.4,2.0,9.6
Ashkenazi_Ukraine:Ashk_UA_UA_9,0.03320646,50.2,10.4,13.8,11.8,0.0,0.6,13.2
Uygur:HGDP01300,0.06349086,14.0,6.2,7.4,0.0,0.0,51.0,21.4
Han_Fujian:Han1467,0.28519539,0.0,0.0,0.0,0.2,3.2,96.6,0.0
Han_Fujian:Han1619,0.27857088,0.0,0.0,0.0,1.0,3.0,96.0,0.0
Han_Fujian:Han1900,0.26978274,0.0,0.0,0.0,0.0,2.4,97.6,0.0
Han_Fujian:Han1934,0.27946403,0.0,0.0,0.0,0.8,1.6,97.6,0.0
Han_Fujian:Han1994,0.27752822,0.0,0.0,0.0,0.4,2.6,97.0,0.0


"""
    main(data)

yusupancentry:
  Cosine Similarity:
    Ancient Israelite = 67.36%
    Ashkenazi = 76.77%
  Euclidean Distance:
    Ancient Israelite = 83.63%
    Ashkenazi = 86.80%
  Manhattan Distance:
    Ancient Israelite = 86.97%
    Ashkenazi = 90.85%
  Chebyshev Distance:
    Ancient Israelite = 73.60%
    Ashkenazi = 74.78%
  Jaccard Similarity:
    Ancient Israelite = 100.00%
    Ashkenazi = 100.00%
  Bray-Curtis Similarity:
    Ancient Israelite = 54.40%
    Ashkenazi = 67.98%
  Canberra Distance:
    Ancient Israelite = 41.34%
    Ashkenazi = 64.83%
  Correlation Coefficient:
    Ancient Israelite = 30.47%
    Ashkenazi = 46.78%
  Minkowski Distance:
    Ancient Israelite = -7.17%
    Ashkenazi = 7.16%
  Hamming Distance:
    Ancient Israelite = 0.00%
    Ashkenazi = 0.00%
Uygur:HGDP01300:
  Cosine Similarity:
    Ancient Israelite = 33.31%
    Ashkenazi = 37.02%
  Euclidean Distance:
    Ancient Israelite = 74.69%
    Ashkenazi = 76.18%
  Manhattan Distance:
    Ancient Israelite = 81.89%


#save as csv

In [46]:
import io
import csv
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values())).reshape(1, -1)
    v2 = np.array(list(vector2.values())).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

def euclidean_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.euclidean(v1, v2) / np.sqrt(len(v1) * 100**2)  # Normalize to [0, 1]

def manhattan_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.cityblock(v1, v2) / (len(v1) * 100)  # Normalize to [0, 1]

def chebyshev_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.chebyshev(v1, v2) / 100  # Normalize to [0, 1]

def jaccard_similarity_func(vector1, vector2):
    v1 = set(vector1.keys())
    v2 = set(vector2.keys())
    return len(v1 & v2) / len(v1 | v2)  # Intersection over Union

def bray_curtis_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.braycurtis(v1, v2)  # Convert dissimilarity to similarity

def canberra_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.canberra(v1, v2) / len(v1)  # Normalize to [0, 1]

def correlation_coefficient_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return np.corrcoef(v1, v2)[0, 1]  # Pearson's r

def minkowski_distance_func(vector1, vector2, p=3):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.minkowski(v1, v2, p) / (len(v1) * 100**(1/p))  # Normalize to [0, 1]

def main(data):
    csv_reader = csv.DictReader(io.StringIO(data))

    # Extract Ancient Israelite and average Ashkenazi samples as references
    ancient_israelite = None
    ashkenazi_samples = []
    for row in csv_reader:
        if row['Target'] == 'Ancient_Israelite':
            ancient_israelite = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}
        elif 'Ashkenazi' in row['Target']:
            ashkenazi_samples.append({k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']})

    if ancient_israelite is None or not ashkenazi_samples:
        raise ValueError("Ancient Israelite or Ashkenazi samples not found in data.")

    # Calculate average Ashkenazi profile
    avg_ashkenazi = {}
    for key in ashkenazi_samples[0].keys():
        avg_ashkenazi[key] = sum(sample[key] for sample in ashkenazi_samples) / len(ashkenazi_samples)

    # Methods for similarity calculation
    methods = {
        'Cosine Similarity': cosine_similarity_func,
        'Euclidean Distance': euclidean_distance_func,
        'Manhattan Distance': manhattan_distance_func,
        'Chebyshev Distance': chebyshev_distance_func,
        'Jaccard Similarity': jaccard_similarity_func,
        'Bray-Curtis Similarity': bray_curtis_similarity_func,
        'Canberra Distance': canberra_distance_func,
        'Correlation Coefficient': correlation_coefficient_func,
        'Minkowski Distance': minkowski_distance_func,
    }

    # Reset csv_reader to start from the beginning
    csv_reader = csv.DictReader(io.StringIO(data))

    # Open a new CSV file to write results
    with open('similarity_results.csv', 'w', newline='') as csvfile:
        fieldnames = ['Sample', 'Method', 'Ancient Israelite', 'Ashkenazi']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Calculate similarities for each sample
        for row in csv_reader:
            if row['Target'] != 'Ancient_Israelite' and 'Ashkenazi' not in row['Target']:  # Skip reference samples
                sample = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance'] and v not in [None, '', 'NA']}

                # Skip samples with no valid features
                if not sample:
                    print(f"Skipping sample {row['Target']} due to missing or invalid feature data.")
                    continue

                for method_name, method in methods.items():
                    try:
                        israelite_sim = method(sample, ancient_israelite)
                        ashkenazi_sim = method(sample, avg_ashkenazi)
                        writer.writerow({
                            'Sample': row['Target'],
                            'Method': method_name,
                            'Ancient Israelite': f"{israelite_sim:.2%}",
                            'Ashkenazi': f"{ashkenazi_sim:.2%}"
                        })
                    except Exception as e:
                        print(f"Error processing {row['Target']} with {method_name}: {e}")



if __name__ == "__main__":
    data = """Target,Distance,Anatolia_Neolithic_Farmers,Caucasus_Hunter_Gatherer,Eastern_European_Hunter_Gatherer,Natufian_Hunter_Gatherer,North_African_Neolithic_Farmer,Siberian_Hunter_Gatherer_ANE,Zagros_Neolithic_Farmers
Ancient_Israelite,0.02534301,49.8,11.6,0.0,22.2,0.0,0.0,16.4
yusupancentry,0.10122330,26.0,9.6,18.2,6.0,1.0,26.4,12.8
Ashkenazi_Austria:Austrian_Jew_ashkenazy2w,0.03792004,50.4,11.4,11.2,14.2,0.0,1.2,11.6
Ashkenazi_Austria:Austrian_Jew_ashkenazy3w,0.03413616,44.2,11.6,12.8,17.6,0.0,0.6,13.2
Ashkenazi_Belarussia:Ashk_BY_BY_1,0.03536681,50.6,10.2,14.6,9.4,0.0,1.6,13.6
Ashkenazi_Belarussia:Ashk_BY_BY_10,0.03466870,49.4,12.8,15.0,11.4,0.0,2.0,9.4
Ashkenazi_Belarussia:Ashk_BY_BY_8,0.03860332,47.8,16.4,12.2,11.8,1.4,1.8,8.6
Ashkenazi_Belarussia:Ashk_BY_BY_9,0.03848156,43.4,9.8,14.6,18.2,0.0,1.6,12.4
Ashkenazi_France:ashkenazy4w,0.02649095,48.6,11.4,10.8,16.0,0.0,0.6,12.6
Ashkenazi_France:FranceJewF38,0.04022400,50.4,9.0,11.2,14.8,0.0,0.0,14.6
Ashkenazi_France:GRC12118117,0.02783109,51.4,10.2,10.6,15.6,0.6,0.0,11.6
Ashkenazi_Germany:Ashk_DE_DE_1,0.02414584,49.4,10.4,9.0,17.4,0.0,0.0,13.8
Ashkenazi_Germany:Ashk_DE_DE_10,0.03561125,50.6,10.2,11.6,13.6,1.0,0.8,12.2
Ashkenazi_Germany:Ashk_DE_DE_9,0.03922654,48.2,15.6,9.6,16.2,0.0,0.2,10.2
Ashkenazi_Latvia:Latvian_Jew_ashkenazy3e,0.03567490,48.0,12.8,11.4,13.6,0.2,0.2,13.8
Ashkenazi_Latvia:Latvian_Jew_ashkenazy5e,0.04101438,46.4,11.8,10.8,15.6,0.0,2.2,13.2
Ashkenazi_Lithuania:Ashk_LT_LT_1,0.03465352,45.8,10.0,15.0,12.6,0.8,2.0,13.8
Ashkenazi_Lithuania:Ashk_LT_LT_8,0.03487516,43.8,12.0,12.4,17.0,0.0,0.8,14.0
Ashkenazi_Lithuania:Ashk_LT_LT_9,0.03562080,48.6,12.2,12.0,12.6,0.0,1.6,13.0
Ashkenazi_Poland:Ashk_PL_PL_1,0.03490842,46.4,14.4,13.4,13.4,0.0,2.4,10.0
Ashkenazi_Poland:Ashk_PL_PL_10,0.03568202,49.0,7.2,13.0,13.0,0.0,0.6,17.2
Ashkenazi_Russia:Ashk_RU_RU_1,0.03826277,47.0,12.4,11.6,13.8,0.0,1.8,13.4
Ashkenazi_Russia:Ashk_RU_RU_10,0.03458646,44.4,14.4,14.8,13.0,0.0,2.0,11.4
Ashkenazi_Russia:Ashk_RU_RU_11,0.03047460,52.2,11.6,12.8,9.8,1.8,2.0,9.8
Ashkenazi_Russia:Ashk_RU_RU_2,0.03698448,47.6,12.6,15.0,11.4,0.0,1.0,12.4
Ashkenazi_Ukraine:Ashk_UA_UA_8,0.04348505,48.0,14.0,14.8,10.2,1.4,2.0,9.6
Ashkenazi_Ukraine:Ashk_UA_UA_9,0.03320646,50.2,10.4,13.8,11.8,0.0,0.6,13.2
Uygur:HGDP01300,0.06349086,14.0,6.2,7.4,0.0,0.0,51.0,21.4
Han_Fujian:Han1467,0.28519539,0.0,0.0,0.0,0.2,3.2,96.6,0.0
Han_Fujian:Han1619,0.27857088,0.0,0.0,0.0,1.0,3.0,96.0,0.0
Han_Fujian:Han1900,0.26978274,0.0,0.0,0.0,0.0,2.4,97.6,0.0
Han_Fujian:Han1934,0.27946403,0.0,0.0,0.0,0.8,1.6,97.6,0.0
Han_Fujian:Han1994,0.27752822,0.0,0.0,0.0,0.4,2.6,97.0,0.0
    """
    main(data)

Skipping sample      due to missing or invalid feature data.


In [52]:
import pandas as pd
import numpy as np
import plotly.express as px
from io import StringIO

data = """Sample,Method,Ancient Israelite,Ashkenazi
yusupancentry,Cosine Similarity,67.36%,76.77%
yusupancentry,Euclidean Distance,83.63%,86.80%
yusupancentry,Manhattan Distance,86.97%,90.85%
yusupancentry,Chebyshev Distance,73.60%,74.78%
yusupancentry,Jaccard Similarity,100.00%,100.00%
yusupancentry,Bray-Curtis Similarity,54.40%,67.98%
yusupancentry,Canberra Distance,41.34%,64.83%
yusupancentry,Correlation Coefficient,30.47%,46.78%
yusupancentry,Minkowski Distance,-7.17%,7.16%
Uygur:HGDP01300,Cosine Similarity,33.31%,37.02%
Uygur:HGDP01300,Euclidean Distance,74.69%,76.18%
Uygur:HGDP01300,Manhattan Distance,81.89%,83.18%
Uygur:HGDP01300,Chebyshev Distance,49.00%,50.18%
Uygur:HGDP01300,Jaccard Similarity,100.00%,100.00%
Uygur:HGDP01300,Bray-Curtis Similarity,36.60%,41.13%
Uygur:HGDP01300,Canberra Distance,42.90%,37.98%
Uygur:HGDP01300,Correlation Coefficient,-15.95%,-15.50%
Uygur:HGDP01300,Minkowski Distance,-76.99%,-69.43%
Han_Fujian:Han1467,Cosine Similarity,0.08%,2.25%
Han_Fujian:Han1467,Euclidean Distance,57.39%,58.50%
Han_Fujian:Han1467,Manhattan Distance,71.49%,71.91%
Han_Fujian:Han1467,Chebyshev Distance,3.40%,4.58%
Han_Fujian:Han1467,Jaccard Similarity,100.00%,100.00%
Han_Fujian:Han1467,Bray-Curtis Similarity,0.20%,1.67%
Han_Fujian:Han1467,Canberra Distance,14.54%,3.11%
Han_Fujian:Han1467,Correlation Coefficient,-36.27%,-37.76%
Han_Fujian:Han1467,Minkowski Distance,-211.99%,-206.52%
Han_Fujian:Han1619,Cosine Similarity,0.40%,2.46%
Han_Fujian:Han1619,Euclidean Distance,57.64%,58.73%
Han_Fujian:Han1619,Manhattan Distance,71.71%,72.13%
Han_Fujian:Han1619,Chebyshev Distance,4.00%,5.18%
Han_Fujian:Han1619,Jaccard Similarity,100.00%,100.00%
Han_Fujian:Han1619,Bray-Curtis Similarity,1.00%,2.47%
Han_Fujian:Han1619,Canberra Distance,15.52%,4.79%
Han_Fujian:Han1619,Correlation Coefficient,-36.09%,-37.74%
Han_Fujian:Han1619,Minkowski Distance,-210.20%,-204.78%
Han_Fujian:Han1900,Cosine Similarity,0.00%,2.19%
Han_Fujian:Han1900,Euclidean Distance,57.06%,58.17%
Han_Fujian:Han1900,Manhattan Distance,71.43%,71.85%
Han_Fujian:Han1900,Chebyshev Distance,2.40%,3.58%
Han_Fujian:Han1900,Jaccard Similarity,100.00%,100.00%
Han_Fujian:Han1900,Bray-Curtis Similarity,0.00%,1.47%
Han_Fujian:Han1900,Canberra Distance,14.29%,3.40%
Han_Fujian:Han1900,Correlation Coefficient,-35.96%,-37.36%
Han_Fujian:Han1900,Minkowski Distance,-214.81%,-209.36%
Han_Fujian:Han1934,Cosine Similarity,0.31%,2.40%
Han_Fujian:Han1934,Euclidean Distance,57.12%,58.21%
Han_Fujian:Han1934,Manhattan Distance,71.66%,72.08%
Han_Fujian:Han1934,Chebyshev Distance,2.40%,3.58%
Han_Fujian:Han1934,Jaccard Similarity,100.00%,100.00%
Han_Fujian:Han1934,Bray-Curtis Similarity,0.80%,2.27%
Han_Fujian:Han1934,Canberra Distance,15.28%,6.27%
Han_Fujian:Han1934,Correlation Coefficient,-35.52%,-37.06%
Han_Fujian:Han1934,Minkowski Distance,-214.70%,-209.31%
Han_Fujian:Han1994,Cosine Similarity,0.16%,2.30%
Han_Fujian:Han1994,Euclidean Distance,57.28%,58.38%
Han_Fujian:Han1994,Manhattan Distance,71.54%,71.96%
Han_Fujian:Han1994,Chebyshev Distance,3.00%,4.18%
Han_Fujian:Han1994,Jaccard Similarity,100.00%,100.00%
Han_Fujian:Han1994,Bray-Curtis Similarity,0.40%,1.87%
Han_Fujian:Han1994,Canberra Distance,14.79%,4.00%
Han_Fujian:Han1994,Correlation Coefficient,-35.99%,-37.50%
Han_Fujian:Han1994,Minkowski Distance,-213.07%,-207.64%

"""

# Use io.StringIO to read the data
df = pd.read_csv(StringIO(data))

# Convert percentage strings to floats
df["Ancient Israelite"] = df["Ancient Israelite"].str.rstrip('%').astype(float)
df["Ashkenazi"] = df["Ashkenazi"].str.rstrip('%').astype(float)

# Plot interactive radar chart using Plotly
def plot_interactive_radar_chart(sample_name):
    sample_data = df[df["Sample"] == sample_name]
    categories = sample_data["Method"].tolist()
    values_israelite = sample_data["Ancient Israelite"].tolist()
    values_ashkenazi = sample_data["Ashkenazi"].tolist()

    # Create a DataFrame for Plotly
    plot_data = pd.DataFrame({
        "Method": categories + categories,
        "Value": values_israelite + values_ashkenazi,
        "Group": ["Ancient Israelite"] * len(categories) + ["Ashkenazi"] * len(categories)
    })

    # Create the radar chart
    fig = px.line_polar(
        plot_data,
        r="Value",
        theta="Method",
        color="Group",
        line_close=True,
        markers=True,
        title=f"Interactive Radar Chart for {sample_name}",
        template="plotly_white",  # Modern and clean template
        range_r=[-20, 100]  # Set range for the radial axis
    )

    # Update layout for better readability
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                tickvals=[0, 20, 40, 60, 80, 100],
                ticktext=["0%", "20%", "40%", "60%", "80%", "100%"],
                tickfont=dict(size=12, color="gray"),
                gridcolor="lightgray",
                linecolor="gray"
            ),
            angularaxis=dict(
                tickfont=dict(size=12, color="black"),
                linecolor="gray",
                gridcolor="lightgray"
            )
        ),
        title_font=dict(size=20, family="Arial", color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode="x unified"  # Show hover info for all groups at once
    )

    # Show the plot
    fig.show()

plot_interactive_radar_chart("yusupancentry")

In [65]:
import io
import csv
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

# Define constants for key variables
REFERENCE_TARGET = 'Khotanese'
ASHKENAZI_IDENTIFIER = 'Uygur:HGDP01300'
NORMALIZATION_FACTOR = 100  # Used for normalization in distance calculations

# Define similarity and distance functions
def cosine_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values())).reshape(1, -1)
    v2 = np.array(list(vector2.values())).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

def euclidean_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.euclidean(v1, v2) / np.sqrt(len(v1) * NORMALIZATION_FACTOR**2)

def manhattan_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.cityblock(v1, v2) / (len(v1) * NORMALIZATION_FACTOR)

def chebyshev_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.chebyshev(v1, v2) / NORMALIZATION_FACTOR

def jaccard_similarity_func(vector1, vector2):
    v1 = set(vector1.keys())
    v2 = set(vector2.keys())
    return len(v1 & v2) / len(v1 | v2)

def bray_curtis_similarity_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.braycurtis(v1, v2)

def canberra_distance_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.canberra(v1, v2) / len(v1)

def correlation_coefficient_func(vector1, vector2):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return np.corrcoef(v1, v2)[0, 1]

def minkowski_distance_func(vector1, vector2, p=3):
    v1 = np.array(list(vector1.values()))
    v2 = np.array(list(vector2.values()))
    return 1 - distance.minkowski(v1, v2, p) / (len(v1) * NORMALIZATION_FACTOR**(1/p))

def main(data):
    # Initialize the CSV reader
    csv_reader = csv.DictReader(io.StringIO(data))

    # Extract reference data
    ancient_israelite = None
    ashkenazi_samples = []
    for row in csv_reader:
        if row['Target'] == REFERENCE_TARGET:
            ancient_israelite = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']}
        elif ASHKENAZI_IDENTIFIER in row['Target']:
            ashkenazi_samples.append({k: float(v) for k, v in row.items() if k not in ['Target', 'Distance']})

    # Validate references
    if ancient_israelite is None or not ashkenazi_samples:
        raise ValueError(f"{REFERENCE_TARGET} or {ASHKENAZI_IDENTIFIER} samples not found in data.")

    # Calculate average Ashkenazi profile
    avg_ashkenazi = {key: sum(sample[key] for sample in ashkenazi_samples) / len(ashkenazi_samples)
                     for key in ashkenazi_samples[0].keys()}

    # Define similarity calculation methods
    methods = {
        'Cosine Similarity': cosine_similarity_func,
        'Euclidean Distance': euclidean_distance_func,
        'Manhattan Distance': manhattan_distance_func,
        'Chebyshev Distance': chebyshev_distance_func,
        'Jaccard Similarity': jaccard_similarity_func,
        'Bray-Curtis Similarity': bray_curtis_similarity_func,
        'Canberra Distance': canberra_distance_func,
        'Correlation Coefficient': correlation_coefficient_func,
        'Minkowski Distance': minkowski_distance_func,
    }

    # Reset CSV reader
    csv_reader = csv.DictReader(io.StringIO(data))

    # Write results to CSV
    with open('similarity_results.csv', 'w', newline='') as csvfile:
        fieldnames = ['Sample', 'Method', 'Khotanese', 'Uygur:HGDP01300']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Process each sample
        for row in csv_reader:
            if row['Target'] != REFERENCE_TARGET and ASHKENAZI_IDENTIFIER not in row['Target']:
                sample = {k: float(v) for k, v in row.items() if k not in ['Target', 'Distance'] and v}

                # Skip samples with invalid data
                if not sample:
                    print(f"Skipping sample {row['Target']} due to missing or invalid data.")
                    continue

                for method_name, method in methods.items():
                    try:
                        israelite_sim = method(sample, ancient_israelite)
                        ashkenazi_sim = method(sample, avg_ashkenazi)
                        writer.writerow({
                            'Sample': row['Target'],
                            'Method': method_name,
                            'Khotanese': f"{israelite_sim:.2%}",
                            'Uygur:HGDP01300': f"{ashkenazi_sim:.2%}"
                        })
                    except Exception as e:
                        print(f"Error processing {row['Target']} with {method_name}: {e}")


if __name__ == "__main__":
    # Example dataset
    data = """Target,Distance,Zagros_Neolithic_Farmers,Eastern_European_Hunter_Gatherer,Siberian_Hunter_Gatherer_ANE,Caucasus_Hunter_Gatherer,Anatolia_Neolithic_Farmers,Natufian_Hunter_Gatherer,North_African_Neolithic_Farmer
Ancient_Israelite,0.02534301,16.4,0.0,0.0,11.6,49.8,22.2,0.0
yusupancentry,0.10122265,12.8,18.2,26.4,9.6,26.2,5.6,1.2
Ashkenazi_Austria:Austrian_Jew_ashkenazy2w,0.03792004,11.6,11.2,1.2,11.4,50.4,14.2,0.0
Ashkenazi_Austria:Austrian_Jew_ashkenazy3w,0.03413616,13.2,12.8,0.6,11.6,44.2,17.6,0.0
Ashkenazi_Belarussia:Ashk_BY_BY_1,0.03536681,13.6,14.6,1.6,10.2,50.6,9.4,0.0
Ashkenazi_Belarussia:Ashk_BY_BY_10,0.03466870,9.4,15.0,2.0,12.8,49.4,11.4,0.0
Ashkenazi_Belarussia:Ashk_BY_BY_8,0.03860332,8.6,12.2,1.8,16.4,47.8,11.8,1.4
Ashkenazi_Belarussia:Ashk_BY_BY_9,0.03848156,12.4,14.6,1.6,9.8,43.4,18.2,0.0
Ashkenazi_France:ashkenazy4w,0.02649095,12.6,10.8,0.6,11.4,48.6,16.0,0.0
Ashkenazi_France:FranceJewF38,0.04010708,14.4,11.0,0.0,9.2,51.0,13.4,1.0
Ashkenazi_France:GRC12118117,0.02783109,11.6,10.6,0.0,10.2,51.4,15.6,0.6
Ashkenazi_Germany:Ashk_DE_DE_1,0.02414584,13.8,9.0,0.0,10.4,49.4,17.4,0.0
Ashkenazi_Germany:Ashk_DE_DE_10,0.03574846,12.0,11.6,1.0,10.4,49.8,15.2,0.0
Ashkenazi_Germany:Ashk_DE_DE_9,0.03890542,10.4,9.4,0.0,15.4,49.4,13.6,1.8
Ashkenazi_Latvia:Latvian_Jew_ashkenazy3e,0.03567490,13.8,11.4,0.2,12.8,48.0,13.6,0.2
Ashkenazi_Latvia:Latvian_Jew_ashkenazy5e,0.04101438,13.2,10.8,2.2,11.8,46.4,15.6,0.0
Ashkenazi_Lithuania:Ashk_LT_LT_1,0.03472711,13.6,15.0,2.0,10.4,45.0,14.0,0.0
Ashkenazi_Lithuania:Ashk_LT_LT_8,0.03487516,14.0,12.4,0.8,12.0,43.8,17.0,0.0
Ashkenazi_Lithuania:Ashk_LT_LT_9,0.03503639,13.2,11.8,1.4,11.8,50.4,9.2,2.2
Ashkenazi_Poland:Ashk_PL_PL_1,0.03490842,10.0,13.4,2.4,14.4,46.4,13.4,0.0
Ashkenazi_Poland:Ashk_PL_PL_10,0.03568202,17.2,13.0,0.6,7.2,49.0,13.0,0.0
Ashkenazi_Russia:Ashk_RU_RU_1,0.03826277,13.4,11.6,1.8,12.4,47.0,13.8,0.0
Ashkenazi_Russia:Ashk_RU_RU_10,0.03458646,11.4,14.8,2.0,14.4,44.4,13.0,0.0
Ashkenazi_Russia:Ashk_RU_RU_11,0.03096678,9.8,12.8,2.0,11.8,51.0,12.6,0.0
Ashkenazi_Russia:Ashk_RU_RU_2,0.03698448,12.4,15.0,1.0,12.6,47.6,11.4,0.0
Ashkenazi_Ukraine:Ashk_UA_UA_8,0.04367551,9.6,14.8,2.2,14.0,47.0,12.4,0.0
Ashkenazi_Ukraine:Ashk_UA_UA_9,0.03320646,13.2,13.8,0.6,10.4,50.2,11.8,0.0
Uygur:HGDP01300,0.06349086,21.4,7.4,51.0,6.2,14.0,0.0,0.0
Khotanese,0.06008793,27.0,23.4,20.2,18.8,10.6,0.0,0.0

"""
    main(data)


In [66]:
import pandas as pd
import numpy as np
import plotly.express as px
from io import StringIO

data = """Sample,Method,Khotanese,Uygur:HGDP01300
Ancient_Israelite,Cosine Similarity,44.12%,33.31%
Ancient_Israelite,Euclidean Distance,78.79%,74.69%
Ancient_Israelite,Manhattan Distance,82.46%,81.89%
Ancient_Israelite,Chebyshev Distance,60.80%,49.00%
Ancient_Israelite,Jaccard Similarity,100.00%,100.00%
Ancient_Israelite,Bray-Curtis Similarity,38.60%,36.60%
Ancient_Israelite,Canberra Distance,41.00%,42.90%
Ancient_Israelite,Correlation Coefficient,-20.22%,-15.95%
Ancient_Israelite,Minkowski Distance,-39.84%,-76.99%
yusupancentry,Cosine Similarity,84.94%,84.22%
yusupancentry,Euclidean Distance,90.53%,88.11%
yusupancentry,Manhattan Distance,91.83%,90.51%
yusupancentry,Chebyshev Distance,84.40%,75.40%
yusupancentry,Jaccard Similarity,100.00%,100.00%
yusupancentry,Bray-Curtis Similarity,71.40%,66.80%
yusupancentry,Canberra Distance,51.96%,49.86%
yusupancentry,Correlation Coefficient,51.91%,71.66%
yusupancentry,Minkowski Distance,38.46%,18.14%

"""

# Use io.StringIO to read the data
df = pd.read_csv(StringIO(data))

# Convert percentage strings to floats
df["Uygur:HGDP01300"] = df["Uygur:HGDP01300"].str.rstrip('%').astype(float)
df["Khotanese"] = df["Khotanese"].str.rstrip('%').astype(float)

# Plot interactive radar chart using Plotly
def plot_interactive_radar_chart(sample_name):
    sample_data = df[df["Sample"] == sample_name]
    categories = sample_data["Method"].tolist()
    values_israelite = sample_data["Uygur:HGDP01300"].tolist()
    values_ashkenazi = sample_data["Khotanese"].tolist()

    # Create a DataFrame for Plotly
    plot_data = pd.DataFrame({
        "Method": categories + categories,
        "Value": values_israelite + values_ashkenazi,
        "Group": ["Uygur:HGDP01300"] * len(categories) + ["Khotanese"] * len(categories)
    })

    # Create the radar chart
    fig = px.line_polar(
        plot_data,
        r="Value",
        theta="Method",
        color="Group",
        line_close=True,
        markers=True,
        title=f"Interactive Radar Chart for {sample_name}",
        template="plotly_white",  # Modern and clean template
        range_r=[-20, 100]  # Set range for the radial axis
    )

    # Update layout for better readability
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                tickvals=[0, 20, 40, 60, 80, 100],
                ticktext=["0%", "20%", "40%", "60%", "80%", "100%"],
                tickfont=dict(size=12, color="gray"),
                gridcolor="lightgray",
                linecolor="gray"
            ),
            angularaxis=dict(
                tickfont=dict(size=12, color="black"),
                linecolor="gray",
                gridcolor="lightgray"
            )
        ),
        title_font=dict(size=20, family="Arial", color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode="x unified"  # Show hover info for all groups at once
    )

    # Show the plot
    fig.show()

plot_interactive_radar_chart("yusupancentry")

In [64]:
import pandas as pd
import numpy as np
import plotly.express as px
from io import StringIO

data = """Sample,Method,Khotanese,Ashkenazi
Ancient_Israelite,Cosine Similarity,44.12%,96.28%
Ancient_Israelite,Euclidean Distance,78.79%,94.04%
Ancient_Israelite,Manhattan Distance,82.46%,95.94%
Ancient_Israelite,Chebyshev Distance,60.80%,87.46%
Ancient_Israelite,Jaccard Similarity,100.00%,100.00%
Ancient_Israelite,Bray-Curtis Similarity,38.60%,85.78%
Ancient_Israelite,Canberra Distance,41.00%,51.40%
Ancient_Israelite,Correlation Coefficient,-20.22%,93.53%
Ancient_Israelite,Minkowski Distance,-39.84%,57.46%
yusupancentry,Cosine Similarity,84.94%,76.81%
yusupancentry,Euclidean Distance,90.53%,86.80%
yusupancentry,Manhattan Distance,91.83%,90.78%
yusupancentry,Chebyshev Distance,84.40%,74.78%
yusupancentry,Jaccard Similarity,100.00%,100.00%
yusupancentry,Bray-Curtis Similarity,71.40%,67.74%
yusupancentry,Canberra Distance,51.96%,63.58%
yusupancentry,Correlation Coefficient,51.91%,46.93%
yusupancentry,Minkowski Distance,38.46%,7.41%
Uygur:HGDP01300,Cosine Similarity,76.26%,37.01%
Uygur:HGDP01300,Euclidean Distance,85.83%,76.18%
Uygur:HGDP01300,Manhattan Distance,90.23%,83.18%
Uygur:HGDP01300,Chebyshev Distance,69.20%,50.18%
Uygur:HGDP01300,Jaccard Similarity,100.00%,100.00%
Uygur:HGDP01300,Bray-Curtis Similarity,65.80%,41.12%
Uygur:HGDP01300,Canberra Distance,75.57%,37.98%
Uygur:HGDP01300,Correlation Coefficient,52.46%,-15.52%
Uygur:HGDP01300,Minkowski Distance,-1.18%,-69.43%

"""

# Use io.StringIO to read the data
df = pd.read_csv(StringIO(data))

# Convert percentage strings to floats
df["Ashkenazi"] = df["Ashkenazi"].str.rstrip('%').astype(float)
df["Khotanese"] = df["Khotanese"].str.rstrip('%').astype(float)

# Plot interactive radar chart using Plotly
def plot_interactive_radar_chart(sample_name):
    sample_data = df[df["Sample"] == sample_name]
    categories = sample_data["Method"].tolist()
    values_israelite = sample_data["Ashkenazi"].tolist()
    values_ashkenazi = sample_data["Khotanese"].tolist()

    # Create a DataFrame for Plotly
    plot_data = pd.DataFrame({
        "Method": categories + categories,
        "Value": values_israelite + values_ashkenazi,
        "Group": ["Ashkenazi"] * len(categories) + ["Khotanese"] * len(categories)
    })

    # Create the radar chart
    fig = px.line_polar(
        plot_data,
        r="Value",
        theta="Method",
        color="Group",
        line_close=True,
        markers=True,
        title=f"Interactive Radar Chart for {sample_name}",
        template="plotly_white",  # Modern and clean template
        range_r=[-20, 100]  # Set range for the radial axis
    )

    # Update layout for better readability
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                tickvals=[0, 20, 40, 60, 80, 100],
                ticktext=["0%", "20%", "40%", "60%", "80%", "100%"],
                tickfont=dict(size=12, color="gray"),
                gridcolor="lightgray",
                linecolor="gray"
            ),
            angularaxis=dict(
                tickfont=dict(size=12, color="black"),
                linecolor="gray",
                gridcolor="lightgray"
            )
        ),
        title_font=dict(size=20, family="Arial", color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode="x unified"  # Show hover info for all groups at once
    )

    # Show the plot
    fig.show()

plot_interactive_radar_chart("yusupancentry")