## Kolmogorov-Smirnov Test

In [15]:
import numpy as np
from scipy.stats import ks_2samp

# Example datasets
data1 = np.random.normal(0, 1, 1000)
data2 = np.random.normal(0.2, 1, 1000)

# Perform the KS test
ks_statistic, p_value = ks_2samp(data1, data2)

print(f"KS Statistic: {ks_statistic}")
print(f"P-Value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("The distributions are different.")
else:
    print("The distributions are the same.")


KS Statistic: 0.067
P-Value: 0.022438659451142425
The distributions are different.


In [13]:
import numpy as np
from scipy.stats import mannwhitneyu

# Example datasets
data1 = np.random.normal(0, 1, 1000)
data2 = np.random.normal(0.1, 1, 1000)

# Perform the Mann-Whitney U test
u_statistic, p_value = mannwhitneyu(data1, data2)

print(f"U Statistic: {u_statistic}")
print(f"P-Value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("The distributions are different.")
else:
    print("The distributions are the same.")


U Statistic: 472850.0
P-Value: 0.035512503287260064
The distributions are different.


In [16]:
import pandas as pd
from data import CPP
data = pd.read_csv("/home/amirka/CPP/CPPLM/data/cpp.csv").T.to_dict()
cpps = [CPP(datapoint["sequence"], datapoint['intensity']) for datapoint in data.values()]
seqs = [cpp['sequence'] for cpp in cpps]
intensities = [cpp['intensity'] for cpp in cpps]

In [58]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp
from tokenization.tokenizer import CPPTokenizer

train_data, temp_data = train_test_split(intensities, test_size=0.4)
val_data, test_data = train_test_split(temp_data, test_size=0.5)

# Verify the distribution using the Kolmogorov-Smirnov test
train_val_ks_stat, train_val_p_value = ks_2samp(train_data, val_data)
train_test_ks_stat, train_test_p_value = ks_2samp(train_data, test_data)
val_test_ks_stat, val_test_p_value = ks_2samp(val_data, test_data)

print(f"Train vs. Validation KS Statistic: {train_val_ks_stat}, P-Value: {train_val_p_value}")
print(f"Train vs. Test KS Statistic: {train_test_ks_stat}, P-Value: {train_test_p_value}")
print(f"Validation vs. Test KS Statistic: {val_test_ks_stat}, P-Value: {val_test_p_value}")

# Interpretation
if train_val_p_value >= 0.1 and train_test_p_value >= 0.1 and val_test_p_value >= 0.1:
    print("The distributions are similar across training, validation, and test sets.")
else:
    print("There is a significant difference in the distributions.")

NameError: name '__file__' is not defined