In [2]:
import datetime
import matplotlib as plt

import os
# Runtime configuration through environment variables. These assignments run
# before the heavier libraries imported further down (e.g. torch), so keep
# them ahead of those imports.
os.environ["NCCL_DEBUG"] = "INFO"  # presumably enables NCCL info-level logging — confirm against NCCL docs
os.environ["OMPI_MCA_opal_cuda_support"] = "true"  # presumably enables CUDA support in Open MPI — confirm
os.environ["CONDA_OVERRIDE_GLIBC"] = "2.56"  # NOTE(review): unusual value for a glibc version — confirm it is intended

import pickle
import random
import subprocess

from datasets import Dataset
import pandas as pd
import numpy as np
import pytz
import torch
from datasets import load_dataset
from datasets import load_from_disk

Sampled and processed for binary classification

In [30]:
def uniform_fixed_sampling(data, window_size, num_sections):
    """Build interleaved sub-sequences by picking one element per section.

    ``data`` is cut into ``num_sections`` consecutive chunks with
    ``np.array_split`` (chunks differ in length by at most one element).
    Sub-sequence ``k`` is made of the ``k``-th element of every chunk, for
    ``k`` in ``0 .. window_size - 1``.

    Args:
        data: 1-D sequence to sample from.
        window_size: Number of sub-sequences to produce, i.e. how many leading
            positions of each chunk are used. Elements of a chunk beyond this
            position are not used.
        num_sections: Number of chunks ``data`` is split into; this is also
            the length of every returned sub-sequence.

    Returns:
        A list of ``window_size`` lists, each holding ``num_sections`` values.

    Raises:
        IndexError: If any chunk holds fewer than ``window_size`` elements.
    """
    chunks = np.array_split(data, num_sections)
    return [
        [chunk[offset] for chunk in chunks]
        for offset in range(window_size)
    ]

# Source dataset to sub-sample and destination of the sub-sampled copy.
sample_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_finetune"
save_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_finetune_subSampleV3_2cls"

train_dataset = load_from_disk(sample_path)

# Look each column up once and reuse it, instead of indexing the dataset
# again for the length probe and for the loop.
all_input_ids = train_dataset['input_ids']
all_types = train_dataset['types']

# Determine the number of sections based on the window size
original_length = len(all_input_ids[0])  # assuming all sequences have the same length
window_size = 3
num_sections = original_length // window_size

# Accumulators for the sub-sampled sequences and their labels
sampled_input_ids = []
sampled_types = []

for input_ids, type_ in zip(all_input_ids, all_types):
    # Every source sequence yields `window_size` interleaved sub-sequences
    sampled_ids = uniform_fixed_sampling(input_ids, window_size, num_sections)

    sampled_input_ids.extend(sampled_ids)
    # Each sub-sequence inherits the label of the sequence it came from
    sampled_types.extend([type_] * len(sampled_ids))

# Create a new dataset from the sampled data
sampled_dataset = Dataset.from_dict({
    'input_ids': sampled_input_ids,
    'types': sampled_types
})

# Save the new dataset
sampled_dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/570 [00:00<?, ? examples/s]

In [32]:
from collections import Counter
train_datsset_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_finetune_subSampleV3_2cls"
train_dataset = load_from_disk(train_datsset_path)

# Tally the labels in one pass over the column. Indexing
# train_dataset['types'][i] inside a loop fetches the column again on every
# iteration, which is needlessly slow.
type_counts = Counter(train_dataset['types'])
target_names = set(type_counts)

# A Counter returns 0 for a label that never occurs, so a missing class
# simply shows up as 0 in the displayed tuple.
cnt_0, cnt_1, cnt_2 = type_counts[0], type_counts[1], type_counts[2]
cnt_0, cnt_1, cnt_2

(270, 300, 0)

Subsequent sequence sampling analysis

In [3]:
# Load the downsampled dataset and display a quick shape summary.
train_datsset_path = "/share/home/liangzhongming/930/CGMformer/data/8_7_data/Shanghai_downsampled_144"
train_dataset = load_from_disk(train_datsset_path)
(
    train_dataset,                        # dataset repr (features, num_rows)
    len(train_dataset['input_ids']),      # number of sequences
    len(train_dataset['input_ids'][0]),   # length of the first sequence
    len(train_dataset['types']),          # number of labels
)

(Dataset({
     features: ['id', 'types', 'age', 'bmi', 'hba1c', 'homa-b', 'homa-is', 'index', 'Fast_s', 'Fast_e', 'Dawn_s', 'Dawn_e', 'Breakfast_s', 'Breakfast_e', 'Lunch_s', 'Lunch_e', 'Dinner_s', 'Dinner_e', 'input_ids'],
     num_rows: 1981
 }),
 1981,
 144,
 1981)

In [9]:
# Load the sub-sampled training dataset and display a quick shape summary.
train_datsset_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_train_subSample"
train_dataset = load_from_disk(train_datsset_path)
(
    train_dataset,                        # dataset repr (features, num_rows)
    len(train_dataset['input_ids']),      # number of sequences
    len(train_dataset['input_ids'][0]),   # length of the first sequence
    len(train_dataset['types']),          # number of labels
    train_dataset['types'][0],            # label of the first sequence
)

(Dataset({
     features: ['input_ids', 'types'],
     num_rows: 13480
 }),
 13480,
 96,
 13480,
 1)

In [None]:
from datasets import Dataset

def sliding_window_sampling(data, window_size, step_size):
    """Collect fixed-length windows of ``data`` taken at a regular stride.

    Windows start at indices ``0, step_size, 2 * step_size, ...`` and only
    start positions that leave room for a full ``window_size`` slice are used.

    Args:
        data: Sliceable sequence to sample from.
        window_size: Number of items in each window.
        step_size: Distance between consecutive window starts.

    Returns:
        A list of slices of ``data``; empty when ``data`` is shorter than
        ``window_size`` and ``step_size`` is positive.

    Raises:
        ValueError: If ``step_size`` is 0 (raised by ``range``).
    """
    last_start = len(data) - window_size
    return [
        data[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]

# NOTE(review): this cell uses whatever `train_dataset` an earlier cell left in
# the kernel. The cell directly above loads ".../Shanghai_train_subSample" —
# the very path this cell writes to, holding sequences that are already 96
# long. Confirm that the intended (longer) source dataset is loaded before
# running this cell.

# Look each column up once and reuse it for the length probe and the loop.
all_input_ids = train_dataset['input_ids']
all_types = train_dataset['types']

# Determine step size based on the number of required samples
original_length = len(all_input_ids[0])  # assuming all sequences have the same length
num_samples = 10
window_size = 96

if original_length <= window_size:
    # Without this guard a zero step crashes range(), and a negative step
    # silently produces windows built from negative slice starts.
    raise ValueError(
        f"Sequences of length {original_length} leave no room to slide a "
        f"window of {window_size}; check that train_dataset is the source dataset."
    )

# The floor division yields 0 when fewer than num_samples - 1 positions are
# available; fall back to the smallest valid stride in that case (fewer than
# num_samples windows are produced then).
step_size = max(1, (original_length - window_size) // (num_samples - 1))

# Accumulators for the sampled windows and their labels
sampled_input_ids = []
sampled_types = []

for input_ids, type_ in zip(all_input_ids, all_types):
    # Cut the sequence into overlapping windows
    sampled_ids = sliding_window_sampling(input_ids, window_size, step_size)

    sampled_input_ids.extend(sampled_ids)
    # Each window inherits the label of the sequence it came from
    sampled_types.extend([type_] * len(sampled_ids))

# Create a new dataset from the sampled data
sampled_dataset = Dataset.from_dict({
    'input_ids': sampled_input_ids,
    'types': sampled_types
})

# Save the new dataset
sampled_dataset.save_to_disk("/share/home/liangzhongming/930/CGMformer/data/Shanghai_train_subSample")


In [12]:
from scipy.stats import ks_2samp
from statsmodels.tsa.stattools import acf, adfuller
from scipy.signal import welch
from scipy.stats import chisquare

sample_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_train"
save_path = "/share/home/liangzhongming/930/CGMformer/data/Shanghai_train_subSampleV3"

train_dataset = load_from_disk(sample_path)
sampled_dataset = load_from_disk(save_path)


def _sequence_statistics(sequences):
    """Compute per-sequence summary statistics for a collection of sequences.

    Args:
        sequences: Iterable of 1-D numeric sequences (iterated several times).

    Returns:
        A 5-tuple of lists, one entry per sequence:
        (means, variances, ACFs up to lag 50, ADF-test p-values, Welch PSDs).
    """
    means = [np.mean(seq) for seq in sequences]
    variances = [np.var(seq) for seq in sequences]
    acfs = [acf(seq, nlags=50) for seq in sequences]
    p_values = [adfuller(seq)[1] for seq in sequences]  # index 1 is the p-value
    psds = [welch(seq)[1] for seq in sequences]  # index 1 selects the PSD, dropping the frequencies
    return means, variances, acfs, p_values, psds


# Read the `input_ids` column of each dataset once and reuse it for every
# statistic, instead of indexing the dataset again per statistic.
(original_means, original_vars, original_acfs,
 original_p_values, original_psds) = _sequence_statistics(train_dataset['input_ids'])

(sampled_means, sampled_vars, sampled_acfs,
 sampled_p_values, sampled_psds) = _sequence_statistics(sampled_dataset['input_ids'])

# Compare distributions of means, vars, acfs and psds using Kolmogorov-Smirnov test.
# ACF and PSD values are pooled across all sequences before testing.
ks_result_means = ks_2samp(original_means, sampled_means)
ks_result_vars = ks_2samp(original_vars, sampled_vars)
ks_result_acfs = ks_2samp(np.concatenate(original_acfs), np.concatenate(sampled_acfs))
ks_result_psds = ks_2samp(np.concatenate(original_psds), np.concatenate(sampled_psds))

# Print KS test results
print("KS test result for means:", ks_result_means)
print("KS test result for vars:", ks_result_vars)
print("KS test result for ACFs:", ks_result_acfs)
print("KS test result for PSDs:", ks_result_psds)



KS test result for means: KstestResult(statistic=0.003214638971315529, pvalue=1.0, statistic_location=103.33749999999999, statistic_sign=1)
KS test result for vars: KstestResult(statistic=0.004698318496538081, pvalue=1.0, statistic_location=450.92937500000005, statistic_sign=1)
KS test result for ACFs: KstestResult(statistic=0.27097515564089136, pvalue=0.0, statistic_location=0.1796633494112581, statistic_sign=-1)
KS test result for PSDs: KstestResult(statistic=0.31095972306526637, pvalue=0.0, statistic_location=17.56655374127823, statistic_sign=1)
