In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

import os

np.random.seed(42)

In [2]:

# Read the two training CSVs that share a `record_name` column.
gts = pd.read_csv("./train/train_gts.csv")
meta = pd.read_csv("./train/train_meta.csv")

# Join them on the record identifier (pandas' default join type is inner).
df = pd.merge(meta, gts, on="record_name")

In [3]:
# Test-set table; it is later fed to the test Transformator, which reads its `record_name` column.
df_test = pd.read_csv("./test/test_meta.csv")

In [4]:
def load_record(record_name, is_train):
    """Load the NumPy array stored on disk for a single record.

    Args:
        record_name: File stem of the record; ``.npy`` is appended to it.
        is_train: Truthy to read from ``./train/``, falsy to read from ``./test/``.

    Returns:
        The object produced by ``np.load`` for that file.
    """
    record_path = f"./train/{record_name}.npy" if is_train else f"./test/{record_name}.npy"
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
    # execute arbitrary code -- acceptable only if the .npy files are trusted.
    with open(record_path, "rb") as handle:
        return np.load(handle, allow_pickle=True)

def print_beat(x, y, ecg_channel):
    """Plot ``y`` against ``x`` on a fresh single-axes figure.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        ecg_channel: Channel identifier shown in the figure title.
    """
    _, axes = plt.subplots()
    axes.plot(x, y)
    axes.set_title(f"ECG channel {ecg_channel}")

In [6]:
import multiprocessing



class Transformator():
    """Pipeline from a dataframe of records to transformed ECG arrays saved on disk.

    Every record named in the dataframe is passed through ``transformation_func``.
    Each column (axis 1) of the returned array is written to its own ``.npy``
    file, and the dataframe is expanded to one row per written file so it can be
    used as the annotation table for model training.
    """

    def __init__(self, args, transformation_func, is_train):
        """Store the pipeline configuration.

        Args:
            args: Dict of keyword arguments forwarded to ``transformation_func``.
            transformation_func: Callable ``f(record_name, **args)`` returning an
                array with at least two dimensions.
            is_train: Truthy to write under ``./transformed_train/``, otherwise
                under ``./transformed_test/``.
        """
        self.is_train = is_train
        self.args = args
        self.transformation_func = transformation_func

    def run_pipeline(self, df):
        """Run ``pipeline_ecg`` with the transformation supplied at construction."""
        return self.pipeline_ecg(self.transformation_func, df)

    def pipeline_ecg(self, transformation_func, df):
        """Transform every record in ``df`` and save the pieces as ``.npy`` files.

        Args:
            transformation_func: Callable ``f(record_name, **self.args)`` that
                turns a record name into an array; per the original author's
                note the beat-extraction preprocessing yields shape [12, 9].
            df: DataFrame with a ``record_name`` column. It is not modified.

        Returns:
            A copy of ``df`` with a constant ``correct_transformation`` column
            (always True) and a ``new_name`` column, exploded so that there is
            one row per saved file ``<record_name>_n<i>``.
        """
        out_dir = "./transformed_train/" if self.is_train else "./transformed_test/"
        # exist_ok=True removes the check-then-create race that occurs when
        # several worker processes enter this method at the same time.
        os.makedirs(out_dir, exist_ok=True)

        result_df = df.copy()
        result_df['correct_transformation'] = True

        new_names_column = []
        for record_name in tqdm(result_df['record_name'].to_list()):
            transformed = transformation_func(record_name, **self.args)

            # One output file per index along axis 1 of the transformed array.
            new_names = []
            for i in range(transformed.shape[1]):
                name = f"{record_name}_n{i}"
                np.save(out_dir + name + ".npy", transformed[:, i])
                new_names.append(name)
            new_names_column.append(new_names)
        result_df["new_name"] = new_names_column

        # Expand the per-record lists into one row per saved file.
        return result_df.explode("new_name", ignore_index=True)

    def check_correctnes(self, df):
        """Smoke-test the pipeline on the first three rows of ``df``.

        The output files for those rows are still written; nothing is returned.
        """
        self.run_pipeline(df.iloc[:3])


In [7]:
from preprocessing_with_beats import PipelineBeatExtraction

# One beat-extraction pipeline per split; the two differ only in the data prefix.
pipeline = PipelineBeatExtraction(prefix="./train", noise_level=2)
pipeline_test = PipelineBeatExtraction(prefix="./test", noise_level=2)

# Their run_pipeline attributes act as the per-record transformation callables.
pipeline_func = pipeline.run_pipeline
pipeline_func_test = pipeline_test.run_pipeline

# The empty dict means no extra keyword arguments are forwarded to the callables.
transformator = Transformator(args={}, transformation_func=pipeline_func, is_train=True)
transformator_test = Transformator(args={}, transformation_func=pipeline_func_test, is_train=False)


Проверка на правильность работы.

In [8]:
# Smoke test: runs the train pipeline on the first three rows of df (their .npy files are written).
transformator.check_correctnes(df)

  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Smoke test: runs the test pipeline on the first three rows of df_test (their .npy files are written).
transformator_test.check_correctnes(df_test)

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Number of worker processes; the data is split into the same number of chunks.
process_num = 12


def process_dataframe(df):
    """Run the train transformation pipeline on one chunk of the dataframe."""
    result_df = transformator.pipeline_ecg(transformator.transformation_func, df)
    return result_df


# Split row positions instead of the frame itself: applying np.array_split
# directly to a DataFrame triggers a deprecation warning in recent pandas.
dfs = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), process_num)]

# The context manager tears the pool down even if a worker raises. imap yields
# the chunk results in submission order; total= lets tqdm show real progress.
with multiprocessing.Pool(processes=process_num) as pool:
    result_dfs = list(tqdm(pool.imap(process_dataframe, dfs), total=len(dfs)))

result_df = pd.concat(result_dfs)
result_df.to_csv("transformed_df.csv")

0it [00:00, ?it/s]

In [11]:
# NOTE: this rebinds `process_dataframe`, which the previous cell defined for
# the train split; the name is kept so existing references still resolve.
def process_dataframe(df):
    """Run the test transformation pipeline on one chunk of the dataframe."""
    result_df = transformator_test.pipeline_ecg(transformator_test.transformation_func, df)
    return result_df


# Split row positions instead of the frame itself: applying np.array_split
# directly to a DataFrame triggers a deprecation warning in recent pandas.
test_dfs = [
    df_test.iloc[rows]
    for rows in np.array_split(np.arange(len(df_test)), process_num)
]

# The context manager tears the pool down even if a worker raises. imap yields
# the chunk results in submission order; total= lets tqdm show real progress.
with multiprocessing.Pool(processes=process_num) as pool:
    result_test_dfs = list(
        tqdm(pool.imap(process_dataframe, test_dfs), total=len(test_dfs))
    )

result_test_df = pd.concat(result_test_dfs)
result_test_df.to_csv("transformed_test_df.csv")

0it [00:00, ?it/s]

In [12]:
def pipeline_annotations(df, val_folds=(8, 9, 10), train_folds=tuple(range(1, 8))):
    """Split annotations into train/validation sets by fold and save them as CSV.

    Keeps the ``new_name``, ``strat_fold`` and ``myocard`` columns, selects rows
    by fold, shuffles each subset, writes ``./val_annotations.csv`` and
    ``./train_annotations.csv`` (without the index), and prints summary
    statistics. Rows whose fold is in neither set are dropped.

    Args:
        df: DataFrame containing ``new_name``, ``strat_fold`` and ``myocard``.
        val_folds: Fold values that form the validation set.
        train_folds: Fold values that form the training set.

    Returns:
        None.
    """
    annotations = df[['new_name', 'strat_fold', 'myocard']]
    fold = annotations['strat_fold']

    # sample(frac=1) shuffles the rows. Validation is drawn before train, as in
    # the original code, so the sequence of random draws is unchanged.
    val_annotations = annotations[fold.isin(list(val_folds))].sample(frac=1)
    train_annotations = annotations[fold.isin(list(train_folds))].sample(frac=1)

    val_annotations.to_csv('./val_annotations.csv', index=False)
    train_annotations.to_csv('./train_annotations.csv', index=False)

    print("number of validation samples:", len(val_annotations))
    print("number of train samples:", len(train_annotations))

    # Guard the ratios so an empty input reports nan instead of raising.
    total = len(annotations)
    val_share = len(val_annotations) / total if total else float("nan")
    train_share = len(train_annotations) / total if total else float("nan")
    print("validation percentage:", val_share)
    print("train percentage:", train_share)

    print("VALIDATION myocard percentage", val_annotations["myocard"].value_counts(normalize=True))
    print("TRAIN myocard percentage", train_annotations["myocard"].value_counts(normalize=True))


In [13]:
def downsample(df, fraction=1, random_state=None):
    """Downsample the majority class (``myocard == 0``).

    Args:
        df: DataFrame with a ``myocard`` column holding 0 (majority) and
            1 (minority) labels.
        fraction: Number of majority rows to keep, expressed as a multiple of
            the minority-class size (1 gives balanced classes).
        random_state: Optional seed/RNG forwarded to ``DataFrame.sample``.

    Returns:
        DataFrame with the sampled majority rows followed by all minority rows.
    """
    majority_class = df[df['myocard'] == 0]
    minority_class = df[df['myocard'] == 1]

    # Sampling is without replacement, so never ask for more majority rows than
    # exist; uncapped, a large `fraction` makes DataFrame.sample raise.
    requested = int(len(minority_class) * fraction)
    n_majority = min(requested, len(majority_class))

    downsampled_majority = majority_class.sample(n=n_majority, random_state=random_state)

    return pd.concat([downsampled_majority, minority_class], axis=0)

def oversample(df, random_state=None):
    """Oversample the minority class (``myocard == 1``) by duplicating rows.

    Args:
        df: DataFrame with a ``myocard`` column holding 0 (majority) and
            1 (minority) labels.
        random_state: Optional seed/RNG forwarded to ``DataFrame.sample``.

    Returns:
        DataFrame with all majority rows, all original minority rows, and enough
        resampled minority rows for both classes to have the same size. If the
        minority class is already at least as large, the rows are returned
        without any duplication.

    Raises:
        ValueError: If there are majority rows but no minority rows to copy.
    """
    majority_class = df[df['myocard'] == 0]
    minority_class = df[df['myocard'] == 1]

    # Number of extra minority rows needed to match the majority class.
    deficit = len(majority_class) - len(minority_class)
    if deficit <= 0:
        return pd.concat([majority_class, minority_class], axis=0)
    if minority_class.empty:
        raise ValueError("cannot oversample: no rows with myocard == 1")

    extra_minority = minority_class.sample(
        n=deficit, replace=True, random_state=random_state
    )

    # Keep the original minority rows as well; concatenating only the resampled
    # rows would leave the minority class at (majority - minority) rows.
    return pd.concat([majority_class, minority_class, extra_minority], axis=0)


In [14]:
# Balance the classes: keep as many majority rows (myocard == 0) as there are minority rows.
downsampled_df = downsample(result_df, fraction=1)
# Disabled alternative that balances by duplicating minority rows instead:
#oversampled_df = oversample(result_df)


In [15]:
# Write the train/validation annotation CSVs from the balanced frame and print the split statistics.
pipeline_annotations(downsampled_df)

number of validation samples: 2795
number of train samples: 5851
validation percentage: 0.3232708767059912
train percentage: 0.6767291232940088
VALIDATION myocard percentage 1    0.514132
0    0.485868
Name: myocard, dtype: float64
TTRAIN myocard percentage 0    0.506751
1    0.493249
Name: myocard, dtype: float64
