In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

import os

np.random.seed(42)
import warnings
warnings.filterwarnings("ignore")

In [170]:
#{"A":"перегородочный", "B":"передний", "C":"боковой", "D":"передне-боковой", "E":"пердне-перегородочный", "F":"нижний"}
# gts = pd.read_csv("../task_final/train/train_gts.csv")
gts_final = gts = pd.read_csv("./train/train_gts_final.csv")
meta = pd.read_csv("./train/train_meta.csv")

df = meta.merge(gts_final, on='record_name')

Количество записей, где только одна единичка

In [171]:
df[['перегородочный', "передний", "боковой", "передне-боковой", "передне-перегородочный", "нижний"]].sum(axis=1).value_counts(normalize=True)

0    0.802475
1    0.168015
2    0.029034
3    0.000476
dtype: float64

количество единичек по каждой колонке

In [172]:
col_popularity = {}
for column in ['перегородочный', "передний", "боковой", "передне-боковой", "передне-перегородочный", "нижний"]:
    res = df[column].value_counts(normalize=True)[1]
    col_popularity.update({column:res})
col_popularity

{'перегородочный': 0.02760590195145169,
 'передний': 0.02617801047120419,
 'боковой': 0.0009519276534983341,
 'передне-боковой': 0.012375059495478343,
 'передне-перегородочный': 0.05663969538315088,
 'нижний': 0.10376011423131842}

In [173]:
df[df["перегородочный"]==0]

Unnamed: 0,patient_id,age,sex,height,weight,record_name,перегородочный,передний,боковой,передне-боковой,передне-перегородочный,нижний,норма
0,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1
1,13619.0,56.0,0,,,00034_hr,0,0,0,0,0,0,1
2,11315.0,25.0,1,,63.0,00043_hr,0,0,0,0,0,0,1
3,18153.0,35.0,0,,82.0,00052_hr,0,0,0,0,0,0,1
4,16063.0,26.0,0,,93.0,00057_hr,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2096,12488.0,66.0,1,,,21784_hr,0,0,0,0,0,0,1
2097,10162.0,68.0,0,,,21795_hr,0,0,0,0,0,0,1
2098,11197.0,59.0,0,,,21825_hr,0,0,0,0,0,0,1
2099,11905.0,55.0,1,,,21831_hr,0,0,0,0,0,0,1


In [174]:
df["target"] = np.argmax(df[["перегородочный", "передний", "боковой", "передне-боковой", 
                             "передне-перегородочный", "нижний", "норма"]].to_numpy(), axis=1)

In [175]:
df

Unnamed: 0,patient_id,age,sex,height,weight,record_name,перегородочный,передний,боковой,передне-боковой,передне-перегородочный,нижний,норма,target
0,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6
1,13619.0,56.0,0,,,00034_hr,0,0,0,0,0,0,1,6
2,11315.0,25.0,1,,63.0,00043_hr,0,0,0,0,0,0,1,6
3,18153.0,35.0,0,,82.0,00052_hr,0,0,0,0,0,0,1,6
4,16063.0,26.0,0,,93.0,00057_hr,0,0,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2096,12488.0,66.0,1,,,21784_hr,0,0,0,0,0,0,1,6
2097,10162.0,68.0,0,,,21795_hr,0,0,0,0,0,0,1,6
2098,11197.0,59.0,0,,,21825_hr,0,0,0,0,0,0,1,6
2099,11905.0,55.0,1,,,21831_hr,0,0,0,0,0,0,1,6


In [176]:
df_test = pd.read_csv("./test/test_meta.csv")

In [177]:
def load_record(record_name, is_train):
    if is_train:
        with open(f"../task_final/train/{record_name}.npy", "rb") as f:
            data  = np.load(f, allow_pickle=True)
    else:
        with open(f"../task_final/test/{record_name}.npy", "rb") as f:
            data  = np.load(f, allow_pickle=True)
    return data

def print_beat(x, y, ecg_channel):
    fig, ax = plt.subplots()  # Create a figure containing a single axes.
    ax.set_title(f"ECG channel {ecg_channel}")
    ax.plot(x, y)  # Plot some data on the axes.

In [178]:
import multiprocessing



class Transformator():
    """ Класс-пайплайн от датафрейма train.gts до трансформированных данных ЭКГ.
    
    """
    def __init__(self,args, transformation_func, is_train):
        self.is_train = is_train
        self.args = args
        self.transformation_func = transformation_func
    
    def run_pipeline(self, df):
        
        result_df = self.pipeline_ecg(self.transformation_func, df)
        return result_df

    # transformation func - function that transforms record to any 
    def pipeline_ecg(self, transformation_func, df):

        result_df = df.copy()
        record_names = result_df['record_name'].to_list()
        result_df['correct_transformation'] = True
        path = ""
        if self.is_train: 
            if(not os.path.exists("./transformed_train/")):
                os.mkdir("./transformed_train/")
            path = "./transformed_train/"
        else:
            if(not os.path.exists("./transformed_test/")):
                os.mkdir("./transformed_test/")
            path = "./transformed_test/"

        new_names_column = [] 
        for record_name in tqdm(record_names):
            transformed = transformation_func(record_name, **self.args)
            # transformed - shape [12, 9] if preprocessing_with beats

            new_names = []
            for i in range(transformed.shape[1]):
                name = f"{record_name}_n{i}"
                new_names.append(name)
                np.save(path+name+".npy", transformed[:, i])
            new_names_column.append(new_names)
        result_df["new_name"] = new_names_column

        result_df = result_df.explode("new_name", ignore_index=True) 
        return result_df 

    def check_correctnes(self, df):
        test_df = df.iloc[:3]
        self.run_pipeline(test_df)


In [179]:
from preprocessing_with_beats import PipelineBeatExtraction
pipeline = PipelineBeatExtraction(prefix="./train", noise_level=2)
pipeline_test = PipelineBeatExtraction(prefix="./test", noise_level=2)
pipeline_func_test = pipeline_test.run_pipeline
pipeline_func = pipeline.run_pipeline

transformator = Transformator({}, pipeline_func, is_train=True )
transformator_test = Transformator({}, pipeline_func_test, is_train=False)


Проверка на правильность работы.

In [180]:
transformator.check_correctnes(df)

  0%|          | 0/3 [00:00<?, ?it/s]

In [181]:
transformator_test.check_correctnes(df_test)

  0%|          | 0/3 [00:00<?, ?it/s]

Запуск препроцессинга для тренировочных данных в многопроцессном режиме.

In [14]:
# from joblib import Parallel, delayed

# def process_dataframe(df):
#     result_df = transformator.pipeline_ecg(transformator.transformation_func, df)
#     return result_df

# process_num = 12

# result_test_dfs = Parallel(
#     n_jobs=process_num,
#     verbose=100
# )(
#     delayed(process_dataframe)(x) for x in np.array_split(df, process_num)
# )

# result_df = pd.concat(result_test_dfs)
# result_df.to_csv("transformed_df.csv")

In [15]:
# Define the process_dataframe function
process_num = 12

def process_dataframe(df):
    result_df = transformator.pipeline_ecg(transformator.transformation_func, df)
    return result_df


# Split your data into multiple dataframes
dfs = np.array_split(df, process_num) 


# Create a pool of worker processes
pool = multiprocessing.Pool(processes=process_num)

result_dfs = []
# Run the process_dataframe function in parallel for each dataframe
for result_df in tqdm(pool.imap(process_dataframe, dfs)):
    result_dfs.append(result_df)

# Close the pool of worker processes
pool.close()
pool.join()

result_df = pd.concat(result_dfs)
result_df.to_csv("transformed_df.csv")

0it [00:00, ?it/s]

Запуск препроцессинга для тестовых данных.

In [183]:
#from joblib import Parallel, delayed

# def process_dataframe(df):
#     result_df = transformator_test.pipeline_ecg(transformator_test.transformation_func, df)
#     return result_df

process_num = 12

# result_test_dfs = Parallel(
#     n_jobs=process_num,
#     verbose=100
# )(
#     delayed(process_dataframe)(x) for x in np.array_split(df_test, process_num)
# )

# result_test_df = pd.concat(result_test_dfs)
# result_test_df.to_csv("transformed_test_df.csv")

In [184]:
def process_dataframe(df):
    result_df = transformator_test.pipeline_ecg(transformator_test.transformation_func, df)
    return result_df

# Split your data into multiple dataframes
test_dfs = np.array_split(df_test, process_num) 


# Create a pool of worker processes
pool = multiprocessing.Pool(processes=process_num)

result_test_dfs = []
# Run the process_dataframe function in parallel for each dataframe
#result_dfs = pool.map(process_dataframe, dfs)
for result_test_df in tqdm(pool.imap(process_dataframe, test_dfs)):
    result_test_dfs.append(result_test_df)

# Close the pool of worker processes
pool.close()
pool.join()

result_test_df = pd.concat(result_test_dfs)
result_test_df.to_csv("transformed_test_df.csv")

0it [00:00, ?it/s]

In [185]:
result_df = pd.read_csv('transformed_df.csv')

In [187]:
result_df

Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,height,weight,record_name,перегородочный,передний,боковой,передне-боковой,передне-перегородочный,нижний,норма,target,correct_transformation,new_name
0,0,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6,True,00009_hr_n0
1,1,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6,True,00009_hr_n1
2,2,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6,True,00009_hr_n2
3,3,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6,True,00009_hr_n3
4,4,18792.0,55.0,0,,70.0,00009_hr,0,0,0,0,0,0,1,6,True,00009_hr_n4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21635,1793,20703.0,300.0,0,,,21834_hr,0,0,0,0,0,1,0,5,True,21834_hr_n5
21636,1794,20703.0,300.0,0,,,21834_hr,0,0,0,0,0,1,0,5,True,21834_hr_n6
21637,1795,20703.0,300.0,0,,,21834_hr,0,0,0,0,0,1,0,5,True,21834_hr_n7
21638,1796,20703.0,300.0,0,,,21834_hr,0,0,0,0,0,1,0,5,True,21834_hr_n8


Там где мульти-лейбл меняем на класс, которого представлено меньше

In [188]:
def turn_in_multiclass(df):
    def replace_with_sum(row):
        row_new=row
        for i, v in row.items():
            if v==1:
                row_new[i]=df[i].sum()
        return row_new

    def keks2(row):
        mn = row.max()
        for i, v in row.items():
            if v!=0 and v<mn:
                mn=v
        if mn==0:
            return row
        else:
            new_row =  row.where(row != mn, 1)
            new_row[(new_row!=0)&(new_row!=1)]=0
            return new_row

    keks = df[df[["перегородочный","передний","боковой","передне-боковой","передне-перегородочный","нижний"]].sum(axis=1)>1]

    # Apply the function to each row in the DataFrame
    res = keks.apply(replace_with_sum,axis=1)
    res = res[["перегородочный","передний","боковой","передне-боковой","передне-перегородочный","нижний"]].apply(keks2, axis=1)
    meow = df.copy() 
    meow.update(res)
    return meow 
    
result_df = turn_in_multiclass(result_df)
#new
#new[new[["перегородочный","передний","боковой","передне-боковой","передне-перегородочный","нижний"]].sum(axis=1)>1]

In [189]:
result_df[result_df[["перегородочный","передний","боковой","передне-боковой","передне-перегородочный","нижний"]].sum(axis=1)>1]

Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,height,weight,record_name,перегородочный,передний,боковой,передне-боковой,передне-перегородочный,нижний,норма,target,correct_transformation,new_name


Создание аннотаций для тренировки модели.

In [190]:
from sklearn.model_selection import train_test_split, StratifiedGroupKFold

In [191]:
def downsample(df, fraction=1):
        """
        function to downsample majority class.
        """
        # Separate the majority and minority classes
        majority_class = df[df['норма'] == 1]
        minority_class = df[df['норма'] != 1]  
        
        # Calculate the number of samples needed from the majority class
        num_samples = len(minority_class)
        
        downsampled_majority = majority_class.sample(frac=fraction, replace=True) 
        
        downsampled_df = pd.concat([downsampled_majority, minority_class], axis=0)
        
        return downsampled_df

def oversample(df):
    """
    Function to oversample the minority class by copying rows.
    """
    minority_classes = ["перегородочный", "передний", "боковой", "передне-боковой", "передне-перегородочный", "нижний"]
    majority_class = df[df['норма'] == 1]

    dfs = [majority_class]
    for minority_class in minority_classes:
        if(minority_class=="боковой"):
            continue
        # Separate the majority and minority classes
        minority_df = df[df[minority_class] == 1]
        
        # Calculate the number of samples needed from the majority class
        num_samples = len(majority_class)
        frac = num_samples / len(minority_df)
        
        # Oversample by copying rows from the minority class
        print(minority_class, num_samples)
    
        oversampled_minority = minority_df.sample(frac=frac, replace=True)
        dfs.append(oversampled_minority.copy())
        
    oversampled_df = pd.concat(dfs, axis=0)
        
    return oversampled_df


def prepare_balance(df, column="передне-боковой"):
    """
    Function to oversample the minority class by copying rows.
    """
    minority_classes = [x for x in ["перегородочный", "передний", "боковой", "передне-боковой", "передне-перегородочный", "нижний", "норма"]
                        if x != column]
    majority_class = df[df[column] == 1]
    num_samples = round(len(majority_class) / 6)

    dfs = [majority_class]
    for minority_class in minority_classes:
        # Separate the majority and minority classes
        minority_df = df[df[minority_class] == 1]
        
        # Calculate the number of samples needed from the majority class
        frac = num_samples / len(minority_df)
        
        # Oversample by copying rows from the minority class
        print(minority_class, frac, len(minority_df))
        oversampled_minority = minority_df.sample(frac=frac, replace=True)
        print(len(oversampled_minority))
        dfs.append(oversampled_minority.copy())
        
    oversampled_df = pd.concat(dfs, axis=0)
        
    return oversampled_df


In [192]:
def pipeline_annotations(df):
    annotations = df[['new_name', 'target', "перегородочный", "передний", "боковой", "передне-боковой", 
                             "передне-перегородочный", "нижний", "норма"]]   
    cv = StratifiedGroupKFold(n_splits=5, random_state=0, shuffle=True)
    train_idxs, test_idxs = next(cv.split(annotations["new_name"], annotations["target"], df["record_name"]))
    

    val_annotations = annotations.iloc[test_idxs]
    train_annotations = annotations.iloc[train_idxs]
    train_annotations = oversample(train_annotations)
    # shuffle rows
    val_annotations = val_annotations.sample(frac=1)
    train_annotations = train_annotations.sample(frac=1)
    
    val_annotations.to_csv('./val_annotations.csv', index=False)
    train_annotations.to_csv('./train_annotations.csv', index=False)
    
    print("number of validation samples:",len(val_annotations) )
    print("number of train samples:",len(train_annotations))

    print("validation percentage:", len(val_annotations) / len(annotations))
    print("train percentage:",len(train_annotations) / len(annotations))

    print("VALIDATION myocard percentage", val_annotations["target"].value_counts(normalize=True))
    print("TTRAIN myocard percentage", train_annotations["target"].value_counts(normalize=True))


In [193]:
downsampled_df = downsample(result_df, fraction=0.4)

In [162]:
oversampled_df = oversample(downsampled_df)

перегородочный 6920
передний 6920
передне-боковой 6920
передне-перегородочный 6920
нижний 6920


In [163]:
# new_df = prepare_balance(result_df)

In [164]:
oversampled_df.sum()

Unnamed: 0                                                         37218975
patient_id                                                      470615244.0
age                                                               2927238.0
sex                                                                   17118
height                                                            2237849.0
weight                                                             979083.0
record_name               15483_hr11338_hr07309_hr17115_hr09095_hr00932_...
перегородочный                                                       6920.0
передний                                                             6920.0
боковой                                                                 0.0
передне-боковой                                                      6920.0
передне-перегородочный                                               6920.0
нижний                                                               6920.0
норма       

In [194]:
pipeline_annotations(downsampled_df)

перегородочный 5483
передний 5483
передне-боковой 5483
передне-перегородочный 5483
нижний 5483
number of validation samples: 2218
number of train samples: 32898
validation percentage: 0.1969979571898037
train percentage: 2.9219291233679723
VALIDATION myocard percentage 6    0.647881
5    0.127592
4    0.102795
1    0.054554
0    0.050947
3    0.016231
Name: target, dtype: float64
TTRAIN myocard percentage 1    0.174691
4    0.166667
6    0.166667
5    0.166667
0    0.166667
3    0.158642
Name: target, dtype: float64


In [166]:
from sklearn.utils.class_weight import compute_class_weight
compute_class_weight(class_weight='balanced', classes=downsampled_df.target.unique(), y=downsampled_df.target.to_numpy())

array([ 0.23243187,  0.9765808 ,  1.3194656 ,  2.63676815,  2.72615012,
       69.93167702,  6.43371429])

In [167]:
downsampled_df.target.unique()

array([6, 5, 4, 0, 1, 2, 3])