# Analyze duplicate ECGs

In [28]:
import pandas as pd
import numpy as np

#### Duplicate analysis

In [30]:
# Function to drop columns containing specified substrings
def drop_columns_containing(df, substrings):
    """
    Drops all columns in the DataFrame that contain any of the specified substrings.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame from which to drop columns.
    - substrings (list of str): Substrings to look for in column names.
    
    Returns:
    - pd.DataFrame: The DataFrame with specified columns dropped.
    """
    # Create a boolean mask where True if any substring is in the column name
    mask = df.columns.to_series().apply(lambda x: any(sub in x for sub in substrings))
    # Identify columns to drop
    cols_to_drop = df.columns[mask]
    # Drop the columns
    return df.drop(columns=cols_to_drop)

# Define a helper function
def redefine_ecg_id(df, column_name):
    """
    Redefines the 'ecg_id_new' column by excluding the last 4 characters of each string.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column to modify.
    - column_name (str): The name of the column to modify.
    
    Returns:
    - pd.DataFrame: The DataFrame with the modified column.
    """
    # Ensure the column is of string type
    df[column_name] = df[column_name].astype(str)
    # Exclude the last 4 characters
    df[column_name] = df[column_name].str[:-4]
    return df

In [31]:
all_ids_labels_tested_with_covars_all = pd.read_csv('all_ids_labels_tested_with_covars_all.csv')
all_ids_labels_tested_with_covars = pd.read_csv('all_ids_labels_tested_with_covars.csv')
all_ids_labels_untested_with_covars_all = pd.read_csv('all_ids_labels_untested_with_covars_all.csv')
all_ids_labels_untested_with_covars = pd.read_csv('all_ids_labels_untested_with_covars.csv')
duplicates = pd.read_csv('ecg_duplicates.csv')

# Step 2: Drop specified columns from main DataFrames before merging
# Define the columns to drop
columns_to_drop = ['file_id', 'duplicate_ids', 'num_duplicates']
# Apply the function to each main DataFrame
all_ids_labels_tested_with_covars_all = drop_columns_containing(
    all_ids_labels_tested_with_covars_all, columns_to_drop)
all_ids_labels_tested_with_covars = drop_columns_containing(
    all_ids_labels_tested_with_covars, columns_to_drop)
all_ids_labels_untested_with_covars_all = drop_columns_containing(
    all_ids_labels_untested_with_covars_all, columns_to_drop)
all_ids_labels_untested_with_covars = drop_columns_containing(
    all_ids_labels_untested_with_covars, columns_to_drop)

# Apply the function to the specified DataFrames
all_ids_labels_untested_with_covars_all = redefine_ecg_id(
    all_ids_labels_untested_with_covars_all, 'ecg_id_new')
all_ids_labels_untested_with_covars = redefine_ecg_id(
    all_ids_labels_untested_with_covars, 'ecg_id_new')

all_ids_labels_tested_with_covars_all = all_ids_labels_tested_with_covars_all.merge(duplicates, left_on='ecg_id_new', right_on='file_id', how='left')
all_ids_labels_tested_with_covars = all_ids_labels_tested_with_covars.merge(duplicates, left_on='ecg_id_new', right_on='file_id', how='left')
all_ids_labels_untested_with_covars_all = all_ids_labels_untested_with_covars_all.merge(duplicates, left_on='ecg_id_new', right_on='file_id', how='left')
all_ids_labels_untested_with_covars = all_ids_labels_untested_with_covars.merge(duplicates, left_on='ecg_id_new', right_on='file_id', how='left')

all_ids_labels_tested_with_covars_all.to_csv('all_ids_labels_tested_with_covars_all.csv')
all_ids_labels_tested_with_covars.to_csv('all_ids_labels_tested_with_covars.csv')
all_ids_labels_untested_with_covars_all.to_csv('all_ids_labels_untested_with_covars_all.csv')
all_ids_labels_untested_with_covars.to_csv('all_ids_labels_untested_with_covars.csv')

In [32]:
all_ids_labels_tested_with_covars_all.columns

Index(['Unnamed: 0.8', 'Unnamed: 0.7', 'Unnamed: 0.6', 'Unnamed: 0.5',
       'Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'patient_ngsci_id', 'ecg_id', 'date', 'p-r-t_axes',
       'p_axes', 'r_axes', 't_axes', 'pr_interval', 'pr_interval_units',
       'qrs_duration', 'qrs_duration_units', 'qtqtc', 'qt_interval',
       'qt_interval_units', 'qtc_interval', 'qtc_interval_units', 'vent_rate',
       'vent_rate_units', 'has_bbb', 'has_afib', 'has_st', 'has_pacemaker',
       'has_lvh', 'has_normal', 'has_normal_ecg', 'has_normal_sinus',
       'has_depress', 'has_st_eleva', 'has_twave', 'has_aberran_bbb',
       'has_jpoint_repol', 'has_jpoint_eleva', 'has_twave_inver',
       'has_twave_abnormal', 'has_nonspecific', 'has_rhythm_disturbance',
       'has_prolonged_qt', 'has_lead_reversal', 'has_poor_or_quality',
       'ecg_id_new', 'ed_enc_id', 'start_datetime', 'end_datetime',
       'age_at_admit', 'macetrop_030_pos', 'death_030_day',
       'm

In [4]:
test_df = all_ids_labels_tested_with_covars_all

In [7]:
test_df[test_df['num_duplicates']>0][['ecg_id','file_id', 'duplicate_ids', 'num_duplicates', 'stent_or_cabg_010_day']]

Unnamed: 0,ecg_id,file_id,duplicate_ids,num_duplicates,stent_or_cabg_010_day
1,ecg2b46428974,2b46428974,['424d8d650d'],1,False
2,ecg424d8d650d,424d8d650d,['2b46428974'],1,False
9,ecgdc840c24ce,dc840c24ce,['5e1934b83d'],1,False
10,ecg5e1934b83d,5e1934b83d,['dc840c24ce'],1,False
24,ecg08a3dfa7ba,08a3dfa7ba,['54ab380ab9'],1,False
...,...,...,...,...,...
7185,ecg32cca3c9dc,32cca3c9dc,['21fd8c2771'],1,False
7200,ecgea552fdb55,ea552fdb55,['5ae58e4b2e'],1,False
7201,ecg110297a510,110297a510,['c07092dfd9'],1,False
7202,ecg5ae58e4b2e,5ae58e4b2e,['ea552fdb55'],1,False


In [12]:
import ast
df = test_df[test_df['num_duplicates']>0]

# Step 2: Ensure 'duplicate_ids' are lists
# If 'duplicate_ids' are stored as strings like "['424d8d650d']", convert them to actual lists
def convert_to_list(dup_ids):
    if pd.isna(dup_ids) or dup_ids == '[]':
        return []
    try:
        return ast.literal_eval(dup_ids)
    except (ValueError, SyntaxError):
        # Handle cases where the string is not a valid list representation
        return [dup_ids]

df['duplicate_ids'] = df['duplicate_ids'].apply(convert_to_list)

# Verify the conversion
print("\nDataFrame after converting 'duplicate_ids' to lists:")
print(df.head())

# Step 3: Create a mapping from file_id to 'stent_or_cabg_010_day' label
file_id_to_label = df.set_index('file_id')['stent_or_cabg_010_day'].to_dict()

# Debug: Print the mapping
print("\nMapping from file_id to 'stent_or_cabg_010_day':")
#print(file_id_to_label)

# Step 4: Define a function to check if all duplicates have the same label
def check_same_label(row):
    current_label = row['stent_or_cabg_010_day']
    duplicates = row['duplicate_ids']
    
    # If there are no duplicates, you can decide whether 'same_label' should be True or False
    # Here, we'll set it to True
    if not duplicates:
        return True
    
    for dup_id in duplicates:
        dup_label = file_id_to_label.get(dup_id)
        if dup_label is None:
            # Handle the case where the duplicate_id is not found in the mapping
            print(f"Warning: Duplicate ID '{dup_id}' not found in the DataFrame.")
            return False
        if dup_label != current_label:
            return False
    return True

# Step 5: Apply the function to create the 'same_label' column
df['same_label'] = df.apply(check_same_label, axis=1)

# Verify the new column
print("\nDataFrame after adding 'same_label' column:")
print(df.head())


DataFrame after converting 'duplicate_ids' to lists:
    Unnamed: 0.6  Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2   
1              1             1             1             1             1  \
2              2             2             2             2             2   
9              9             9             9             9             9   
10            10            10            10            10            10   
24            24            24            24            24            24   

    Unnamed: 0.1  Unnamed: 0 patient_ngsci_id         ecg_id   
1              1         148      pat006375f2  ecg2b46428974  \
2              2         149      pat006375f2  ecg424d8d650d   
9              9         878      pat01b9085e  ecgdc840c24ce   
10            10         879      pat01b9085e  ecg5e1934b83d   
24            24        1418      pat02ed5ebd  ecg08a3dfa7ba   

                    date  ... agi_above_200k  ecg_cnt  female  ste_std_twi   
1   2114-09-13T12:56:17Z  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duplicate_ids'] = df['duplicate_ids'].apply(convert_to_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['same_label'] = df.apply(check_same_label, axis=1)


In [18]:
print(df.shape)
print(df['same_label'].sum())

df[df['same_label']==True][['ecg_id','file_id', 'duplicate_ids', 'num_duplicates', 'stent_or_cabg_010_day', 'same_label']].head(20)

(1793, 94)
1791


Unnamed: 0,ecg_id,file_id,duplicate_ids,num_duplicates,stent_or_cabg_010_day,same_label
1,ecg2b46428974,2b46428974,[424d8d650d],1,False,True
2,ecg424d8d650d,424d8d650d,[2b46428974],1,False,True
9,ecgdc840c24ce,dc840c24ce,[5e1934b83d],1,False,True
10,ecg5e1934b83d,5e1934b83d,[dc840c24ce],1,False,True
24,ecg08a3dfa7ba,08a3dfa7ba,[54ab380ab9],1,False,True
25,ecg54ab380ab9,54ab380ab9,[08a3dfa7ba],1,False,True
26,ecg6bbd791bd5,6bbd791bd5,[271af5d387],1,False,True
27,ecg271af5d387,271af5d387,[6bbd791bd5],1,False,True
55,ecg4b1aca8786,4b1aca8786,[0946b2efb8],1,False,True
56,ecg021e2c81dd,021e2c81dd,[f1cf511f11],1,False,True


In [34]:
import ast
test_df = all_ids_labels_untested_with_covars_all
print(test_df.shape)
display(test_df[['ecg_id_new','file_id', 'duplicate_ids', 'num_duplicates', 'macetrop_pos_or_death_030']])
df = test_df[test_df['num_duplicates']>0]
print(df.shape)

# Step 2: Ensure 'duplicate_ids' are lists
# If 'duplicate_ids' are stored as strings like "['424d8d650d']", convert them to actual lists
def convert_to_list(dup_ids):
    if pd.isna(dup_ids) or dup_ids == '[]':
        return []
    try:
        return ast.literal_eval(dup_ids)
    except (ValueError, SyntaxError):
        # Handle cases where the string is not a valid list representation
        return [dup_ids]

df['duplicate_ids'] = df['duplicate_ids'].apply(convert_to_list)

# Verify the conversion
print("\nDataFrame after converting 'duplicate_ids' to lists:")
print(df.head())

# Step 3: Create a mapping from file_id to 'stent_or_cabg_010_day' label
file_id_to_label = df.set_index('file_id')['macetrop_pos_or_death_030'].to_dict()

# Debug: Print the mapping
print("\nMapping from file_id to 'macetrop_pos_or_death_030':")
#print(file_id_to_label)

# Step 4: Define a function to check if all duplicates have the same label
def check_same_label(row):
    current_label = row['macetrop_pos_or_death_030']
    duplicates = row['duplicate_ids']
    
    # If there are no duplicates, you can decide whether 'same_label' should be True or False
    # Here, we'll set it to True
    if not duplicates:
        return True
    
    for dup_id in duplicates:
        dup_label = file_id_to_label.get(dup_id)
        if dup_label is None:
            # Handle the case where the duplicate_id is not found in the mapping
            print(f"Warning: Duplicate ID '{dup_id}' not found in the DataFrame.")
            return False
        if dup_label != current_label:
            return False
    return True

# Step 5: Apply the function to create the 'same_label' column
df['same_label'] = df.apply(check_same_label, axis=1)

# Verify the new column
print("\nDataFrame after adding 'same_label' column:")
display(df.head())

(93977, 93)


Unnamed: 0,ecg_id_new,file_id,duplicate_ids,num_duplicates,macetrop_pos_or_death_030
0,3906f61865,3906f61865,[],0,False
1,6bddcf866d,6bddcf866d,[],0,False
2,d49df1c4de,d49df1c4de,[],0,False
3,e5331658a6,e5331658a6,[],0,False
4,929fa6f6d4,929fa6f6d4,[],0,False
...,...,...,...,...,...
93972,f8eb59980c,f8eb59980c,[],0,False
93973,79871e21cc,79871e21cc,[],0,False
93974,4b822d9211,4b822d9211,[],0,False
93975,4b7182cb9f,4b7182cb9f,[],0,False


(24850, 93)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duplicate_ids'] = df['duplicate_ids'].apply(convert_to_list)



DataFrame after converting 'duplicate_ids' to lists:
    Unnamed: 0.6  Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2   
8              8             8             8             8             8  \
9              9             9             9             9             9   
11            11            11            11            11            11   
12            12            12            12            12            12   
14            14            14            14            14            14   

    Unnamed: 0.1  Unnamed: 0 patient_ngsci_id             ecg_id   
8              8          20      pat000dfa24  ecg0705cca338.npy  \
9              9          21      pat000dfa24  ecg4fee63bb84.npy   
11            11          23      pat000f3c7a  ecgc42d3157f2.npy   
12            12          24      pat000f3c7a  ecgce8d1cbf43.npy   
14            14          28      pat00116b06  ecga3eac1846c.npy   

                    date  ... agi_100k_to_200k  agi_above_200k  ecg_cnt   
8   2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['same_label'] = df.apply(check_same_label, axis=1)


Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,patient_ngsci_id,ecg_id,date,...,agi_above_200k,ecg_cnt,female,ste_std_twi,split,complete,file_id,duplicate_ids,num_duplicates,same_label
8,8,8,8,8,8,8,20,pat000dfa24,ecg0705cca338.npy,2114-03-13T03:22:50Z,...,0.029685,1,0,False,train,,0705cca338,[4fee63bb84],1,True
9,9,9,9,9,9,9,21,pat000dfa24,ecg4fee63bb84.npy,2114-03-13T03:22:50Z,...,0.029685,1,0,False,train,,4fee63bb84,[0705cca338],1,True
11,11,11,11,11,11,11,23,pat000f3c7a,ecgc42d3157f2.npy,2113-09-26T02:54:47Z,...,0.085967,1,1,False,train,,c42d3157f2,[ce8d1cbf43],1,True
12,12,12,12,12,12,12,24,pat000f3c7a,ecgce8d1cbf43.npy,2113-09-26T02:54:47Z,...,0.085967,1,1,False,train,,ce8d1cbf43,[c42d3157f2],1,True
14,14,14,14,14,14,14,28,pat00116b06,ecga3eac1846c.npy,2113-06-11T11:23:02Z,...,0.027559,1,1,False,train,,a3eac1846c,[a41e48c1ac],1,True


In [37]:
print(df.shape)
print(df['same_label'].sum())

df[df['same_label']==True][['ecg_id','file_id', 'duplicate_ids', 'num_duplicates', 'stent_or_cabg_010_day', 'same_label']].head(20)

(24850, 94)
24845


Unnamed: 0,ecg_id,file_id,duplicate_ids,num_duplicates,stent_or_cabg_010_day,same_label
8,ecg0705cca338.npy,0705cca338,[4fee63bb84],1,False,True
9,ecg4fee63bb84.npy,4fee63bb84,[0705cca338],1,False,True
11,ecgc42d3157f2.npy,c42d3157f2,[ce8d1cbf43],1,False,True
12,ecgce8d1cbf43.npy,ce8d1cbf43,[c42d3157f2],1,False,True
14,ecga3eac1846c.npy,a3eac1846c,[a41e48c1ac],1,False,True
15,ecga41e48c1ac.npy,a41e48c1ac,[a3eac1846c],1,False,True
24,ecgcf02003ab2.npy,cf02003ab2,[78491afae7],1,False,True
25,ecg78491afae7.npy,78491afae7,[cf02003ab2],1,False,True
34,ecg42d02b50e9.npy,42d02b50e9,[c2dd2da23e],1,False,True
35,ecgc2dd2da23e.npy,c2dd2da23e,[42d02b50e9],1,False,True
