In [1]:
! pip install liac-arff



In [2]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the covariate space whose values are all equal. Let's remove those columns

In [3]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [4]:
# Base directory of the medical dataset. The trailing slash matters: file names
# are appended to this string by plain concatenation (e.g. path + 'medical.arff').
path = "/repos/smote_msfb/public_datasets/medical/"

In [5]:
# Read the complete medical dataset, which ships as a single .arff file
with open(path + 'medical.arff', 'r') as f:
    dataset = arff.load(f)

# Pull the attribute descriptors and the row values out of the parsed file
attributes = dataset['attributes']
data = dataset['data']

# Build the DataFrame, naming every column after the first field of its attribute entry
columns = [name for name, *_ in attributes]
df = pd.DataFrame(data=data, columns=columns)

In [6]:
# Report (rows, columns) of the loaded frame
print("Shape of the medical.arff file :", (len(df.index), len(df.columns)))

Shape of the medical.arff file : (978, 1494)


In [7]:
### Locate the first label column; every column before it belongs to the feature space
first_label = "Class-0-593_70"  ## The first label column in the dataset
col_position = df.columns.get_loc(first_label)
print(col_position)

1449


In [8]:
# Display the full column index of the loaded frame (features and labels together)
df.columns

Index(['-', '/', '0', '00', '04', '0;', '0cm', '1', '1-1/2', '1-1/2-year',
       ...
       'Class-35-493_90', 'Class-36-788_30', 'Class-37-753_3',
       'Class-38-593_89', 'Class-39-758_6', 'Class-40-741_90', 'Class-41-591',
       'Class-42-599_7', 'Class-43-279_12', 'Class-44-786_07'],
      dtype='object', length=1494)

In [9]:
# Column names up to and including the first label column
df.columns[:col_position + 1]

Index(['-', '/', '0', '00', '04', '0;', '0cm', '1', '1-1/2', '1-1/2-year',
       ...
       'x5', 'xray', 'year', 'year-old', 'yearly', 'years', 'yesterday',
       'zithromax', 'zone', 'Class-0-593_70'],
      dtype='object', length=1450)

In [10]:
## Confirm that the covariate space contains nothing but 0 and 1
covariate_block = df.iloc[:, :col_position]
unique_values = np.unique(covariate_block.values)
print(unique_values)

['0' '1']


In [11]:
## Confirm that the multi-label columns contain nothing but 0 and 1
label_block = df.iloc[:, col_position:]
unique_values = np.unique(label_block.values)
print(unique_values)

['0' '1']


In [12]:
## Drop feature columns that hold one and the same value in every row

# Restrict the check to the covariate columns (everything before the first label)
features = df.iloc[:, :col_position]

# Keep a column only when it has at least two distinct values (NaN counts as a value)
varies = features.nunique(dropna=False) > 1
non_constant_cols = features.loc[:, varies]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach the label columns to the right of the surviving feature columns
labels_part = df.iloc[:, col_position:]
df_cleaned = pd.concat([non_constant_cols, labels_part], axis=1)

print("Shape of df_cleaned :", df_cleaned.shape)

Shape of the non_constant_cols : (978, 1449)
Shape of df_cleaned : (978, 1494)


In [13]:
## No columns were deleted during constant value checks

In [14]:
# Column names of the feature space only (everything before the first label)
df.columns[:col_position]

Index(['-', '/', '0', '00', '04', '0;', '0cm', '1', '1-1/2', '1-1/2-year',
       ...
       'x2', 'x5', 'xray', 'year', 'year-old', 'yearly', 'years', 'yesterday',
       'zithromax', 'zone'],
      dtype='object', length=1449)

In [15]:
##  Rename the covariate feature space of the dataset

# Count the feature columns that are actually present in df_cleaned. This is
# smaller than col_position whenever the constant-column check removed
# something, and using col_position then would rename label columns as f_*.
n_features = non_constant_cols.shape[1]

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(n_features)]

# Assign a complete new column list instead of writing into `columns.values`:
# mutating the array behind an Index in place bypasses its immutability and
# can leave label lookups on the frame out of sync with the visible names.
df_cleaned.columns = new_feature_names + list(df_cleaned.columns[n_features:])

In [16]:
# Preview the first three rows after renaming the feature columns
df_cleaned.head(n=3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,Class-35-493_90,Class-36-788_30,Class-37-753_3,Class-38-593_89,Class-39-758_6,Class-40-741_90,Class-41-591,Class-42-599_7,Class-43-279_12,Class-44-786_07
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [17]:
## Dataset is highly imbalanced

# Slice only the label columns. The number of labels is taken from the original
# frame so the slice stays correct even if feature columns were dropped.
n_labels = df.shape[1] - col_position
label_start = df_cleaned.shape[1] - n_labels
label_data = df_cleaned.iloc[:, label_start:]

# Calculate proportions of 0's and 1's for each column
for col in label_data.columns:
    # The label cells hold the strings '0'/'1' (see the unique-value check), so
    # cast to int first: the lookups below then match by label. Looking up the
    # integer keys on a string index only works through positional fallback,
    # which picks the wrong class when a column contains a single value.
    value_counts = label_data[col].astype(int).value_counts(normalize=True)
    prop_0 = value_counts.get(0, 0)
    prop_1 = value_counts.get(1, 0)
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

Class-0-593_70: 0's = 0.8947, 1's = 0.1053
Class-1-079_99: 0's = 0.9888, 1's = 0.0112
Class-2-786_09: 0's = 0.9969, 1's = 0.0031
Class-3-759_89: 0's = 0.9980, 1's = 0.0020
Class-4-753_0: 0's = 0.7280, 1's = 0.2720
Class-5-786_2: 0's = 0.9990, 1's = 0.0010
Class-6-V72_5: 0's = 0.9990, 1's = 0.0010
Class-7-511_9: 0's = 0.9980, 1's = 0.0020
Class-8-596_8: 0's = 0.9990, 1's = 0.0010
Class-9-599_0: 0's = 0.8845, 1's = 0.1155
Class-10-518_0: 0's = 0.9836, 1's = 0.0164
Class-11-593_5: 0's = 0.9898, 1's = 0.0102
Class-12-V13_09: 0's = 0.9939, 1's = 0.0061
Class-13-791_0: 0's = 0.9980, 1's = 0.0020
Class-14-789_00: 0's = 0.9918, 1's = 0.0082
Class-15-593_1: 0's = 0.9980, 1's = 0.0020
Class-16-462: 0's = 0.9969, 1's = 0.0031
Class-17-592_0: 0's = 0.9918, 1's = 0.0082
Class-18-786_59: 0's = 0.9990, 1's = 0.0010
Class-19-785_6: 0's = 0.9939, 1's = 0.0061
Class-20-V67_09: 0's = 0.9990, 1's = 0.0010
Class-21-795_5: 0's = 0.9826, 1's = 0.0174
Class-22-789_09: 0's = 0.9959, 1's = 0.0041
Class-23-786_5

In [18]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = df_cleaned.shape[1]

# Number of response variables, taken from the original frame: deriving it from
# col_position against df_cleaned is wrong once feature columns were dropped.
num_targets = df.shape[1] - col_position

# Generate new names for response variables
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names to the trailing num_targets columns by assigning a fresh
# column list; writing into `columns.values` would mutate the Index in place.
kept_names = list(df_cleaned.columns[:total_cols - num_targets])
df_cleaned.columns = kept_names + new_target_names

In [19]:
# Preview the first three rows after renaming the response columns
df_cleaned.head(n=3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_36,target_37,target_38,target_39,target_40,target_41,target_42,target_43,target_44,target_45
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [20]:
# Persist the cleaned frame as CSV next to the source data, without the row index
csv_path = path + "/Final_dataset.csv"
df_cleaned.to_csv(csv_path, index=False)

In [22]:
# Make sure every column name is a plain str before the Parquet export
df_cleaned.columns = list(map(str, df_cleaned.columns))

In [23]:
# Write the cleaned frame to Parquet through pyarrow, snappy-compressed, without the row index
parquet_path = path + "/processed_dataset.parquet"
df_cleaned.to_parquet(parquet_path, engine="pyarrow", compression="snappy", index=False)