In [None]:
! pip install liac-arff

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the co-variate space with all value equal. Let's remove those columns

In [1]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
path = "/repos/smote_msfb/public_datasets/tmc2007/"

In [3]:
# Load the .arff file - With full data in one file
with open(path + 'tmc2007.arff', 'r') as f:
    dataset = arff.load(f)
    
# Extract data and attributes
data = dataset['data']
attributes = dataset['attributes']

# Create DataFrame
columns = [attr[0] for attr in attributes]
df = pd.DataFrame(data, columns=columns)

In [4]:
df = df.astype(int)

In [5]:
print("Shape of the medical.arff file :", df.shape)

Shape of the medical.arff file : (28596, 49082)


In [6]:
### Get the position of the first Label columns
col_position = df.columns.get_loc("class01")  ## The first label column in the dataset
print(col_position)

49060


In [7]:
df.columns

Index(['0', '00', '000', '001', '0015', '002', '004', '005', '006', '007',
       ...
       'class13', 'class14', 'class15', 'class16', 'class17', 'class18',
       'class19', 'class20', 'class21', 'class22'],
      dtype='object', length=49082)

In [8]:
# Assume df is your DataFrame with only binary variables (0/1)
threshold = 0.01  # 1%

# Calculate the proportion of 1's in each column
proportion_ones = df.mean(axis=0)

# Select columns where the proportion of 1's is at least 1%
columns_to_keep = proportion_ones[proportion_ones >= threshold].index

# Subset the DataFrame to keep only these columns
df_filtered = df[columns_to_keep]

# If you want to update df in-place:
#df = df_filtered

In [13]:
df_bkp = df.copy(deep=True)

In [14]:
df_filtered.shape

(28596, 1256)

In [15]:
df = df_filtered

In [22]:
import re
# Traverse columns from the end
for idx in reversed(range(len(df.columns))):
    col = df.columns[idx]
    if re.fullmatch(r'class\d+', col):  # matches 'class' followed by one or more digits
        print(f"Column name: {col}, Column number (0-based): {idx}")
        break
else:
    print("No column matching the pattern 'class*' (with * as a number) was found.")

Column name: class22, Column number (0-based): 1255


In [25]:
df.iloc[:,1234:]

Unnamed: 0,class01,class02,class03,class04,class05,class06,class07,class08,class09,class10,...,class13,class14,class15,class16,class17,class18,class19,class20,class21,class22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28591,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
28592,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
28593,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
28594,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [26]:
col_position = 1234

In [27]:
df.iloc[:,:col_position+1].columns

Index(['_', 'a', 'abc', 'able', 'abnormal', 'abort', 'about', 'above',
       'accelerate', 'accept',
       ...
       'yellow', 'yes', 'yet', 'yoke', 'you', 'your', 'z', 'zero', 'zzz',
       'class01'],
      dtype='object', length=1235)

In [28]:
##Check if the unique values across the entire co-variate space is 0 and 1 only or not
unique_values = np.unique(df.iloc[:, :col_position].values)
print(unique_values)

[0 1]


In [29]:
## Check if the unique values in the mulit labels is 0 and 1 only or not
unique_values = np.unique(df.iloc[:,col_position:].values)
print(unique_values)

[0 1]


In [32]:
df.iloc[:,:col_position+1].columns

Index(['_', 'a', 'abc', 'able', 'abnormal', 'abort', 'about', 'above',
       'accelerate', 'accept',
       ...
       'yellow', 'yes', 'yet', 'yoke', 'you', 'your', 'z', 'zero', 'zzz',
       'class01'],
      dtype='object', length=1235)

In [33]:
##  Rename the co-variate feature space of the dataset

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(col_position)]

# Assign the new names to the first 1836 columns
df.columns.values[:col_position] = new_feature_names

In [35]:
df.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,class13,class14,class15,class16,class17,class18,class19,class20,class21,class22
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [38]:
df.drop('class02', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('class02', axis=1, inplace=True)


In [39]:
## Dataset is highly imbalanced

# Slice only the label columns
label_data = df.iloc[:, col_position:]

# Calculate proportions of 0's and 1's for each column
for col in label_data.columns:
    value_counts = label_data[col].value_counts(normalize=True).sort_index()
    prop_0 = value_counts.get(0, 0)
    prop_1 = value_counts.get(1, 0)
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

class01: 0's = 0.9344, 1's = 0.0656
class03: 0's = 0.9785, 1's = 0.0215
class04: 0's = 0.9787, 1's = 0.0213
class05: 0's = 0.8631, 1's = 0.1369
class06: 0's = 0.7320, 1's = 0.2680
class07: 0's = 0.9220, 1's = 0.0780
class08: 0's = 0.9005, 1's = 0.0995
class09: 0's = 0.9800, 1's = 0.0200
class10: 0's = 0.9491, 1's = 0.0509
class11: 0's = 0.9820, 1's = 0.0180
class12: 0's = 0.8505, 1's = 0.1495
class13: 0's = 0.9004, 1's = 0.0996
class14: 0's = 0.9422, 1's = 0.0578
class15: 0's = 0.9822, 1's = 0.0178
class16: 0's = 0.9563, 1's = 0.0437
class17: 0's = 0.9806, 1's = 0.0194
class18: 0's = 0.9479, 1's = 0.0521
class19: 0's = 0.7016, 1's = 0.2984
class20: 0's = 0.9694, 1's = 0.0306
class21: 0's = 0.9846, 1's = 0.0154
class22: 0's = 0.9718, 1's = 0.0282


In [42]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = df.shape[1]

# Generate new names for response variables
num_targets = total_cols - col_position
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names
df.columns.values[col_position:] = new_target_names

In [44]:
df.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_12,target_13,target_14,target_15,target_16,target_17,target_18,target_19,target_20,target_21
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [46]:
df.columns = [str(col) for col in df.columns]

In [47]:
# Select only the response variable columns (those that start with 'target_')
response_cols = [col for col in df.columns if col.startswith('target_')]

# Identify which response columns have at least 1% 1's
cols_to_keep = [
    col for col in response_cols 
    if (df[col] == 1).mean() >= 0.03
]

# Combine covariate columns and filtered response columns
covariate_cols = [col for col in df.columns if not col.startswith('target_')]
filtered_df = df[covariate_cols + cols_to_keep]

# Optional: print kept target columns and their proportions
for col in cols_to_keep:
    prop_1 = (df[col] == 1).mean()
    print(f"{col}: proportion of 1s = {prop_1:.4f}")

target_1: proportion of 1s = 0.0656
target_4: proportion of 1s = 0.1369
target_5: proportion of 1s = 0.2680
target_6: proportion of 1s = 0.0780
target_7: proportion of 1s = 0.0995
target_9: proportion of 1s = 0.0509
target_11: proportion of 1s = 0.1495
target_12: proportion of 1s = 0.0996
target_13: proportion of 1s = 0.0578
target_15: proportion of 1s = 0.0437
target_17: proportion of 1s = 0.0521
target_18: proportion of 1s = 0.2984
target_19: proportion of 1s = 0.0306


In [48]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = filtered_df.shape[1]

# Generate new names for response variables
num_targets = total_cols - col_position
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names
filtered_df.columns.values[col_position:] = new_target_names

In [49]:
filtered_df.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [50]:
filtered_df.columns = [str(col) for col in filtered_df.columns]

In [52]:
# Save as Parquet (compressed with snappy)
filtered_df.to_parquet(
    path + "/processed_dataset.parquet", 
    engine="pyarrow", 
    compression="snappy", 
    index=False
)

In [53]:
filtered_df.to_csv(path + "/Final_dataset.zip", index=False, compression='zip')

In [54]:
# Read the Parquet file into a DataFrame
df_loaded = pd.read_parquet(path + "/processed_dataset.parquet", engine="pyarrow")

# Display the first 5 rows
print(df_loaded.head())

   f_0  f_1  f_2  f_3  f_4  f_5  f_6  f_7  f_8  f_9  ...  target_4  target_5  \
0    1    1    0    0    0    0    0    0    0    0  ...         0         0   
1    0    1    0    0    0    0    0    0    0    0  ...         0         0   
2    1    0    0    0    0    0    0    0    0    0  ...         0         0   
3    1    1    0    0    0    0    0    0    0    0  ...         0         0   
4    0    1    0    0    0    0    0    0    0    0  ...         0         0   

   target_6  target_7  target_8  target_9  target_10  target_11  target_12  \
0         0         0         0         0          0          0          1   
1         1         0         0         0          0          0          1   
2         0         0         0         0          0          0          1   
3         0         1         0         0          0          0          0   
4         0         0         0         0          0          0          1   

   target_13  
0          0  
1          0  
2    