In [1]:
! pip install liac-arff



In [2]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the co-variate space with all value equal. Let's remove those columns

In [3]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [4]:
path = "/repos/smote_msfb/public_datasets/genbase/"

In [5]:
# Load the .arff file - With full data in one file
with open(path + 'genbase.arff', 'r') as f:
    dataset = arff.load(f)
    
# Extract data and attributes
data = dataset['data']
attributes = dataset['attributes']

# Create DataFrame
columns = [attr[0] for attr in attributes]
df = pd.DataFrame(data, columns=columns)

In [6]:
print("Shape of the delicious.arff file :", df.shape)

Shape of the delicious.arff file : (662, 1213)


In [7]:
### Get the position of the first Label columns
col_position = df.columns.get_loc("PDOC00154")  ## The first label column in the dataset
print(col_position)

1186


In [8]:
df.columns

Index(['protein', 'PS00010', 'PS00011', 'PS00012', 'PS00014', 'PS00017',
       'PS00018', 'PS00019', 'PS00020', 'PS00021',
       ...
       'PDOC00662', 'PDOC00018', 'PDOC50001', 'PDOC00014', 'PDOC00750',
       'PDOC50196', 'PDOC50199', 'PDOC00660', 'PDOC00653', 'PDOC00030'],
      dtype='object', length=1213)

In [9]:
df.iloc[:,:col_position].columns

Index(['protein', 'PS00010', 'PS00011', 'PS00012', 'PS00014', 'PS00017',
       'PS00018', 'PS00019', 'PS00020', 'PS00021',
       ...
       'PS50821', 'PS50822', 'PS50823', 'PS50824', 'PS50825', 'PS50826',
       'PS50827', 'PS50829', 'PS50830', 'PS60000'],
      dtype='object', length=1186)

In [10]:
### This dataset needs some extre processing

## 1. Drop the column named : protein extra unique identifier column
df.drop(columns=['protein'], inplace=True)
col_position = col_position - 1 ## Since we deleted the first column. All shifted by 1


## 2. The covariate space contains 'NO' -> 0 and 'YES' -> 1
df = df.replace({'NO': 0, 'YES': 1})


In [11]:
df['PDOC00154'].unique()

array(['1', '0'], dtype=object)

In [12]:
df.head(10)

Unnamed: 0,PS00010,PS00011,PS00012,PS00014,PS00017,PS00018,PS00019,PS00020,PS00021,PS00022,...,PDOC00662,PDOC00018,PDOC50001,PDOC00014,PDOC00750,PDOC50196,PDOC50199,PDOC00660,PDOC00653,PDOC00030
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
##Check if the unique values across the entire co-variate space is 0 and 1 only or not
unique_values = np.unique(df.iloc[:, :col_position].values)
print(unique_values)

[0 1]


In [14]:
## Check if the unique values in the mulit labels is 0 and 1 only or not
unique_values = np.unique(df.iloc[:,col_position:].values)
print(unique_values)

['0' '1']


In [15]:
## Check to drop columns from the dataset with same values for all rows

# Subset the first 1836 columns
features = df.iloc[:, :col_position]

# Identify columns with more than one unique value
non_constant_cols = features.loc[:, features.nunique(dropna=False) > 1]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Concatenate with the remaining part of the DataFrame (e.g., label columns)
df_cleaned = pd.concat([non_constant_cols, df.iloc[:, col_position:]], axis=1)

print("Shape of df_cleaned :", df_cleaned.shape)

Shape of the non_constant_cols : (662, 112)
Shape of df_cleaned : (662, 139)


In [16]:
## WE REMOVE A LOT OF COLUMNS WITH SAME VALUE ACROSS ALL ROWS. ALMOST 90% OF THE COLUMNS ARE GONE

col_position = non_constant_cols.shape[1]

col_position

112

In [17]:
df_cleaned.iloc[:,col_position:].head(9)

Unnamed: 0,PDOC00154,PDOC00343,PDOC00271,PDOC00064,PDOC00791,PDOC00380,PDOC50007,PDOC00224,PDOC00100,PDOC00670,...,PDOC00662,PDOC00018,PDOC50001,PDOC00014,PDOC00750,PDOC50196,PDOC50199,PDOC00660,PDOC00653,PDOC00030
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_cleaned.iloc[:,:col_position+1].columns

Index(['PS00014', 'PS00017', 'PS00018', 'PS00027', 'PS00066', 'PS00120',
       'PS00136', 'PS00190', 'PS00197', 'PS00215',
       ...
       'PS50322', 'PS50323', 'PS50324', 'PS50325', 'PS50326', 'PS50328',
       'PS50600', 'PS50804', 'PS50821', 'PDOC00154'],
      dtype='object', length=113)

In [19]:
##  Rename the co-variate feature space of the dataset

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(col_position)]

# Assign the new names to the first 1836 columns
df_cleaned.columns.values[:col_position] = new_feature_names

In [20]:
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,PDOC00662,PDOC00018,PDOC50001,PDOC00014,PDOC00750,PDOC50196,PDOC50199,PDOC00660,PDOC00653,PDOC00030
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
## Dataset is highly imbalanced

# Slice only the label columns
label_data = df_cleaned.iloc[:, col_position:]

# Calculate proportions of 0's and 1's for each column
for col in label_data.columns:
    value_counts = label_data[col].value_counts(normalize=True).sort_index()
    prop_0 = value_counts.get(0, 0)
    prop_1 = value_counts.get(1, 0)
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

PDOC00154: 0's = 0.8807, 1's = 0.1193
PDOC00343: 0's = 0.8852, 1's = 0.1148
PDOC00271: 0's = 0.9063, 1's = 0.0937
PDOC00064: 0's = 0.9260, 1's = 0.0740
PDOC00791: 0's = 0.7417, 1's = 0.2583
PDOC00380: 0's = 0.9653, 1's = 0.0347
PDOC50007: 0's = 0.9532, 1's = 0.0468
PDOC00224: 0's = 0.9230, 1's = 0.0770
PDOC00100: 0's = 0.9909, 1's = 0.0091
PDOC00670: 0's = 0.9003, 1's = 0.0997
PDOC50002: 0's = 0.9502, 1's = 0.0498
PDOC50106: 0's = 0.9562, 1's = 0.0438
PDOC00561: 0's = 0.9456, 1's = 0.0544
PDOC50017: 0's = 0.9789, 1's = 0.0211
PDOC50003: 0's = 0.9789, 1's = 0.0211
PDOC50006: 0's = 0.9940, 1's = 0.0060
PDOC50156: 0's = 0.9743, 1's = 0.0257
PDOC00662: 0's = 0.9381, 1's = 0.0619
PDOC00018: 0's = 0.9864, 1's = 0.0136
PDOC50001: 0's = 0.9924, 1's = 0.0076
PDOC00014: 0's = 0.9970, 1's = 0.0030
PDOC00750: 0's = 0.9955, 1's = 0.0045
PDOC50196: 0's = 0.9970, 1's = 0.0030
PDOC50199: 0's = 0.9985, 1's = 0.0015
PDOC00660: 0's = 0.9985, 1's = 0.0015
PDOC00653: 0's = 0.9985, 1's = 0.0015
PDOC00030: 0

In [22]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = df_cleaned.shape[1]

# Generate new names for response variables
num_targets = total_cols - col_position
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names
df_cleaned.columns.values[col_position:] = new_target_names

In [23]:
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_18,target_19,target_20,target_21,target_22,target_23,target_24,target_25,target_26,target_27
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df_cleaned.to_csv(path + "/Final_dataset.csv", index=False)

In [25]:
df_cleaned.columns = [str(col) for col in df_cleaned.columns]

In [26]:
# Save as Parquet (compressed with snappy)
df_cleaned.to_parquet(
    path + "/processed_dataset.parquet", 
    engine="pyarrow", 
    compression="snappy", 
    index=False
)