In [None]:
! pip install liac-arff

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any column in the co-variate space whose values are all equal; such columns are removed

In [1]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
# Directory holding the corel5k files. The trailing slash is required because
# file names are appended to it with plain string concatenation.
path = "/repos/smote_msfb/public_datasets/corel5k/"

In [3]:
# Read the complete Corel5k dataset, which is stored in a single .arff file
with open(path + 'Corel5k.arff', 'r') as f:
    dataset = arff.load(f)

# Pull the attribute declarations and the data rows out of the parsed result
attributes = dataset['attributes']
data = dataset['data']

# The first element of every attribute entry is used as the column label
columns = [name for name, *_ in attributes]
df = pd.DataFrame(data, columns=columns)

In [4]:
# Report (rows, columns) of the loaded frame
corel5k_shape = df.shape
print("Shape of the corel5k file :", corel5k_shape)

Shape of the corel5k file : (5000, 873)


In [5]:
# Show the column labels of the freshly loaded frame
df.columns

Index(['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6',
       'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10',
       ...
       'reptile', 'snake', 'cougar', 'oahu', 'kauai', 'maui', 'school',
       'canoe', 'race', 'hawaii'],
      dtype='object', length=873)

In [6]:
### Find the position of the first label column ("city"); everything before it is the co-variate space
first_label_column = "city"
col_position = df.columns.get_loc(first_label_column)
print(col_position)

499


In [7]:
# Show the column labels again (df has not been modified since they were last displayed)
df.columns

Index(['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6',
       'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10',
       ...
       'reptile', 'snake', 'cougar', 'oahu', 'kauai', 'maui', 'school',
       'canoe', 'race', 'hawaii'],
      dtype='object', length=873)

In [8]:
# Labels of the co-variate columns, i.e. everything before the first label column
df.columns[:col_position]

Index(['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6',
       'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10',
       ...
       'Cluster490', 'Cluster491', 'Cluster492', 'Cluster493', 'Cluster494',
       'Cluster495', 'Cluster496', 'Cluster497', 'Cluster498', 'Cluster499'],
      dtype='object', length=499)

In [9]:
## Verify that the co-variate space contains no values other than 0 and 1
feature_block = df.iloc[:, :col_position]
unique_values = np.unique(feature_block.to_numpy())
print(unique_values)

['0' '1']


In [10]:
## Verify that the multi-label columns contain no values other than 0 and 1
label_block = df.iloc[:, col_position:]
unique_values = np.unique(label_block.to_numpy())
print(unique_values)

['0' '1']


In [11]:
## Drop co-variate columns that hold the same value on every row

# Co-variate block: every column before the first label column
features = df.iloc[:, :col_position]

# Keep the columns with at least two distinct values (NaN is counted as a value)
has_variation = features.nunique(dropna=False) > 1
non_constant_cols = features.loc[:, has_variation]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach the untouched label columns to the right of the surviving features.
# NOTE(review): if any column is dropped above, df_cleaned has fewer than
# col_position feature columns, so col_position no longer marks the
# feature/label boundary inside df_cleaned.
label_columns = df.iloc[:, col_position:]
df_cleaned = pd.concat([non_constant_cols, label_columns], axis=1)

print("Shape of df_cleaned :", df_cleaned.shape)

Shape of the non_constant_cols : (5000, 499)
Shape of df_cleaned : (5000, 873)


In [12]:
## No columns were deleted during constant value checks

In [13]:
# Co-variate column labels of the original frame df
df.columns[:col_position]

Index(['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6',
       'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10',
       ...
       'Cluster490', 'Cluster491', 'Cluster492', 'Cluster493', 'Cluster494',
       'Cluster495', 'Cluster496', 'Cluster497', 'Cluster498', 'Cluster499'],
      dtype='object', length=499)

In [14]:
##  Rename the co-variate feature space of the dataset to f_0 ... f_{n-1}

# The feature block of df_cleaned is exactly as wide as non_constant_cols.
# Using that width (instead of col_position) keeps the rename correct even
# when the constant-column check removed some features.
n_features = non_constant_cols.shape[1]

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(n_features)]

# Assign a complete new list of labels instead of writing into
# `df_cleaned.columns.values`: an Index is meant to be immutable, and mutating
# its buffer in place can silently change other objects that share it.
df_cleaned.columns = new_feature_names + list(df_cleaned.columns[n_features:])

In [15]:
# Preview the first three rows of df_cleaned
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,reptile,snake,cougar,oahu,kauai,maui,school,canoe,race,hawaii
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Minimum share of rows equal to 1 that a label needs in order to be kept
MIN_POSITIVE_RATE = 0.05

# Slice only the label columns: everything after the surviving feature columns.
# The width of non_constant_cols (rather than col_position) marks the boundary,
# so the slice stays correct even if constant feature columns were dropped.
label_data = df_cleaned.iloc[:, non_constant_cols.shape[1]:]

# The uniqueness check printed the label values as the strings '0' / '1'.
# Cast to int before counting: looking up value_counts() with the integer keys
# 0 / 1 on a string index only works through pandas' deprecated positional
# fallback, and it misreads any column that contains a single distinct value.
label_values = label_data.astype(int)
prop_0_by_label = (label_values == 0).mean()
prop_1_by_label = (label_values == 1).mean()

# Decide which label columns to keep: positive rate >= MIN_POSITIVE_RATE
keep_mask = (prop_1_by_label >= MIN_POSITIVE_RATE).to_numpy()
columns_to_keep = label_data.columns[keep_mask].tolist()

# Filter label_data down to the retained labels (values are left as loaded)
filtered_label_data = label_data.loc[:, keep_mask].copy()

# Print proportions of 0's and 1's for the remaining columns
for col, prop_0, prop_1 in zip(columns_to_keep, prop_0_by_label[keep_mask], prop_1_by_label[keep_mask]):
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

mountain: 0's = 0.9310, 1's = 0.0690
sky: 0's = 0.8024, 1's = 0.1976
water: 0's = 0.7760, 1's = 0.2240
clouds: 0's = 0.9440, 1's = 0.0560
tree: 0's = 0.8106, 1's = 0.1894
people: 0's = 0.8512, 1's = 0.1488
grass: 0's = 0.9006, 1's = 0.0994
buildings: 0's = 0.9076, 1's = 0.0924
snow: 0's = 0.9404, 1's = 0.0596
flowers: 0's = 0.9408, 1's = 0.0592
rocks: 0's = 0.9500, 1's = 0.0500


In [21]:
# Rename the retained label columns to target_1 ... target_k.
# filtered_label_data holds only the retained labels; columns_to_keep holds
# their original names in the same order.

# Generate new names for label columns
new_target_names = [f"target_{i+1}" for i in range(filtered_label_data.shape[1])]

# Original label name -> new target name
target_name_map = dict(zip(columns_to_keep, new_target_names))

# Rename the label columns in filtered_label_data
filtered_label_data.columns = new_target_names

# Update df_cleaned as well, renaming by NAME rather than by position.
# The retained labels are not the first k label columns of df_cleaned, so a
# positional write would give the target_* names to the wrong columns.
# `rename` also returns a new frame instead of mutating the shared column index.
df_cleaned = df_cleaned.rename(columns=target_name_map)

In [23]:
# Preview the first three rows of the retained, renamed label columns
filtered_label_data.head(3)

Unnamed: 0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11
0,1,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Concatenate the feature columns (left) with the filtered label columns (right).
# The feature block of df_cleaned is as wide as non_constant_cols; slicing by
# that width (not col_position) stays correct if constant features were dropped.
n_features = non_constant_cols.shape[1]
final_df = pd.concat([df_cleaned.iloc[:, :n_features], filtered_label_data], axis=1)

In [25]:
# Preview the first three rows of the final frame (features followed by target columns)
final_df.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Coerce every column label of df_cleaned to a plain string
df_cleaned.columns = list(map(str, df_cleaned.columns))

In [27]:
# Save as CSV
# Write final_df (renamed features + retained target columns). df_cleaned still
# carries every label column, including the ones filtered out by the
# positive-rate threshold, so it is not the processed dataset.
final_df.to_csv(
    path + "processed_dataset.csv",
    index=False
)

In [None]:
# Save as Parquet (compressed with snappy)
# Write final_df (renamed features + retained target columns). df_cleaned still
# carries every label column, including the ones filtered out by the
# positive-rate threshold, so it is not the processed dataset.
final_df.to_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
    compression="snappy",
    index=False
)