In [None]:
! pip install liac-arff

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the co-variate space whose values are all equal. Let's remove those columns

In [1]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
# Directory that holds the delicious dataset files. It is a plain string ending
# in "/" because other cells build file names by concatenating onto it directly.
path = "/repos/smote_msfb/public_datasets/delicious/"

In [3]:
# Parse the single-file version of the dataset (all rows live in delicious.arff)
arff_file = path + 'delicious.arff'
with open(arff_file, 'r') as f:
    dataset = arff.load(f)

# Pull the row data and the attribute declarations out of the parsed result
data, attributes = dataset['data'], dataset['attributes']

# Build the DataFrame; each column is named after the first item of its attribute entry
columns = [entry[0] for entry in attributes]
df = pd.DataFrame(data, columns=columns)

In [4]:
# Print the (rows, columns) shape of the freshly loaded DataFrame.
print("Shape of the delicious.arff file :", df.shape)

Shape of the delicious.arff file : (16105, 1483)


In [5]:
### Locate the first label column, "TAG_.imported". Other cells slice the columns
### before this position as the co-variates and the columns from it onward as the labels.
col_position = df.columns.get_loc("TAG_.imported")
print(col_position)

500


In [6]:
# Display every column name of the loaded frame.
df.columns

Index(['_qacct', 'accessing', 'actionscript', 'activerecord', 'addoverlay',
       'afternoon', 'against', 'air', 'ajax_action', 'als',
       ...
       'TAG_words', 'TAG_work', 'TAG_world', 'TAG_wp', 'TAG_writing',
       'TAG_xhtml', 'TAG_xml', 'TAG_xp', 'TAG_yahoo', 'TAG_youtube'],
      dtype='object', length=1483)

In [7]:
# Names of the co-variate columns, i.e. everything located before the first label column
df.columns[:col_position]

Index(['_qacct', 'accessing', 'actionscript', 'activerecord', 'addoverlay',
       'afternoon', 'against', 'air', 'ajax_action', 'als',
       ...
       'writeln', 'x3c', 'x3e', 'xmldata', 'yay', 'yeah', 'zelenka', 'zu',
       'zum', 'zur'],
      dtype='object', length=500)

In [8]:
## Check 1 (co-variates): list the distinct values found anywhere in the feature columns;
## only the two binary levels 0 and 1 are expected
feature_block = df.iloc[:, :col_position]
unique_values = np.unique(feature_block.to_numpy())
print(unique_values)

['0' '1']


In [9]:
## Check 1 (multi-labels): list the distinct values found anywhere in the label columns;
## only the two binary levels 0 and 1 are expected
label_block = df.iloc[:, col_position:]
unique_values = np.unique(label_block.to_numpy())
print(unique_values)

['0' '1']


In [10]:
## Check 2: drop co-variate columns that hold the same value in every row

# Take the co-variate block: every column located before the first label column
features = df.iloc[:, :col_position]

# Count distinct values per column (NaN counts as a value) and keep only the
# columns that have more than one
distinct_per_column = features.nunique(dropna=False)
non_constant_cols = features.loc[:, distinct_per_column > 1]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach the untouched remainder of the DataFrame (the label columns) on the right
label_part = df.iloc[:, col_position:]
df_cleaned = pd.concat([non_constant_cols, label_part], axis=1)

print("Shape of df_cleaned :", df_cleaned.shape)

Shape of the non_constant_cols : (16105, 500)
Shape of df_cleaned : (16105, 1483)


In [11]:
## No columns were deleted during constant value checks

In [12]:
# Original names of the co-variate columns in the loaded frame
df.columns[:col_position]

Index(['_qacct', 'accessing', 'actionscript', 'activerecord', 'addoverlay',
       'afternoon', 'against', 'air', 'ajax_action', 'als',
       ...
       'writeln', 'x3c', 'x3e', 'xmldata', 'yay', 'yeah', 'zelenka', 'zu',
       'zum', 'zur'],
      dtype='object', length=500)

In [13]:
##  Rename the co-variate feature space of the dataset

# Number of feature columns actually present in df_cleaned. The label block was
# copied over unchanged from df, so it still spans df.shape[1] - col_position
# columns; everything before it in df_cleaned is the feature block. Using
# col_position itself would overwrite label names whenever the constant-column
# check had removed a feature.
num_features = df_cleaned.shape[1] - (df.shape[1] - col_position)

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(num_features)]

# Assign the new names to the leading feature columns. The full column list is
# reassigned instead of writing into df_cleaned.columns.values: a pandas Index
# is meant to be immutable, and in-place writes to its backing array can leave
# cached label lookups stale.
renamed_columns = df_cleaned.columns.tolist()
renamed_columns[:num_features] = new_feature_names
df_cleaned.columns = renamed_columns

In [14]:
# Preview the first three rows to confirm the feature columns now carry the f_<i> names.
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
## Dataset is highly imbalanced

# Slice only the label columns. Their count is taken from the original frame so
# the slice stays correct even if the constant-column check removed features.
num_labels = df.shape[1] - col_position
label_data = df_cleaned.iloc[:, df_cleaned.shape[1] - num_labels:]

# Calculate proportions of 0's and 1's for each column
for col in label_data.columns:
    # The label values may be stored as the strings '0'/'1'. Converting to
    # numbers first makes the integer keys below genuine label lookups rather
    # than relying on pandas' deprecated positional fallback, which would
    # misreport a column that contains only one of the two classes.
    value_counts = pd.to_numeric(label_data[col]).value_counts(normalize=True)
    prop_0 = value_counts.get(0, 0)
    prop_1 = value_counts.get(1, 0)
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

TAG_.imported: 0's = 0.9964, 1's = 0.0036
TAG_.net: 0's = 0.9807, 1's = 0.0193
TAG_2.0: 0's = 0.9870, 1's = 0.0130
TAG_2007: 0's = 0.9957, 1's = 0.0043
TAG_3d: 0's = 0.9852, 1's = 0.0148
TAG_??: 0's = 0.9952, 1's = 0.0048
TAG_???: 0's = 0.9970, 1's = 0.0030
TAG_????: 0's = 0.9973, 1's = 0.0027
TAG_academia: 0's = 0.9972, 1's = 0.0028
TAG_academic: 0's = 0.9855, 1's = 0.0145
TAG_access: 0's = 0.9973, 1's = 0.0027
TAG_accessibility: 0's = 0.9886, 1's = 0.0114
TAG_accessories: 0's = 0.9905, 1's = 0.0095
TAG_accounts: 0's = 0.9968, 1's = 0.0032
TAG_actionscript: 0's = 0.9922, 1's = 0.0078
TAG_activism: 0's = 0.9745, 1's = 0.0255
TAG_ad: 0's = 0.9942, 1's = 0.0058
TAG_addon: 0's = 0.9983, 1's = 0.0017
TAG_addons: 0's = 0.9987, 1's = 0.0013
TAG_admin: 0's = 0.9865, 1's = 0.0135
TAG_administration: 0's = 0.9925, 1's = 0.0075
TAG_adobe: 0's = 0.9889, 1's = 0.0111
TAG_ads: 0's = 0.9906, 1's = 0.0094
TAG_adsense: 0's = 0.9943, 1's = 0.0057
TAG_adult: 0's = 0.9974, 1's = 0.0026
TAG_advertising: 0

In [16]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = df_cleaned.shape[1]

# Number of response columns, counted on the original frame: the label block is
# carried into df_cleaned unchanged, while the feature block may have shrunk.
num_targets = df.shape[1] - col_position

# Position of the first response column inside df_cleaned
first_target = total_cols - num_targets

# Generate new names for response variables
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names by reassigning the full column list. Writing into
# df_cleaned.columns.values would mutate an Index that is meant to be immutable.
renamed_columns = df_cleaned.columns.tolist()
renamed_columns[first_target:] = new_target_names
df_cleaned.columns = renamed_columns

In [17]:
# Preview the first three rows to confirm the response columns now carry the target_<i> names.
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_974,target_975,target_976,target_977,target_978,target_979,target_980,target_981,target_982,target_983
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Force every column name to a plain str (presumably so the Parquet writer accepts them)
df_cleaned.columns = list(map(str, df_cleaned.columns))

In [19]:
# Write the cleaned frame to Parquet with the pyarrow engine (snappy-compressed, index not stored)
output_file = path + "processed_dataset.parquet"
df_cleaned.to_parquet(
    output_file,
    engine="pyarrow",
    compression="snappy",
    index=False,
)