In [None]:
! pip install liac-arff

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the co-variate space whose values are all equal. Such columns are removed.

In [6]:
import arff
import zipfile
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import io

In [4]:
# Absolute location of the zip archive that holds bookmarks.arff; adjust when running elsewhere
path = "/domino/datasets/local/CustomerSegmentation/public_datasets/bookmarks/bookmarks.zip"

In [7]:
# Read the .arff member straight out of the zip archive (nothing is extracted to disk)
with zipfile.ZipFile(path, mode='r') as archive:
    # Informational: show which members the archive contains
    print(archive.namelist())

    with archive.open('bookmarks.arff') as member:
        # The member is opened as a binary stream, so decode it as UTF-8 text
        # before handing it to the ARFF parser
        dataset = arff.load(io.TextIOWrapper(member, encoding='utf-8'))

# The first element of every attribute entry is used as the column name
columns = [attribute[0] for attribute in dataset['attributes']]
data = dataset['data']

# Assemble the rows and column names into a DataFrame
bookmarks_data = pd.DataFrame(data, columns=columns)

# Preview the first rows
print(bookmarks_data.head())

['bookmarks.arff']
   a aber ability absolutely abstract academic accept accepted access  \
0  0    0       0          1        0        0      0        0      0   
1  0    0       0          0        0        0      0        0      0   
2  0    0       0          0        0        0      0        0      0   
3  0    0       0          0        0        0      0        0      0   
4  0    0       0          0        0        0      0        0      0   

  accessible  ... TAG_web20 TAG_webdesign TAG_webdev TAG_webservice TAG_wiki  \
0          0  ...         0             0          0              0        0   
1          0  ...         0             0          0              0        0   
2          0  ...         0             0          0              0        0   
3          0  ...         0             0          0              0        0   
4          0  ...         0             0          0              0        0   

  TAG_wikipedia TAG_windows TAG_writing TAG_xml TAG_yahoo  
0

In [8]:
# Report the (rows, columns) shape of the loaded DataFrame
print("Shape of the bookmarks_data file :", bookmarks_data.shape)

Shape of the bookmarks_data file : (87856, 2358)


In [9]:
### Get the position of the first label column ("TAG_20"); the columns before it are the features
col_position = bookmarks_data.columns.get_loc("TAG_20")
print(col_position)

2150


In [11]:
# Names of the label columns (position 2150 onwards)
bookmarks_data.columns[2150:]

Index(['TAG_20', 'TAG_academic', 'TAG_ajax', 'TAG_all', 'TAG_allgemein',
       'TAG_api', 'TAG_apple', 'TAG_art', 'TAG_artery', 'TAG_article',
       ...
       'TAG_web20', 'TAG_webdesign', 'TAG_webdev', 'TAG_webservice',
       'TAG_wiki', 'TAG_wikipedia', 'TAG_windows', 'TAG_writing', 'TAG_xml',
       'TAG_yahoo'],
      dtype='object', length=208)

In [13]:
## Check that the co-variate (feature) space contains only the values 0 and 1

# The feature columns are everything before the first label column
unique_values = np.unique(bookmarks_data.iloc[:, :col_position].values)
print(unique_values)

# Fail loudly instead of relying on a visual check of the printed array.
# Values are compared as strings so the check holds whether they are stored
# as '0'/'1' text or as integers.
assert {str(value) for value in unique_values} <= {"0", "1"}, (
    f"Unexpected values in the feature columns: {unique_values}"
)

['0' '1']


In [14]:
## Check that the multi-label columns contain only the values 0 and 1

# The label columns start at the first label column and run to the end of the frame
unique_values = np.unique(bookmarks_data.iloc[:, col_position:].values)
print(unique_values)

# Fail loudly instead of relying on a visual check of the printed array.
# Values are compared as strings so the check holds whether they are stored
# as '0'/'1' text or as integers.
assert {str(value) for value in unique_values} <= {"0", "1"}, (
    f"Unexpected values in the label columns: {unique_values}"
)

['0' '1']


In [16]:
## Check to drop columns from the dataset with same values for all rows

# Subset the feature (co-variate) columns: everything before the first label column
features = bookmarks_data.iloc[:, :col_position]

# Keep only the columns with more than one unique value (NaN is counted as a value)
non_constant_cols = features.loc[:, features.nunique(dropna=False) > 1]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Concatenate with the remaining part of the DataFrame (the label columns)
bookmarks_data_cleaned = pd.concat(
    [non_constant_cols, bookmarks_data.iloc[:, col_position:]], axis=1
)

print("Shape of bookmarks_data_cleaned :", bookmarks_data_cleaned.shape)

Shape of the non_constant_cols : (87856, 2150)
Shape of bookmarks_data_cleaned : (87856, 2358)


In [None]:
## No columns were deleted during constant value checks

In [19]:
# Names of the first 2150 columns of the cleaned frame (the feature columns)
bookmarks_data_cleaned.columns[:2150]

Index(['a', 'aber', 'ability', 'absolutely', 'abstract', 'academic', 'accept',
       'accepted', 'access', 'accessible',
       ...
       'young', 'your', 'youtube', 'zeit', 'zip', 'zoom', 'zum', 'zur', 'zwei',
       'zwischen'],
      dtype='object', length=2150)

In [20]:
##  Rename the co-variate feature space of the dataset

# Number of feature columns kept by the constant-column filter. Taking it from
# the filtered frame keeps the rename correct even if columns had been dropped.
n_features = non_constant_cols.shape[1]

# Generate new column names: f_0, f_1, ...
new_feature_names = [f"f_{i}" for i in range(n_features)]

# Replace the feature names and keep the label names as they are.
# A complete new list is assigned to `.columns` instead of writing into
# `.columns.values`: pandas documents Index as immutable, and writing into its
# backing array bypasses pandas, so label lookups are not guaranteed to see
# the new names.
bookmarks_data_cleaned.columns = (
    new_feature_names + list(bookmarks_data_cleaned.columns[n_features:])
)

In [21]:
# Preview the first three rows after the feature-column rename
bookmarks_data_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,TAG_web20,TAG_webdesign,TAG_webdev,TAG_webservice,TAG_wiki,TAG_wikipedia,TAG_windows,TAG_writing,TAG_xml,TAG_yahoo
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
## Dataset is highly imbalanced

# Number of feature columns kept by the constant-column filter; the label
# columns start immediately after them.
n_features = non_constant_cols.shape[1]

# Slice only the label columns
label_data = bookmarks_data_cleaned.iloc[:, n_features:]

# Calculate proportions of 0's and 1's for each column.
# The loaded values are the strings '0'/'1' (see the np.unique output), so the
# column is cast to int before counting. Looking up the integer keys 0 and 1 is
# then a genuine label lookup instead of a positional fallback on a string
# index, and a column holding a single value is reported correctly.
for col in label_data.columns:
    value_counts = label_data[col].astype(int).value_counts(normalize=True)
    prop_0 = value_counts.get(0, 0.0)
    prop_1 = value_counts.get(1, 0.0)
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

TAG_20: 0's = 0.9946, 1's = 0.0054
TAG_academic: 0's = 0.9904, 1's = 0.0096
TAG_ajax: 0's = 0.9903, 1's = 0.0097
TAG_all: 0's = 0.9906, 1's = 0.0094
TAG_allgemein: 0's = 0.9800, 1's = 0.0200
TAG_api: 0's = 0.9949, 1's = 0.0051
TAG_apple: 0's = 0.9965, 1's = 0.0035
TAG_art: 0's = 0.9880, 1's = 0.0120
TAG_artery: 0's = 0.9965, 1's = 0.0035
TAG_article: 0's = 0.9833, 1's = 0.0167
TAG_audio: 0's = 0.9958, 1's = 0.0042
TAG_bibliography: 0's = 0.9957, 1's = 0.0043
TAG_bibliothek: 0's = 0.9949, 1's = 0.0051
TAG_blog: 0's = 0.9623, 1's = 0.0377
TAG_blogging: 0's = 0.9918, 1's = 0.0082
TAG_blogs: 0's = 0.9799, 1's = 0.0201
TAG_book: 0's = 0.9955, 1's = 0.0045
TAG_bookmarking: 0's = 0.9964, 1's = 0.0036
TAG_bookmarks: 0's = 0.9937, 1's = 0.0063
TAG_bookmarkstoolbarfolder: 0's = 0.9959, 1's = 0.0041
TAG_books: 0's = 0.9297, 1's = 0.0703
TAG_business: 0's = 0.9938, 1's = 0.0062
TAG_c: 0's = 0.9951, 1's = 0.0049
TAG_cardiology: 0's = 0.9959, 1's = 0.0041
TAG_cardiovascular: 0's = 0.9959, 1's = 0.00

In [23]:
## Rename all the response variables as target_X

# Number of feature columns kept by the constant-column filter; every column
# after them is a response (label) column.
n_features = non_constant_cols.shape[1]

# Total number of columns
total_cols = bookmarks_data_cleaned.shape[1]

# Generate new names for response variables (1-based: target_1 ... target_N)
num_targets = total_cols - n_features
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Apply the new names, leaving the feature names untouched.
# A complete new list is assigned to `.columns` instead of writing into
# `.columns.values`: pandas documents Index as immutable, and writing into its
# backing array bypasses pandas, so label lookups are not guaranteed to see
# the new names.
bookmarks_data_cleaned.columns = (
    list(bookmarks_data_cleaned.columns[:n_features]) + new_target_names
)

In [24]:
# Preview the first three rows after the label-column rename
bookmarks_data_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_199,target_200,target_201,target_202,target_203,target_204,target_205,target_206,target_207,target_208
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Sanity check: 2358 total columns minus 2150 feature columns = number of label columns
2358 - 2150

208

In [25]:
# Write the cleaned, renamed frame as a zip-compressed CSV, leaving out the row index
output_path = "/domino/datasets/local/CustomerSegmentation/public_datasets/bookmarks/Final_dataset.zip"
bookmarks_data_cleaned.to_csv(output_path, index=False, compression='zip')