In [None]:
! pip install liac-arff

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the covariate space whose values are all equal. Remove any such columns.

In [1]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [2]:
# Directory containing the bibtex dataset; the .arff file is read from here
path = "/domino/datasets/local/CustomerSegmentation/public_datasets/bibtex/"

In [3]:
# Read the complete dataset from its single .arff file
arff_file = path + 'bibtex.arff'
with open(arff_file, 'r') as f:
    dataset = arff.load(f)

# Pull the row records and the attribute declarations out of the parsed file
data, attributes = dataset['data'], dataset['attributes']

# Build the DataFrame; each attribute's first element supplies the column name
columns = [attribute[0] for attribute in attributes]
bibtex_data = pd.DataFrame(data, columns=columns)

In [4]:
# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = bibtex_data.shape
print("Shape of the bibtex_data file :", dataset_shape)

Shape of the bibtex_data file : (7395, 1995)


In [5]:
### Locate the position of the first label column
first_label_name = "TAG_2005"
all_columns = bibtex_data.columns
col_position = all_columns.get_loc(first_label_name)
print(col_position)

1836


In [None]:
# Display the column labels of the loaded dataset
bibtex_data.columns

In [8]:
# Display the feature column labels: every column before the first label
# column, whose position was computed into `col_position`.
bibtex_data.iloc[:, :col_position].columns

Index(['0', '000', '02', '05', '06', '1', '10', '100', '11', '12',
       ...
       'years', 'yet', 'yield', 'yields', 'you', 'young', 'z', 'zero', 'zu',
       'zur'],
      dtype='object', length=1836)

In [9]:
## Check that the covariate (feature) space contains only the values 0 and 1
# Features are all columns before the first label column; slicing with
# `col_position` keeps this check aligned with the computed feature/label split.
unique_values = np.unique(bibtex_data.iloc[:, :col_position].values)
print(unique_values)

['0' '1']


In [10]:
## Check that the multi-label columns contain only the values 0 and 1
# Labels start AT `col_position` (the "TAG_2005" column). Starting the slice
# one column later would leave that first label out of the check.
unique_values = np.unique(bibtex_data.iloc[:, col_position:].values)
print(unique_values)

['0' '1']


In [11]:
## Drop feature columns that hold the same value in every row

# Feature block: every column before the first label column
features = bibtex_data.iloc[:, :col_position]

# Keep only the feature columns with more than one distinct value
# (dropna=False makes a missing value count as a distinct value)
non_constant_cols = features.loc[:, features.nunique(dropna=False) > 1]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach the label columns to the right of the retained features
bibtex_data_cleaned = pd.concat([non_constant_cols, bibtex_data.iloc[:, col_position:]], axis=1)

print("Shape of bibtex_data_cleaned :", bibtex_data_cleaned.shape)

Shape of the non_constant_cols : (7395, 1836)
Shape of bibtex_data_cleaned : (7395, 1995)


In [None]:
## No columns were deleted during constant value checks

In [13]:
# Display the first 1837 column labels of the cleaned frame: one more than the
# 1836 feature columns, so the first label column is shown as well
bibtex_data_cleaned.iloc[:,:1837].columns

Index(['0', '000', '02', '05', '06', '1', '10', '100', '11', '12',
       ...
       'yet', 'yield', 'yields', 'you', 'young', 'z', 'zero', 'zu', 'zur',
       'TAG_2005'],
      dtype='object', length=1837)

In [14]:
## Rename the covariate (feature) columns to f_0, f_1, ...

# Number of feature columns kept by the constant-column check; they occupy the
# leftmost positions of the cleaned frame.
n_features = non_constant_cols.shape[1]

# Generate new column names
new_feature_names = [f"f_{i}" for i in range(n_features)]

# Assign a complete new label list instead of writing into `columns.values`:
# an Index is meant to be immutable, and mutating its backing array in place
# is unsupported (it can leave cached lookups stale or silently do nothing).
bibtex_data_cleaned.columns = new_feature_names + list(bibtex_data_cleaned.columns[n_features:])

In [15]:
# Preview the first three rows of the cleaned frame
bibtex_data_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,TAG_topic7,TAG_topic8,TAG_topic9,TAG_toread,TAG_transition,TAG_visual,TAG_visualization,TAG_web,TAG_web20,TAG_wiki
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
## Dataset is highly imbalanced

# Slice only the label columns: everything to the right of the feature columns
# kept by the constant-column check.
n_features = non_constant_cols.shape[1]
label_data = bibtex_data_cleaned.iloc[:, n_features:]

# Calculate proportions of 0's and 1's for each column.
# The cells are loaded as the strings '0'/'1', so cast to int first; otherwise
# looking up the integer keys 0 and 1 only works through positional fallback,
# which mislabels a column that contains a single class.
for col in label_data.columns:
    proportions = (
        label_data[col]
        .astype(int)
        .value_counts(normalize=True)
        .reindex([0, 1], fill_value=0.0)  # an absent class gets proportion 0
    )
    prop_0 = proportions.loc[0]
    prop_1 = proportions.loc[1]
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

TAG_2005: 0's = 0.9907, 1's = 0.0093
TAG_2006: 0's = 0.9822, 1's = 0.0178
TAG_2007: 0's = 0.9904, 1's = 0.0096
TAG_agdetection: 0's = 0.9859, 1's = 0.0141
TAG_algorithms: 0's = 0.9923, 1's = 0.0077
TAG_amperometry: 0's = 0.9867, 1's = 0.0133
TAG_analysis: 0's = 0.9723, 1's = 0.0277
TAG_and: 0's = 0.9889, 1's = 0.0111
TAG_annotation: 0's = 0.9918, 1's = 0.0082
TAG_antibody: 0's = 0.9817, 1's = 0.0183
TAG_apob: 0's = 0.9604, 1's = 0.0396
TAG_architecture: 0's = 0.9874, 1's = 0.0126
TAG_article: 0's = 0.9918, 1's = 0.0082
TAG_bettasplendens: 0's = 0.9801, 1's = 0.0199
TAG_bibteximport: 0's = 0.9294, 1's = 0.0706
TAG_book: 0's = 0.9870, 1's = 0.0130
TAG_children: 0's = 0.9861, 1's = 0.0139
TAG_classification: 0's = 0.9918, 1's = 0.0082
TAG_clustering: 0's = 0.9823, 1's = 0.0177
TAG_cognition: 0's = 0.9869, 1's = 0.0131
TAG_collaboration: 0's = 0.9900, 1's = 0.0100
TAG_collaborative: 0's = 0.9843, 1's = 0.0157
TAG_community: 0's = 0.9827, 1's = 0.0173
TAG_competition: 0's = 0.9897, 1's = 0.

In [19]:
## Rename all the response variables as target_X

# Number of feature columns kept by the constant-column check; everything to
# their right in the cleaned frame is a response (label) column.
n_features = non_constant_cols.shape[1]

# Total number of columns
total_cols = bibtex_data_cleaned.shape[1]

# Generate new names for response variables (1-based: target_1, target_2, ...)
num_targets = total_cols - n_features
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Assign a complete new label list instead of writing into `columns.values`:
# an Index is meant to be immutable, and mutating its backing array in place
# is unsupported (it can leave cached lookups stale or silently do nothing).
bibtex_data_cleaned.columns = list(bibtex_data_cleaned.columns[:n_features]) + new_target_names

In [20]:
# Preview the first three rows of the cleaned frame
bibtex_data_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_150,target_151,target_152,target_153,target_154,target_155,target_156,target_157,target_158,target_159
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Write the cleaned, renamed dataset to CSV (without the row index) in the
# dataset directory; reuse `path` so the location is defined in one place.
bibtex_data_cleaned.to_csv(path + "Final_dataset.csv", index=False)