In [1]:
! pip install liac-arff



In [2]:
! pip install --user pandas pyarrow fastparquet



In [3]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the covariate space whose values are all equal. Such constant columns should be removed

In [4]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

import sys
# Make the project's helper_functions directory importable before importing from it.
# NOTE(review): this is an absolute, machine-specific path — adjust it if the repo lives elsewhere.
sys.path.append('/repos/smote_msfb/helper_functions')
from summarize_dataset_with_hubness import summarize_dataset_with_hubness


In [5]:
# Dataset directory: the raw bibtex.arff is read from here and the processed
# parquet file is written back here. The trailing slash matters because later
# cells build file names by plain string concatenation (path + "file").
path = "/repos/smote_msfb/public_datasets/bibtex/"

In [6]:
# Parse the single-file ARFF dump of the bibtex dataset.
with open(path + 'bibtex.arff', 'r') as f:
    dataset = arff.load(f)

# The parsed result exposes the rows under 'data' and the (name, type)
# attribute declarations under 'attributes'.
data, attributes = dataset['data'], dataset['attributes']

# Column labels are the attribute names, in declaration order.
columns = [name for name, _ in attributes]

# Assemble everything into one DataFrame (features first, then labels).
bibtex_data = pd.DataFrame(data, columns=columns)

In [7]:
# Report (rows, columns) of the freshly loaded frame as a quick sanity check.
bibtex_shape = bibtex_data.shape
print("Shape of the bibtex_data file :", bibtex_shape)

Shape of the bibtex_data file : (7395, 1995)


In [8]:
# Find where the label block starts: "TAG_2005" is the first label column,
# so every column before this position belongs to the feature space.
first_label_name = "TAG_2005"
col_position = bibtex_data.columns.get_loc(first_label_name)
print(col_position)

1836


In [9]:
# Display the full column index (word features followed by TAG_* labels).
bibtex_data.columns

Index(['0', '000', '02', '05', '06', '1', '10', '100', '11', '12',
       ...
       'TAG_topic7', 'TAG_topic8', 'TAG_topic9', 'TAG_toread',
       'TAG_transition', 'TAG_visual', 'TAG_visualization', 'TAG_web',
       'TAG_web20', 'TAG_wiki'],
      dtype='object', length=1995)

In [10]:
# Display only the feature columns. The split point comes from col_position
# (index of the first label column) instead of a repeated literal 1836.
bibtex_data.iloc[:, :col_position].columns

Index(['0', '000', '02', '05', '06', '1', '10', '100', '11', '12',
       ...
       'years', 'yet', 'yield', 'yields', 'you', 'young', 'z', 'zero', 'zu',
       'zur'],
      dtype='object', length=1836)

In [11]:
## Check 1 (features): the unique values across the covariate space should be 0 and 1 only.
## The feature block is every column before col_position (the first label column),
## so the split follows the data rather than a hard-coded 1836.
unique_values = np.unique(bibtex_data.iloc[:, :col_position].values)
print(unique_values)

['0' '1']


In [12]:
## Check 1 (labels): the unique values across the multi-label block should be 0 and 1 only.
## The label block starts AT col_position (the "TAG_2005" column), so the slice
## must begin there; starting one column later would leave the first label unchecked.
unique_values = np.unique(bibtex_data.iloc[:, col_position:].values)
print(unique_values)

['0' '1']


In [13]:
## Rename the covariate feature space of the dataset to f_0 ... f_{n-1}

# One generic name per feature column; col_position is the number of feature
# columns because it is the index of the first label column.
new_feature_names = [f"f_{i}" for i in range(col_position)]

# Assign a complete new label list instead of writing into `columns.values`.
# A pandas Index is meant to be immutable: mutating its backing array in place
# can leave its lookup cache stale (it was already hashed by get_loc above) and
# fails outright when the array is read-only. The label names are left as-is.
bibtex_data.columns = new_feature_names + bibtex_data.columns[col_position:].tolist()

In [14]:
# Preview the first rows to confirm the feature columns now carry f_<i> names.
bibtex_data.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,TAG_topic7,TAG_topic8,TAG_topic9,TAG_toread,TAG_transition,TAG_visual,TAG_visualization,TAG_web,TAG_web20,TAG_wiki
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# The uniqueness checks above printed the values as strings ('0'/'1');
# convert the whole frame to integers so numeric comparisons below work.
bibtex_data = bibtex_data.astype(int)

In [16]:
### Remove the highly imbalanced responses

# Minimum share of 1s a label must have to be kept (3%).
MIN_POSITIVE_RATE = 0.03

# 1. Slice only the response variable columns (binary 0/1); they start at
#    col_position, the index of the first label column.
response_data = bibtex_data.iloc[:, col_position:]

# 2. Proportion of 1s for every response column at once (Series indexed by label name).
positive_rate = (response_data == 1).mean()

# 3. Keep the labels whose proportion of 1s is >= MIN_POSITIVE_RATE, in original column order.
columns_to_keep = positive_rate[positive_rate >= MIN_POSITIVE_RATE].index.tolist()

# 4. Filter the response columns down to the kept labels.
filtered_response_data = response_data[columns_to_keep]

In [17]:
# Preview the unfiltered label block (all TAG_* columns).
response_data.head(3)

Unnamed: 0,TAG_2005,TAG_2006,TAG_2007,TAG_agdetection,TAG_algorithms,TAG_amperometry,TAG_analysis,TAG_and,TAG_annotation,TAG_antibody,...,TAG_topic7,TAG_topic8,TAG_topic9,TAG_toread,TAG_transition,TAG_visual,TAG_visualization,TAG_web,TAG_web20,TAG_wiki
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Concatenate the feature columns (everything before col_position) with the
# filtered response columns, side by side.
final_bibtex_data = pd.concat(
    [bibtex_data.iloc[:, :col_position], filtered_response_data],
    axis=1,
)

# Print the proportion of 1s for each kept response column. One vectorised
# mean gives all proportions; the printed text is unchanged.
for col, prop_1 in (filtered_response_data == 1).mean().items():
    print(f"{col}: 1's proportion = {prop_1:.4f}")

TAG_apob: 1's proportion = 0.0396
TAG_bibteximport: 1's proportion = 0.0706
TAG_design: 1's proportion = 0.0315
TAG_evolution: 1's proportion = 0.0398
TAG_immunoassay: 1's proportion = 0.0338
TAG_learning: 1's proportion = 0.0415
TAG_model: 1's proportion = 0.0319
TAG_ontology: 1's proportion = 0.0315
TAG_semantic: 1's proportion = 0.0315
TAG_software: 1's proportion = 0.0599
TAG_statphys23: 1's proportion = 0.1409
TAG_web: 1's proportion = 0.0314


In [19]:
## Rename all the response variables as target_1 ... target_N

# Total number of columns
total_cols = final_bibtex_data.shape[1]

# Everything from col_position onwards is a response column.
num_targets = total_cols - col_position
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Assign a complete new label list instead of writing into `columns.values`.
# A pandas Index is meant to be immutable, and in-place edits of its backing
# array can leave lookup caches stale or fail on read-only arrays.
final_bibtex_data.columns = (
    final_bibtex_data.columns[:col_position].tolist() + new_target_names
)

In [20]:
# Preview the feature/target boundary: the last feature column followed by all
# renamed targets. The offset is derived from col_position rather than a literal.
final_bibtex_data.iloc[:, col_position - 1:].head(3)

Unnamed: 0,f_1835,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12
0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0


In [21]:
## Make sure every column label is a plain string before storing the data
final_bibtex_data.columns = final_bibtex_data.columns.map(str)

In [22]:
# Persist the processed frame next to the raw data as a snappy-compressed
# Parquet file (pyarrow engine); the row index is not written.
final_bibtex_data.to_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
    compression="snappy",
    index=False,
)

In [None]:
############# Create the summary metrics for this dataset #####################

In [23]:
## Reload the processed dataset from disk; it is the input for the summary metrics
data = pd.read_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
)

In [24]:
# Preview the reloaded frame to confirm the f_* / target_* columns round-tripped.
data.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Build the summary table with the project helper imported from helper_functions.
# NOTE(review): k=5 is presumably the neighbour count used for the hubness
# statistics — confirm against the helper's definition.
summary_df = summarize_dataset_with_hubness(data, k=5)

In [None]:
# Preview the first rows of the summary returned by the helper.
summary_df.head(3)