In [None]:
! pip install liac-arff

In [None]:
! pip install --user pandas pyarrow fastparquet

In [None]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the covariate space where every row has the same value. Let's remove those columns

In [7]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

import sys
sys.path.append('/repos/smote_msfb/helper_functions')
from summarize_dataset_with_hubness import summarize_dataset_with_hubness


In [6]:
# Directory holding the bibtex dataset files. Later cells build file names by appending to
# this string directly, so the trailing slash is required.
path = "/repos/smote_msfb/public_datasets/bibtex/"

In [None]:
# Parse the bibtex ARFF file (the full dataset lives in this single file)
with open(path + 'bibtex.arff', 'r') as f:
    dataset = arff.load(f)

# Pull the attribute declarations and the data rows out of the parsed result
attributes = dataset['attributes']
data = dataset['data']

# The column name is the first element of each attribute entry
columns = [attribute[0] for attribute in attributes]

# Assemble the rows into a DataFrame labelled with the attribute names
bibtex_data = pd.DataFrame(data=data, columns=columns)

In [None]:
# Report the (rows, columns) shape of the freshly loaded frame as a sanity check.
print("Shape of the bibtex_data file :", bibtex_data.shape)

In [None]:
### Get the position of the first label column
# "TAG_2005" is looked up by name. The cells below hard-code the feature/label split as
# 1836 — TODO confirm the value printed here matches that number.
col_position = bibtex_data.columns.get_loc("TAG_2005")
print(col_position)

In [None]:
# Display every column label of the loaded frame.
bibtex_data.columns

In [None]:
# Display the labels of the first 1836 columns (the covariate block)
bibtex_data.columns[:1836]

In [None]:
## Verify that the covariate block (first 1836 columns) contains only the values 0 and 1
unique_values = np.unique(bibtex_data.iloc[:, :1836].to_numpy())
print(unique_values)

In [None]:
## Check that the multi-label block contains only the values 0 and 1
# The covariates occupy column positions 0-1835, so the label block starts at position 1836.
# Slicing from 1837 would silently leave the first label column out of this check.
unique_values = np.unique(bibtex_data.iloc[:, 1836:].values)
print(unique_values)

In [None]:
## Remove covariate columns that hold the same value in every row

# The covariates are the first 1836 columns
features = bibtex_data.iloc[:, :1836]

# Keep only the covariate columns with more than one distinct value (NaN counts as a value)
non_constant_cols = features.iloc[:, (features.nunique(dropna=False) > 1).to_numpy()]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach everything from column position 1836 onwards (e.g., the label columns)
bibtex_data_cleaned = pd.concat(
    [non_constant_cols, bibtex_data.iloc[:, 1836:]],
    axis="columns",
)

print("Shape of bibtex_data_cleaned :", bibtex_data_cleaned.shape)

In [None]:
## No columns were deleted during constant value checks

In [None]:
# Display the labels of the first 1842 columns of the cleaned frame
bibtex_data_cleaned.columns[:1842]

In [None]:
##  Rename the covariate feature space of the dataset

# Generate the new names f_0 ... f_1835 for the 1836 covariate columns
new_feature_names = [f"f_{i}" for i in range(1836)]

# Assign the complete label list (new feature names followed by the untouched remaining
# labels) through the `columns` setter. Writing into `columns.values` would mutate the
# backing array of an immutable Index behind pandas' back: cached label lookups can go
# stale, and the write fails outright when that array is read-only.
# NOTE: this assumes the first 1836 columns are still the covariates, i.e. no constant
# column was dropped in the cleaning step.
bibtex_data_cleaned.columns = new_feature_names + list(bibtex_data_cleaned.columns[1836:])

In [None]:
# Preview the first three rows of the cleaned frame
bibtex_data_cleaned.iloc[:3]

In [None]:
## Dataset is highly imbalanced

# Everything from column position 1836 onwards is treated as a label column
label_data = bibtex_data_cleaned.iloc[:, 1836:]

# Print, for each label column, the share of rows equal to 0 and the share equal to 1
# NOTE(review): the lookups use the integer keys 0 and 1 — confirm the label values are
# stored as integers rather than as strings.
for col in label_data.columns:
    value_counts = label_data[col].value_counts(normalize=True).sort_index()
    prop_0, prop_1 = (value_counts.get(key, 0) for key in (0, 1))
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

In [None]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = bibtex_data_cleaned.shape[1]

# Every column after the first 1836 is a response variable; number them from 1
num_targets = total_cols - 1836
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Assign the complete label list (untouched first 1836 labels followed by the new target
# names) through the `columns` setter. Writing into `columns.values` would mutate the
# backing array of an immutable Index behind pandas' back: cached label lookups can go
# stale, and the write fails outright when that array is read-only.
bibtex_data_cleaned.columns = list(bibtex_data_cleaned.columns[:1836]) + new_target_names

In [None]:
# Preview the first three rows from column position 1835 onwards
# (the last covariate column followed by the response columns)
bibtex_data_cleaned.iloc[:3, 1835:]

In [None]:
## Rebuild the column labels as plain Python strings before storing the data
## (presumably so the Parquet writer receives string column names — TODO confirm)
bibtex_data_cleaned.columns = list(map(str, bibtex_data_cleaned.columns))

In [None]:
# Write the cleaned frame next to the raw data as a snappy-compressed Parquet file
# (pyarrow engine, row index not stored)
bibtex_data_cleaned.to_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
    compression="snappy",
    index=False,
)

In [None]:
############# Create the summary metrics for this dataset #####################

In [8]:
## Reload the processed dataset from Parquet as the input for the summary metrics
data = pd.read_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
)

In [None]:
# Compute the summary metrics for the reloaded dataset with the project helper.
# NOTE(review): `k=5` is presumably a neighbour count used by the hubness computation —
# confirm against summarize_dataset_with_hubness in helper_functions.
summary_df = summarize_dataset_with_hubness(data, k=5)



In [None]:
# Preview the first three rows of the summary (assumes the helper returns a DataFrame — TODO confirm).
summary_df.head(3)