In [1]:
! pip install liac-arff

Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25ldone
[?25h  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11731 sha256=678ceac815a25266fcf53a5b0ffe78bccd087589bc21ce75bc7af8d0bfae12d6
  Stored in directory: /home/ubuntu/.cache/pip/wheels/a2/de/68/bf3972de3ecb31e32bef59a7f4c75f0687a3674c476b347c14
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.5.0


In [47]:
! pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.0 MB)
[K     |████████████████████████████████| 40.0 MB 5.3 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-17.0.0


In [2]:
## Checks to be executed for each dataset
## 1. The unique values across the entire dataset should be 0 or 1 only
## 2. There should NOT be any columns in the co-variate space whose values are all equal. Let's remove those columns

In [3]:
import arff
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

In [4]:
# Dataset directory. Keep the trailing "/": the load cell builds the file name by plain string concatenation.
path = "/repos/smote_msfb/public_datasets/enron/"

In [5]:
# Read the single-file ARFF export of the dataset
with open(path + 'enron.arff', 'r') as f:
    dataset = arff.load(f)

# Pull the attribute descriptors and the data rows out of the parsed result
attributes = dataset['attributes']
data = dataset['data']

# The first element of every attribute entry is used as the column name
columns = [name for name, *_ in attributes]
df = pd.DataFrame(data, columns=columns)

In [6]:
# Report the size of the loaded frame as (rows, columns); the file loaded by this notebook is enron.arff
print("Shape of the enron.arff file :", df.shape)

Shape of the delicious.arff file : (1702, 1054)


In [7]:
### Get the position of the first label column
# Later cells slice on this value: columns before it are used as features, columns from it onward as labels.
col_position = df.columns.get_loc("A.A8")  ## The first label column in the dataset
print(col_position)

1001


In [8]:
# List every column name of the raw frame
df.columns

Index(['0', '00', '000', '01', '02', '03', '04', '05', '06', '07',
       ...
       'C.C3', 'D.D10', 'D.D18', 'B.B13', 'D.D17', 'B.B10', 'C.C1', 'D.D4',
       'C.C13', 'D.D14'],
      dtype='object', length=1054)

In [9]:
# Column names of the feature block (everything before the first label column)
df.iloc[:,:col_position].columns

Index(['0', '00', '000', '01', '02', '03', '04', '05', '06', '07',
       ...
       'workers', 'working', 'world', 'writer', 'writers', 'www', 'year',
       'years', 'yesterday', 'york'],
      dtype='object', length=1001)

In [10]:
## Check whether the unique values across the entire co-variate space are only 0 and 1
# With this dataset the values print as the strings '0' and '1'; the cast to int happens in a later cell.
unique_values = np.unique(df.iloc[:, :col_position].values)
print(unique_values)

['0' '1']


In [11]:
## Check whether the unique values in the multi-label columns are only 0 and 1
# Same check as for the features, applied to the columns from the first label column onward.
unique_values = np.unique(df.iloc[:,col_position:].values)
print(unique_values)

['0' '1']


In [12]:
## Drop co-variate columns that hold the same value in every row

# Feature block: every column to the left of the first label column
features = df.iloc[:, :col_position]

# Keep the columns with more than one distinct value (NaN is counted as a value)
non_constant_cols = features.loc[:, features.nunique(dropna=False).gt(1)]

print("Shape of the non_constant_cols :", non_constant_cols.shape)

# Re-attach the label columns to the right of the surviving feature columns.
# NOTE: later cells slice df_cleaned by col_position, which only marks the label
# boundary in df_cleaned when no feature column was dropped here.
df_cleaned = pd.concat([non_constant_cols, df.iloc[:, col_position:]], axis=1)

print("Shape of df_cleaned :", df_cleaned.shape)

Shape of the non_constant_cols : (1702, 1001)
Shape of df_cleaned : (1702, 1054)


In [13]:
## No columns were deleted during constant value checks

In [14]:
# Feature column names as they appear in the raw df, shown before the rename applied to df_cleaned
df.iloc[:,:col_position].columns

Index(['0', '00', '000', '01', '02', '03', '04', '05', '06', '07',
       ...
       'workers', 'working', 'world', 'writer', 'writers', 'www', 'year',
       'years', 'yesterday', 'york'],
      dtype='object', length=1001)

In [15]:
##  Rename the co-variate feature space of the dataset

# Generate new column names: f_0 ... f_{col_position - 1}
new_feature_names = [f"f_{i}" for i in range(col_position)]

# Assign a complete new label list: the new feature names followed by the untouched label names.
# An Index is immutable; writing into columns.values changes its backing array behind pandas'
# back and can leave label lookups out of sync with the displayed names.
# NOTE: the positional rename assumes the first col_position columns of df_cleaned are all features,
# i.e. that no feature column was dropped when df_cleaned was built.
df_cleaned.columns = new_feature_names + list(df_cleaned.columns[col_position:])

In [16]:
# Preview the first three rows of the cleaned frame
df_cleaned.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,C.C3,D.D10,D.D18,B.B13,D.D17,B.B10,C.C1,D.D4,C.C13,D.D14
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Cast every column to integers; later cells look values up with the integer keys 0 and 1.
df_cleaned = df_cleaned.astype(int)

In [39]:
## Dataset is highly imbalanced

# Label block: all columns from the first label position onward
label_data = df_cleaned.iloc[:, col_position:]

# Print the share of 0's and 1's in each label column (a value that never occurs is reported as 0)
for col in label_data.columns:
    value_counts = label_data[col].value_counts(normalize=True).sort_index()
    prop_0, prop_1 = (value_counts.get(value, 0) for value in (0, 1))
    print(f"{col}: 0's = {prop_0:.4f}, 1's = {prop_1:.4f}")

target_1: 0's = 0.9847, 1's = 0.0153
target_2: 0's = 0.9624, 1's = 0.0376
target_3: 0's = 0.9965, 1's = 0.0035
target_4: 0's = 0.9847, 1's = 0.0153
target_5: 0's = 0.9365, 1's = 0.0635
target_6: 0's = 0.9512, 1's = 0.0488
target_7: 0's = 0.4636, 1's = 0.5364
target_8: 0's = 0.9753, 1's = 0.0247
target_9: 0's = 0.9941, 1's = 0.0059
target_10: 0's = 0.9853, 1's = 0.0147
target_11: 0's = 0.9929, 1's = 0.0071
target_12: 0's = 0.6868, 1's = 0.3132
target_13: 0's = 0.9266, 1's = 0.0734
target_14: 0's = 0.9031, 1's = 0.0969
target_15: 0's = 0.4976, 1's = 0.5024
target_16: 0's = 0.9894, 1's = 0.0106
target_17: 0's = 0.9947, 1's = 0.0053
target_18: 0's = 0.9871, 1's = 0.0129
target_19: 0's = 0.9777, 1's = 0.0223
target_20: 0's = 0.9877, 1's = 0.0123
target_21: 0's = 0.9371, 1's = 0.0629
target_22: 0's = 0.8966, 1's = 0.1034
target_23: 0's = 0.9677, 1's = 0.0323
target_24: 0's = 0.9436, 1's = 0.0564
target_25: 0's = 0.9548, 1's = 0.0452
target_26: 0's = 0.6005, 1's = 0.3995
target_27: 0's = 0.99

In [40]:
## Rename all the response variables as target_X

# Total number of columns
total_cols = df_cleaned.shape[1]

# One new name per label column, numbered from 1 in their existing left-to-right order
num_targets = total_cols - col_position
new_target_names = [f"target_{i+1}" for i in range(num_targets)]

# Assign a complete new label list: the existing feature names followed by the new target names.
# An Index is immutable; writing into columns.values changes its backing array behind pandas'
# back and can leave label lookups out of sync with the displayed names.
df_cleaned.columns = list(df_cleaned.columns[:col_position]) + new_target_names

In [41]:
# Turn every column label into a plain string; tuple labels are flattened by joining their parts with "_".
df_cleaned.columns = [
    "_".join(str(part) for part in label) if isinstance(label, tuple) else str(label)
    for label in df_cleaned.columns
]

In [42]:
## Some targets have more 1's than 0's, i.e. the intended minority class (1) outnumbers the majority class (0).
## For each such target, randomly flip 1's to 0 until the number of 1's equals 25% of the pre-flip number of 0's.
## The flipped rows are then counted as 0's, so the printed post ratio comes out below 0.25.

for target_col in [name for name in df_cleaned.columns if name.startswith("target_")]:
    counts = df_cleaned[target_col].value_counts()
    num_majority = counts.get(0, 0)
    num_minority = counts.get(1, 0)

    # Nothing to do unless the 1's outnumber the 0's
    if num_minority <= num_majority:
        continue

    # Pre ratio (1's per 0 before flipping); infinite when the column has no 0's at all
    if num_majority > 0:
        pre_ratio = num_minority / num_majority
    else:
        pre_ratio = float('inf')

    # Target number of 1's, truncated to an integer
    desired_minority = int(0.25 * num_majority)

    # Draw the surplus 1-rows with a fixed seed and keep their index labels
    num_to_flip = num_minority - desired_minority
    is_positive = df_cleaned[target_col] == 1
    minority_indices = df_cleaned[is_positive].sample(n=num_to_flip, random_state=42).index

    # Flip selected 1's to 0
    df_cleaned.loc[minority_indices, target_col] = 0

    # Post ratio, recomputed from the updated column
    counts_post = df_cleaned[target_col].value_counts()
    post_ratio = counts_post.get(1, 0) / counts_post.get(0, 1)

    print(f"Adjusted {target_col}: pre ratio={pre_ratio:.2f}, post ratio={post_ratio:.2f}")


Adjusted target_7: pre ratio=1.16, post ratio=0.13
Adjusted target_15: pre ratio=1.01, post ratio=0.14


In [43]:
# Collect counts for all target_* columns
counts_summary = {}

for col in df_cleaned.columns:
    if col.startswith("target_"):
        counts_summary[col] = df_cleaned[col].value_counts().to_dict()

# Convert to DataFrame for easy viewing: one row per target, one column per class value.
# The class columns are selected explicitly in the order 0, 1 before they are labelled, so the
# "0_count"/"1_count" headers cannot end up swapped, and a class that never occurs still gets
# a column (filled with 0) instead of breaking the two-name assignment.
summary_df = (
    pd.DataFrame(counts_summary).T
    .reindex(columns=[0, 1])
    .fillna(0)
    .astype(int)
)
summary_df.columns = ["0_count", "1_count"]

print(summary_df)

           0_count  1_count
target_1      1676       26
target_2      1638       64
target_3      1696        6
target_4      1676       26
target_5      1594      108
target_6      1619       83
target_7      1505      197
target_8      1660       42
target_9      1692       10
target_10     1677       25
target_11     1690       12
target_12     1169      533
target_13     1577      125
target_14     1537      165
target_15     1491      211
target_16     1684       18
target_17     1693        9
target_18     1680       22
target_19     1664       38
target_20     1681       21
target_21     1595      107
target_22     1526      176
target_23     1647       55
target_24     1606       96
target_25     1625       77
target_26     1022      680
target_27     1689       13
target_28     1695        7
target_29     1682       20
target_30     1578      124
target_31     1699        3
target_32     1639       63
target_33     1695        7
target_34     1677       25
target_35     1680  

In [44]:
# Write the cleaned dataset as CSV into the dataset directory, without the row index.
# NOTE: path already ends with "/", so the resulting file name contains "//"; confirm this is acceptable on the target filesystem.
df_cleaned.to_csv(path + "/Final_dataset.csv", index=False)

In [45]:
# Make sure every column label is a plain str before the Parquet export.
# NOTE(review): presumably needed because to_parquet expects string column names — confirm.
df_cleaned.columns = [str(col) for col in df_cleaned.columns]

In [48]:
# Save as Parquet (compressed with snappy) into the dataset directory, using the pyarrow engine.
# The row index is not written. As with the CSV export, path + "/..." yields a double slash.
df_cleaned.to_parquet(
    path + "/processed_dataset.parquet", 
    engine="pyarrow", 
    compression="snappy", 
    index=False
)