## Simple Partition

In [1]:
import pandas as pd
import numpy as np

In [7]:
# Read the CSV file
df = pd.read_csv("../Data/labelling-round_1.csv")
print(f"Original data has {len(df)} rows.")

Original data has 100 rows.


In [8]:
# Prepare list containers for splits (one list per file)
split_dfs = [[] for _ in range(4)]

# Process each label group separately
for label, group in df.groupby("roberta_label"):
    # Shuffle the rows within the group for randomness (set random_state for reproducibility)
    group_shuffled = group.sample(frac=1, random_state=42)
    # Split the group into 4 parts as evenly as possible
    group_splits = np.array_split(group_shuffled, 4)
    
    # Append each split to its corresponding container
    for i, split in enumerate(group_splits):
        split_dfs[i].append(split)

# Concatenate the splits for each of the 4 sets
final_splits = [pd.concat(parts) for parts in split_dfs]

# (Optional) Shuffle each final split to mix rows from different labels
final_splits = [split.sample(frac=1, random_state=42).reset_index(drop=True) for split in final_splits]

  return bound(*args, **kwds)


In [9]:
# Verify that all rows are included
total_rows = sum(len(split) for split in final_splits)
assert total_rows == len(df), "Row count mismatch after splitting!"
print("Stratified splitting complete. Each split maintains the original distribution.")

Stratified splitting complete. Each split maintains the original distribution.


In [10]:
output_files = [
    "labelling-round_1_JJ.csv",
    "labelling-round_1_AG.csv",
    "labelling-round_1_AJ.csv",
    "labelling-round_1_ST.csv"
]

# Save each split to its corresponding CSV file without modifying the content/fields
for file, split_df in zip(output_files, final_splits):
    split_df.to_csv(file, index=False)
    print(f"Saved {len(split_df)} rows to {file}")

print("All files saved successfully!")

Saved 26 rows to labelling-round_1_JJ.csv
Saved 26 rows to labelling-round_1_AG.csv
Saved 25 rows to labelling-round_1_AJ.csv
Saved 23 rows to labelling-round_1_ST.csv
All files saved successfully!


## Whole Balanced Partition

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../Data/labelled_data_1.csv")

In [2]:
# Filter out rows where similarity <= 0
df_filtered = df[df['similarity'] > 0]

# Print the number of records left
print(f"Number of records after filtering: {len(df_filtered)}")

Number of records after filtering: 4712


In [3]:
# Print the unique classes in 'label_1'
print("Unique classes in label_1:", df_filtered['label_1'].unique())

Unique classes in label_1: ['negative' 'neutral' 'positive']


In [4]:
# Ensure we have at least 200 samples per class
class_counts = df_filtered['label_1'].value_counts()
print("Class counts after filtering:\n", class_counts)

Class counts after filtering:
 label_1
neutral     2169
negative    1741
positive     802
Name: count, dtype: int64


In [5]:
# Sample 200 from each class
df_balanced = df_filtered.groupby('label_1').apply(lambda x: x.sample(n=200, random_state=42)).reset_index(drop=True)

  df_balanced = df_filtered.groupby('label_1').apply(lambda x: x.sample(n=200, random_state=42)).reset_index(drop=True)


In [6]:
# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into four equal sets of 150 records each (50 per class)
dfs = []
for i in range(4):
    df_subset = df_balanced.groupby('label_1').apply(lambda x: x.iloc[i*50:(i+1)*50]).reset_index(drop=True)
    dfs.append(df_subset)

  df_subset = df_balanced.groupby('label_1').apply(lambda x: x.iloc[i*50:(i+1)*50]).reset_index(drop=True)
  df_subset = df_balanced.groupby('label_1').apply(lambda x: x.iloc[i*50:(i+1)*50]).reset_index(drop=True)
  df_subset = df_balanced.groupby('label_1').apply(lambda x: x.iloc[i*50:(i+1)*50]).reset_index(drop=True)
  df_subset = df_balanced.groupby('label_1').apply(lambda x: x.iloc[i*50:(i+1)*50]).reset_index(drop=True)


In [7]:
# Define file names
file_names = ["../Data/manual_labelSet_AM.csv", "../Data/manual_labelSet_JJ.csv", "../Data/manual_labelSet_ST.csv", "../Data/manual_labelSet_AG.csv"]

# Save each subset as a CSV file
for df_part, name in zip(dfs, file_names):
    df_part.to_csv(name, index=False)
    print(f"Saved {name} with {len(df_part)} records.")

print("Processing complete!")

Saved ../Data/manual_labelSet_AM.csv with 150 records.
Saved ../Data/manual_labelSet_JJ.csv with 150 records.
Saved ../Data/manual_labelSet_ST.csv with 150 records.
Saved ../Data/manual_labelSet_AG.csv with 150 records.
Processing complete!
