In [1]:
import os
import h5py
import json

# Inspect a single example: print every top-level member of its HDF5 file and
# pretty-print its JSON labels file. Missing files are reported, not raised.

# Define paths
output_dir = "/home/aleksia/lex_sum_data"  # Update to your actual output directory
split = "train"
example_idx = "903"  # File stem of the example to inspect

# Paths for HDF5 embeddings and JSON labels
h5_path = os.path.join(output_dir, "data", split, f"{example_idx}.h5")
labels_path = os.path.join(output_dir, "labels", split, f"{example_idx}.json")

# Check if files exist
if not os.path.exists(h5_path):
    print(f"HDF5 file not found: {h5_path}")
else:
    # Open and display contents of the HDF5 file
    with h5py.File(h5_path, "r") as h5_file:
        print("HDF5 File Contents:")
        for dataset_name in h5_file:
            item = h5_file[dataset_name]
            if not isinstance(item, h5py.Dataset):
                # Groups have no shape or data of their own; asking for
                # `.shape` on one would raise, so just report it and move on.
                print(f"Group: {dataset_name} ({len(item)} members)")
                print("-" * 50)
                continue
            print(f"Dataset: {dataset_name}")
            print(f"Shape: {item.shape}")
            # A scalar dataset cannot be sliced with [:5]; read it whole instead.
            preview = item[:5] if item.ndim > 0 else item[()]
            print(f"Data (first 5 rows):\n{preview}")
            print("-" * 50)

if not os.path.exists(labels_path):
    print(f"JSON labels file not found: {labels_path}")
else:
    # Open and display contents of the JSON labels file.
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(labels_path, "r", encoding="utf-8") as f:
        labels_data = json.load(f)
    print("\nJSON Labels File Contents:")
    print(json.dumps(labels_data, indent=4))


HDF5 File Contents:
Dataset: source_0
Shape: (26, 768)
Data (first 5 rows):
[[-0.2049471   0.38836104  0.23387763 ... -0.08640356  0.06975134
   0.01288433]
 [-0.32270592  0.37768143  0.16106208 ... -0.08169508  0.07691093
   0.00725589]
 [-0.52412736  0.36452696  0.0812808  ... -0.2704392   0.19025093
  -0.21307722]
 [-0.43398756  0.37485993  0.10327204 ... -0.23631793  0.20257697
  -0.18480147]
 [-0.40260494  0.40734616  0.14345679 ... -0.26738396  0.19298941
  -0.17155594]]
--------------------------------------------------
Dataset: source_1
Shape: (87, 768)
Data (first 5 rows):
[[-0.19593905  0.22152603  0.24580285 ... -0.0783961   0.04808614
  -0.03643068]
 [-0.04790307 -0.10198247  0.29597938 ... -0.06362463 -0.00315236
  -0.087108  ]
 [-0.17410286  0.07787022  0.29758635 ... -0.09342032  0.03741929
   0.09231095]
 [-0.10867149  0.10004477  0.16838448 ... -0.02606332  0.01739989
  -0.05112527]
 [-0.06972384  0.12471316  0.14667866 ...  0.00514695  0.04087734
   0.0928589 ]]
-----

In [14]:
import os
import json

# Print the keys stored under the "long" summary of one tokenized example.

# Path to your tokenized dataset
tokenized_dataset_dir = "/home/aleksia/tokenized_dataset"
split = "train"  # Example split
example_id = "903"  # Example ID of a tokenized file

# Construct the file path
file_path = os.path.join(tokenized_dataset_dir, split, f"{example_id}.json")

# Load the tokenized JSON file
if os.path.exists(file_path):
    with open(file_path, "r") as file:
        tokenized_data = json.load(file)

    # Inspect the data.
    # `.get('long')` yields None when the example has no long summary, and
    # calling `.keys()` on None raises AttributeError, so check the type first.
    long_entry = tokenized_data.get('long') if isinstance(tokenized_data, dict) else None
    if isinstance(long_entry, dict):
        print(long_entry.keys())
    else:
        print(f"No 'long' summary found in {file_path}.")
else:
    print(f"File {file_path} does not exist.")


dict_keys(['text', 'chunk_0', 'chunk_1', 'chunk_2'])


In [11]:
import os
import json
from tqdm import tqdm

# Summarise one split of the labels directory: how many label files there are,
# how many of them have a "long" summary with more than one chunk, and the
# largest chunk count seen.

# Root of the labels tree and the split to scan
labels_dir = "/home/aleksia/lex_sum_data/labels"  # Replace with your labels directory
split = "validation"  # One of "train", "validation" or "test"

# Directory holding this split's label files
split_labels_dir = os.path.join(labels_dir, split)

# Running statistics
total_entries = 0
entries_with_multiple_chunks = 0
max_chunks = 0

# Walk every entry of the split directory, skipping anything that is not JSON
for filename in tqdm(os.listdir(split_labels_dir), desc=f"Processing {split} split"):
    if not filename.endswith(".json"):
        continue
    total_entries += 1

    # Parse the label file
    with open(os.path.join(split_labels_dir, filename), "r") as label_file:
        entry = json.load(label_file)

    # Only entries whose "long" summary carries a "chunks" field are counted
    long_summary = entry.get("long", {})
    if not long_summary or "chunks" not in long_summary:
        continue

    chunk_count = len(long_summary["chunks"])
    entries_with_multiple_chunks += int(chunk_count > 1)
    max_chunks = max(max_chunks, chunk_count)

# Report
print(f"Total entries in the {split} split: {total_entries}")
print(f"Entries with more than one chunk in the {split} split: {entries_with_multiple_chunks}")
print(f"Max number of chunks per entry: {max_chunks}")


Processing validation split: 100%|██████████| 187/187 [00:00<00:00, 2117.61it/s]

Total entries in the validation split: 187
Entries with more than one chunk in the validation split: 105
Max number of chunks per entry: 9





In [15]:
import os
import json
from tqdm import tqdm

# Delete, in place, every tokenized example whose "long" summary holds more
# than one "chunk_*" key, and keep a JSON log of what was deleted.
# WARNING: destructive - matching files are permanently removed from disk.

# Define directories
tokenized_dir = "/home/aleksia/tokenized_dataset"  # Replace with your path
log_file = "/home/aleksia/SIMPLe/removed_entries.json"  # Replace with your path

# Start from the existing log, if any. Once the files are gone a re-run finds
# nothing to remove, and writing only this run's (empty) list would wipe the
# record of the earlier deletions. Delete the log file to start a fresh record.
# Loading happens before any deletion so a corrupt log fails fast.
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        previously_removed = json.load(f)
    if not isinstance(previously_removed, list):
        raise ValueError(f"Expected a JSON list in {log_file}")
else:
    previously_removed = []

removed_entries = []

try:
    # Process each split (train, validation, test)
    for split in ["train", "validation", "test"]:
        split_dir = os.path.join(tokenized_dir, split)

        # Process files in the split directory
        for filename in tqdm(os.listdir(split_dir), desc=f"Processing {split} split"):
            if not filename.endswith(".json"):
                continue

            file_path = os.path.join(split_dir, filename)

            with open(file_path, "r") as f:
                data = json.load(f)

            # Only a dict-valued "long" summary can hold chunk keys
            long_summary = data.get("long") if isinstance(data, dict) else None
            if not isinstance(long_summary, dict):
                continue

            chunk_keys = [key for key in long_summary if key.startswith("chunk_")]
            if len(chunk_keys) > 1:  # More than one chunk
                os.remove(file_path)  # Remove the file
                # Logged only after the removal succeeded; the split is
                # recorded because the same filename can occur in several splits.
                removed_entries.append(
                    {"split": split, "file": filename, "num_chunks": len(chunk_keys)}
                )
finally:
    # Save removed entries log even when the loop fails part-way, so files
    # that were already deleted are never left unrecorded.
    with open(log_file, "w") as f:
        json.dump(previously_removed + removed_entries, f, indent=4)

print(f"Removed entries saved to {log_file}")
print("In-place filtering complete.")


Processing train split: 100%|██████████| 3177/3177 [01:17<00:00, 41.22it/s]
Processing validation split: 100%|██████████| 454/454 [00:12<00:00, 36.65it/s]
Processing test split: 100%|██████████| 908/908 [00:19<00:00, 47.06it/s]

Removed entries saved to /home/aleksia/SIMPLe/removed_entries.json
In-place filtering complete.





In [20]:
import os
import json
from tqdm import tqdm

# For one split, delete the label file and its matching .h5 data file for every
# entry whose "long" summary has more than one chunk, log what was deleted, and
# report how many label files remain.
# WARNING: destructive - matching files are permanently removed from disk.

# Configuration
lex_sum_data_dir = "/home/aleksia/lex_sum_data"  # Replace with your path
split = "train"  # Specify the split (train, validation, test)
log_file = f"/home/aleksia/SIMPLe/removed_{split}_entries.json"  # Log file for removed entries

# Paths for data and labels
data_dir = os.path.join(lex_sum_data_dir, "data", split)
labels_dir = os.path.join(lex_sum_data_dir, "labels", split)

# Start from the existing log, if any. A re-run after the files are already
# gone removes nothing, and writing only this run's (empty) list would wipe the
# record of the earlier deletions. Delete the log file to start a fresh record.
# Loading happens before any deletion so a corrupt log fails fast.
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        previously_removed = json.load(f)
    if not isinstance(previously_removed, list):
        raise ValueError(f"Expected a JSON list in {log_file}")
else:
    previously_removed = []

# Create log for removed entries
removed_entries = []

try:
    # Process each label file in the split
    for filename in tqdm(os.listdir(labels_dir), desc=f"Processing {split} split"):
        if not filename.endswith(".json"):
            continue

        label_path = os.path.join(labels_dir, filename)
        # Swap only the extension; str.replace would rewrite every ".json"
        # occurrence in the name.
        data_path = os.path.join(data_dir, os.path.splitext(filename)[0] + ".h5")

        with open(label_path, "r") as f:
            labels = json.load(f)

        # Check if 'long' exists and has multiple chunks
        long_summary = labels.get("long") if isinstance(labels, dict) else None
        if not isinstance(long_summary, dict) or "chunks" not in long_summary:
            continue

        # len() works whether "chunks" is stored as a dict or as a list
        num_chunks = len(long_summary["chunks"])
        if num_chunks > 1:  # More than one chunk
            # Remove the data file first and the label last: if this is
            # interrupted in between, the label is still there and a re-run
            # picks the entry up again instead of leaving an orphaned .h5.
            if os.path.exists(data_path):
                os.remove(data_path)
            os.remove(label_path)
            removed_entries.append({"file": filename, "num_chunks": num_chunks})
finally:
    # Save removed entries log even when the loop fails part-way, so files
    # that were already deleted are never left unrecorded.
    with open(log_file, "w") as f:
        json.dump(previously_removed + removed_entries, f, indent=4)

# Count remaining data points
remaining_files = [f for f in os.listdir(labels_dir) if f.endswith(".json")]

print(f"Removed entries saved to {log_file}")
print("In-place filtering complete.")
print(f"Remaining data points in {split} split: {len(remaining_files)}")


Processing train split: 100%|██████████| 1484/1484 [00:00<00:00, 4249.87it/s]

Removed entries saved to /home/aleksia/SIMPLe/removed_train_entries.json
In-place filtering complete.
Remaining data points in train split: 1484



