In [3]:
import requests

In [7]:
# Define file paths
input_path = 'kosarak.dat'     # Path to your input dataset file
output_path = 'sparse_data.arff'  # Path to save the ARFF file

# Read the dataset from the kosarak.dat file
with open(input_path, 'r') as file:
    lines = file.readlines()

# Convert the dataset into a list of rows of integers (blank lines are skipped)
dataset = [list(map(int, line.split())) for line in lines if line.strip()]

# Declare one numeric attribute per row position. The count is derived from the
# longest row so that every index written in the data section refers to a
# declared attribute (a fixed count would break on longer rows).
num_attributes = max((len(row) for row in dataset), default=0)

# Build the ARFF header
lines_arff = ['@relation sparse_data']
lines_arff.extend(f'@attribute attr{i} numeric' for i in range(1, num_attributes + 1))
lines_arff.append('@data')

# Build the data section. ARFF sparse instances are written as
# "{index value, index value, ...}" with 0-based attribute indices; entries
# whose value is 0 are omitted because 0 is the implicit sparse default.
for row in dataset:
    sparse_row = ', '.join(
        f"{index} {value}" for index, value in enumerate(row) if value != 0
    )
    lines_arff.append('{' + sparse_row + '}')

# Save these lines to the ARFF file
with open(output_path, 'w') as arff_file:
    arff_file.writelines(line + '\n' for line in lines_arff)

print(f"Sparse ARFF file created at {output_path}")


Sparse ARFF file created at sparse_data.arff


In [1]:
import time

In [5]:
def read_input_file(input_path):
    """Read a whitespace-separated transaction file.

    Args:
        input_path: Path to a text file with one transaction per line, items
            separated by whitespace.

    Returns:
        A list of transactions, each a list of item tokens (strings). Blank or
        whitespace-only lines are skipped so they do not turn into empty
        transactions.
    """
    with open(input_path, 'r') as file:
        # str.split() with no arguments already ignores leading/trailing
        # whitespace and returns [] for a blank line, which is filtered out.
        tokenized = (line.split() for line in file)
        return [items for items in tokenized if items]

def create_sparse_arff(transactions, output_path):
    """Write transactions to ``output_path`` as a sparse ARFF file.

    Each distinct item becomes a nominal ``{0, 1}`` attribute named after the
    item token; attributes are ordered by sorting the tokens. Each transaction
    becomes one sparse instance listing ``<attribute index> 1`` for every item
    it contains.

    Args:
        transactions: A re-iterable collection (e.g. a list) of transactions,
            each an iterable of hashable, sortable item tokens. It is
            traversed twice: once to collect items, once to write instances.
        output_path: Destination path; the file is overwritten.
    """
    # Collect unique items and assign each a 0-based attribute index
    items = sorted({item for transaction in transactions for item in transaction})
    item_to_index = {item: idx for idx, item in enumerate(items)}

    with open(output_path, 'w') as arff_file:
        # ARFF header
        arff_file.write("@RELATION kosarak\n\n")
        arff_file.writelines(f"@ATTRIBUTE {item} {{0, 1}}\n" for item in items)
        arff_file.write("\n@DATA\n")

        # Write data in sparse format
        for transaction in transactions:
            # De-duplicate through a set: an item repeated inside one
            # transaction must not emit the same attribute index twice,
            # since sparse indices have to be strictly increasing.
            indices = sorted({item_to_index[item] for item in transaction})
            arff_file.write("{" + ",".join(f"{idx} 1" for idx in indices) + "}\n")

def convert_to_sparse_arff(input_path, output_path):
    """Load the transactions stored at ``input_path`` and write them to
    ``output_path`` as a sparse ARFF file."""
    create_sparse_arff(read_input_file(input_path), output_path)

# Example usage: convert the transaction file "kosarak.dat" into the sparse
# ARFF file "kosarak.arff". Both paths are relative to the working directory.
input_file = "kosarak.dat"
output_file = "kosarak.arff"
convert_to_sparse_arff(input_file, output_file)


In [4]:
# Repeats the kosarak.dat -> kosarak.arff conversion with the same paths as the
# example-usage call. NOTE(review): the recorded output of this cell is a
# KeyboardInterrupt, i.e. this run was stopped before it finished.
input_file = "kosarak.dat"
output_file = "kosarak.arff"
convert_to_sparse_arff(input_file, output_file)

KeyboardInterrupt: 