In [8]:
import pandas as pd
import glob
import tensorflow as tf

# Glob pattern for the input CSV files (relative to the notebook's working directory)
csv_files_path = '../csv_files/*.csv'

# Tunable fractions: share of all rows kept in the sample, and share of the
# sample that goes to the training split (the remainder is the test split).
SAMPLE_FRAC = 0.01
TRAIN_FRAC = 0.8

# glob.glob returns matches in arbitrary order. Sort them so the concatenated
# row order -- and therefore the seeded sample drawn below -- is the same on
# every run and every machine.
csv_files = sorted(glob.glob(csv_files_path))
if not csv_files:
    # Fail with the offending pattern instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files matched pattern {csv_files_path!r}")

# Load and concatenate all CSV files into one frame with a fresh 0..N-1 index
all_data = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Randomly sample SAMPLE_FRAC (1%) of the rows; the fixed seed keeps the draw repeatable
sampled_df = all_data.sample(frac=SAMPLE_FRAC, random_state=42)

# Save the sample to a CSV file to reuse for consistent comparisons.
# NOTE(review): the file name says "10_percent" but the sample is 1% of the data.
# The name is left unchanged here in case other cells or scripts read this path;
# rename it together with its readers.
sampled_df.to_csv('sampled_data_10_percent.csv', index=False)

# Convert the sampled frame to a TensorFlow dataset (one element per row,
# keyed by column name)
sampled_data = tf.data.Dataset.from_tensor_slices(dict(sampled_df))

# Split by position: the first TRAIN_FRAC of the sampled rows form the training
# set, the rest form the test set. The rows are already in random order because
# of the sampling step above.
train_size = int(TRAIN_FRAC * len(sampled_df))
train_data = sampled_data.take(train_size)
test_data = sampled_data.skip(train_size)

# Print dataset sizes to confirm
print(f"Total data size: {len(all_data)}")
print(f"Sampled data size ({SAMPLE_FRAC:.0%}): {len(sampled_data)}")
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")


Total data size: 7999973
Sampled data size (10%): 80000
Training set size: 64000
Test set size: 16000


In [6]:
import os
# Show the directory that relative paths in this notebook are resolved against
working_dir = os.getcwd()
print("Current working directory:", working_dir)


Current working directory: /home/diego2/fem/Accelerated Contact Detection
