In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [6]:
def split_large_dataset(filepath: str, output_dir: str, output_prefix: str = "split", chunk_size: int = 10000) -> list:
    """
    Split a large CSV dataset into smaller chunks and save them as separate CSV files.

    The input is read chunk by chunk via ``pd.read_csv(..., chunksize=chunk_size)``
    rather than loaded in one go. Each chunk is written without the index
    column to ``<output_dir>/<output_prefix>_<n>.csv``, where ``n`` starts at 1.

    Args:
    - filepath (str): Path to the large dataset.
    - output_dir (str): Directory where the split datasets will be saved. It is
      created (including parents) if missing; an empty string means the
      current working directory.
    - output_prefix (str): Prefix for the output files.
    - chunk_size (int): Number of rows per small dataset.

    Returns:
    - List of filepaths of the saved datasets, in the order they were written.
    """

    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test. The guard is needed because os.makedirs("")
    # raises, while os.path.join("", name) already targets the current directory.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_files = []
    # The context manager closes the input file handle even if writing one of
    # the chunks fails partway through the loop.
    with pd.read_csv(filepath, chunksize=chunk_size) as chunked_data:
        for chunk_num, chunk in enumerate(chunked_data, start=1):
            # Construct the filename for each chunk
            output_filename = os.path.join(output_dir, f"{output_prefix}_{chunk_num}.csv")
            # Save the current chunk to a CSV file
            chunk.to_csv(output_filename, index=False)
            # Add the filepath to the list of output files
            output_files.append(output_filename)

    return output_files

# Source CSV to split and the folder that receives the pieces.
input_filepath = "Datasets/27072022.csv"
output_directory = "Datasets/EditDatasets"

# Run the split with the default file prefix and chunk size.
output_files = split_large_dataset(filepath=input_filepath, output_dir=output_directory)

# Report a header line followed by one written path per line.
print("Files saved:", *output_files, sep="\n")

Files saved:
Datasets/EditDatasets\split_1.csv
Datasets/EditDatasets\split_2.csv
Datasets/EditDatasets\split_3.csv
Datasets/EditDatasets\split_4.csv
Datasets/EditDatasets\split_5.csv
Datasets/EditDatasets\split_6.csv
Datasets/EditDatasets\split_7.csv
Datasets/EditDatasets\split_8.csv
Datasets/EditDatasets\split_9.csv
Datasets/EditDatasets\split_10.csv
Datasets/EditDatasets\split_11.csv
Datasets/EditDatasets\split_12.csv
Datasets/EditDatasets\split_13.csv
Datasets/EditDatasets\split_14.csv
Datasets/EditDatasets\split_15.csv
Datasets/EditDatasets\split_16.csv
Datasets/EditDatasets\split_17.csv
Datasets/EditDatasets\split_18.csv
Datasets/EditDatasets\split_19.csv
Datasets/EditDatasets\split_20.csv
Datasets/EditDatasets\split_21.csv
Datasets/EditDatasets\split_22.csv
Datasets/EditDatasets\split_23.csv
Datasets/EditDatasets\split_24.csv
Datasets/EditDatasets\split_25.csv
Datasets/EditDatasets\split_26.csv
Datasets/EditDatasets\split_27.csv
Datasets/EditDatasets\split_28.csv
Datasets/EditDat