<a href="https://colab.research.google.com/github/avkornaev/Sleep_Stages/blob/main/Preproc_file_saver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import gzip
import os
import shutil

import pandas as pd

def convert_xlsx_to_csv(directory):
    """
    Convert every .xlsx workbook in ``directory`` to a .csv file beside it.

    For each workbook (read with ``pd.read_excel`` and its default arguments):
    - the first column is dropped,
    - the last column is renamed to 'label' and stored as strings,
    - every column in between is cast to float32.

    The CSV name is the workbook name with only its final extension swapped
    for ``.csv``. File sizes and a five-row preview are printed for each file.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        None. A message is printed and nothing is done when ``directory``
        is not an existing directory.

    Raises:
        ValueError: Propagated from the float32 cast when a feature column
            holds values that cannot be converted to float.
    """
    # isdir (rather than exists) so a path that points at a regular file is
    # rejected here instead of failing later inside os.listdir.
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    # Sorted so the processing order is the same on every run.
    for filename in sorted(os.listdir(directory)):
        stem, ext = os.path.splitext(filename)

        # Match the extension case-insensitively and skip '~$...' files,
        # which Excel creates as lock files while a workbook is open.
        if ext.lower() != ".xlsx" or filename.startswith("~$"):
            continue

        file_path = os.path.join(directory, filename)

        # Announce the file before reading so a failing workbook is identifiable.
        original_size = os.path.getsize(file_path) / 1024  # Size in KB
        print(f"\nProcessing file: {filename}")
        print(f"Original file size: {original_size:.2f} KB")

        df = pd.read_excel(file_path)

        # One dropped column plus one label column is the minimum layout.
        if df.shape[1] < 2:
            print(
                f"Skipping '{filename}': expected at least 2 columns, "
                f"found {df.shape[1]}."
            )
            continue

        # Select by position so a feature column that already happens to be
        # called 'label' cannot collide with the renamed last column.
        features = df.iloc[:, 1:-1].astype("float32")
        labels = df.iloc[:, -1].astype(str).rename("label")
        converted = pd.concat([features, labels], axis=1)

        # Swap only the final extension; str.replace would also rewrite any
        # earlier '.xlsx' occurring inside the file name.
        csv_filename = stem + ".csv"
        csv_path = os.path.join(directory, csv_filename)
        converted.to_csv(csv_path, index=False)

        converted_size = os.path.getsize(csv_path) / 1024  # Size in KB
        print(f"Converted file size: {converted_size:.2f} KB")

        print("\nSample data (first 5 rows):")
        print(converted.head())

        print(f"Converted '{filename}' to '{csv_filename}'")


def compress_csv_files(directory):
    """
    Gzip every .csv file in ``directory``, writing ``<name>.csv.gz`` beside it.

    Each file is streamed byte-for-byte into the archive, so the decompressed
    content is identical to the source CSV. (Parsing with pandas and writing
    the frame back out would re-infer dtypes and missing values and could
    alter the data.) The source CSV files are left in place. File sizes are
    printed for each file.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        None. A message is printed and nothing is done when ``directory``
        is not an existing directory.
    """
    # isdir (rather than exists) so a path that points at a regular file is
    # rejected here instead of failing later inside os.listdir.
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    # Sorted so the processing order is the same on every run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)

        original_size = os.path.getsize(file_path) / 1024  # Size in KB
        print(f"\nProcessing file: {filename}")
        print(f"Original file size: {original_size:.2f} KB")

        compressed_filename = filename + ".gz"
        compressed_path = os.path.join(directory, compressed_filename)

        # Stream in chunks: constant memory use and an exact copy of the bytes.
        with open(file_path, "rb") as src, gzip.open(compressed_path, "wb") as dst:
            shutil.copyfileobj(src, dst)

        compressed_size = os.path.getsize(compressed_path) / 1024  # Size in KB
        print(f"Compressed file size: {compressed_size:.2f} KB")

        print(f"Compressed '{filename}' to '{compressed_filename}'")


In [8]:
# Example usage: convert every .xlsx workbook found in `directory` to .csv.
# `directory` is a notebook-level variable; the compression cell below reuses it.
directory = "/content/Data"  # Replace with your directory path
convert_xlsx_to_csv(directory)


Processing file: Vol_02.xlsx
Original file size: 9731.33 KB
Converted file size: 9963.86 KB

Sample data (first 5 rows):
   1 LDF    1 T  1 A365  1 A460    1ANADH  2 LDF  2 T  2 A365  2 A460  2ANADH  \
0    0.0  34.41   115.0    47.0  0.408696    NaN  NaN     NaN     NaN     NaN   
1    0.0  34.41   115.0    47.0  0.408696    NaN  NaN     NaN     NaN     NaN   
2    0.0  34.41   115.0    47.0  0.408696    NaN  NaN     NaN     NaN     NaN   
3    0.0  34.41   115.0    47.0  0.408696    NaN  NaN     NaN     NaN     NaN   
4    3.4  34.41   115.0    47.0  0.408696    NaN  NaN     NaN     NaN     NaN   

   3 LDF    3 T  3 A365  3 A460  3ANADH  4 LDF  4 T  4 A365  4 A460 label  
0    5.3  35.68   125.0    81.0   0.648    0.0  0.0     0.0     0.0     R  
1    5.3  35.68   125.0    81.0   0.648    0.0  0.0     0.0     0.0     R  
2    5.3  35.68   125.0    81.0   0.648    0.0  0.0     0.0     0.0     R  
3    5.3  35.68   125.0    81.0   0.648    0.0  0.0     0.0     0.0     R  
4    5.4  3

In [9]:
# Gzip every .csv in `directory` (the variable assigned in the previous cell).
compress_csv_files(directory)


Processing file: Vol_02.csv
Original file size: 9963.86 KB
Compressed file size: 308.85 KB
Compressed 'Vol_02.csv' to 'Vol_02.csv.gz'

Processing file: Vol_03_2.csv
Original file size: 41021.08 KB
Compressed file size: 1563.20 KB
Compressed 'Vol_03_2.csv' to 'Vol_03_2.csv.gz'

Processing file: Vol_01.csv
Original file size: 26029.02 KB
Compressed file size: 825.19 KB
Compressed 'Vol_01.csv' to 'Vol_01.csv.gz'

Processing file: Vol_03_1.csv
Original file size: 47313.65 KB
Compressed file size: 1667.85 KB
Compressed 'Vol_03_1.csv' to 'Vol_03_1.csv.gz'

Processing file: Vol_03_3.csv
Original file size: 45176.18 KB
Compressed file size: 1746.27 KB
Compressed 'Vol_03_3.csv' to 'Vol_03_3.csv.gz'
