# Preprocessing for seizure prediction data

## Making 5 columns on the basis of the sampling frequency of the dataset

In [None]:
import pandas as pd
from tqdm import tqdm

def process_input_file(input_file, output_file, window_size=5):
    """Expand every column of a CSV into ``window_size`` sliding-window columns.

    For each input column ``col`` the output holds the columns
    ``col_1 .. col_<window_size>``. Output row ``i`` carries input rows
    ``i .. i + window_size - 1`` of that column, so the output has
    ``len(input) - window_size + 1`` rows (zero rows when the input is
    shorter than one window).

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (written without the index).
        window_size: Number of consecutive rows folded into one output row.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Read the input file
    df = pd.read_csv(input_file)

    # Number of complete windows; never negative for short inputs
    n_windows = max(len(df) - window_size + 1, 0)

    # Column j of a window is simply the source column shifted by j - 1 rows,
    # so each output column is one contiguous slice instead of a per-row loop.
    output_data = {}
    # The context manager closes the progress bar even if a step fails
    with tqdm(total=len(df.columns) * window_size, desc='Processing') as pbar:
        for col in df.columns:
            values = df[col].to_numpy()
            for j in range(1, window_size + 1):
                output_data[f'{col}_{j}'] = values[j - 1:j - 1 + n_windows]
                pbar.update(1)

    # Convert the dictionary to a DataFrame and save it to a new CSV file
    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_file, index=False)

# Input CSV to expand and destination CSV for the windowed columns
input_file = 'C:/Users/USER/Desktop/4th_sem/DataSet/chbmit_ictal_23channels_data.csv'  # Replace with your input file path
output_file = 'C:/Users/USER/Desktop/4th_sem/Processed_DataSet_1/chbmit_ictal_23channels_data_making_5_columns.csv'  # Replace with your desired output file path

# Build the sliding-window columns (default window_size=5) and write them to output_file
process_input_file(input_file, output_file)

print(f"Processing complete. Output saved to {output_file}")


## Multiplying the values by 2^30 so that they fit our desired length when converted to binary

In [None]:
import pandas as pd

def multiply_values_by_power_and_save(file_path, output_file, power):
    """Scale every value of a CSV file by ``2 ** power`` and save the result.

    Args:
        file_path: Path of the CSV file to read.
        output_file: Path of the CSV file to write (written without the index).
        power: Exponent of the power-of-two scale factor.
    """
    scale = 2**power

    # Load the table and apply the scale factor to every cell
    scaled = pd.read_csv(file_path).mul(scale)

    # Persist the scaled table
    scaled.to_csv(output_file, index=False)

# Input: the windowed CSV from the previous step; output: the scaled CSV
input_file = 'C:/Users/USER/Desktop/4th_sem/Processed_DataSet_1/chbmit_ictal_23channels_data_making_5_columns.csv'  # Replace with the path to your CSV file
output_file = 'C:/Users/USER/Desktop/4th_sem/Processed_Dataset_2/ictal_2^30.csv'  # Replace with the desired output file name

# Exponent of the scale factor: every value is multiplied by 2**power (here 2**30)
power = 30

# Multiply all values and save them to the new file
multiply_values_by_power_and_save(input_file, output_file, power)

print(f"Values multiplied and saved to {output_file}.")

## Considering the integer part of the values

In [None]:
import pandas as pd
from tqdm import tqdm

def extract_integer_part(input_file, output_file):
    """Truncate every value of a CSV file to its integer part and save it.

    Each non-missing cell is passed through ``int()``, which truncates toward
    zero. Missing values (NaN) are left untouched.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (written without the index).
    """
    # Read the CSV file
    df = pd.read_csv(input_file)

    def _truncate(value):
        # Keep NaN as-is; int(NaN) would raise
        return int(value) if pd.notna(value) else value

    # Convert column by column (Series.map) instead of the deprecated
    # DataFrame.applymap; this also lets the bar advance while work is done.
    # The context manager closes the bar even if a conversion fails.
    converted = {}
    with tqdm(total=df.size, desc='Processing') as progress_bar:
        for column in df.columns:
            converted[column] = df[column].map(_truncate)
            progress_bar.update(len(df))

    # Save the DataFrame with only integer parts to a new CSV file
    df_integer = pd.DataFrame(converted)
    df_integer.to_csv(output_file, index=False)

# Input: the CSV scaled by 2^30; output: the same table truncated to integers
input_file = 'C:/Users/USER/Desktop/4th_sem/Processed_Dataset_2/ictal_2^30.csv'  # Replace with the path to your CSV file
output_file = 'C:/Users/USER/Desktop/4th_sem/Processed_dataset_3/ictal_integer.csv'  # Replace with the desired output file path

# Truncate every value to its integer part and write the new CSV file
extract_integer_part(input_file, output_file)

# Report where the result was written
print(f"Integer parts saved to: {output_file}")


## Converting them to binary representation

In [None]:
import pandas as pd
import struct
import math
from pathlib import Path
from tqdm import tqdm

def float_to_binary(value):
    """Encode a number as a sign bit, a fixed exponent field and fraction bits.

    The layout is: 1 sign bit, the constant exponent 15 written as 5 bits
    ('01111'), then ``int(abs(value) / 2**15 * 2**22)`` in binary, left-padded
    with zeros to at least 22 digits. NaN and zero are both encoded as 31
    zeros.

    NOTE(review): this is a custom fixed-exponent layout, not a real IEEE 754
    encoding. The NaN/zero placeholder is 31 characters while other values are
    28 characters (longer once ``abs(value) >= 2**15``, because the fraction
    is padded but never truncated) -- confirm the intended width.

    Args:
        value: Number to encode.

    Returns:
        The bit string described above.
    """
    # NaN and zero share one all-zero placeholder
    if math.isnan(value) or value == 0.0:
        return '0' * 31

    fixed_exponent = 15  # the same exponent is used for every number
    sign_bit = '0' if value >= 0 else '1'

    # Scale the magnitude by the fixed exponent, then keep 22 fractional bits
    fraction = abs(value) / (2 ** fixed_exponent)
    fraction_field = format(int(fraction * (2 ** 22)), '022b')
    exponent_field = format(fixed_exponent, '05b')

    return sign_bit + exponent_field + fraction_field

def convert_to_binary(input_file, output_file, rows_to_process=5000):
    """Convert the first rows of a CSV file to bit strings and save them.

    Every cell is encoded with ``float_to_binary``. The output columns are
    labelled by position (0, 1, 2, ...), not by the input column names.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (written without the index).
        rows_to_process: Maximum number of data rows to read and convert. If
            the file holds fewer rows, only those rows are converted.
    """
    df = pd.read_csv(input_file, nrows=rows_to_process)
    # Iterate over the rows actually read: the file may be shorter than
    # rows_to_process, and indexing past the end would raise IndexError.
    total_rows, total_cols = df.shape

    # Collect plain lists and build the DataFrame once; growing a DataFrame
    # cell by cell re-allocates it over and over.
    binary_rows = []
    for row in tqdm(df.itertuples(index=False, name=None), desc='Converting', total=total_rows):
        binary_rows.append([float_to_binary(cell_value) for cell_value in row])

    binary_df = pd.DataFrame(binary_rows, columns=range(total_cols))

    # Save the DataFrame to a new CSV file
    binary_df.to_csv(output_file, index=False)

# Input CSV to encode and destination CSV for the bit strings
input_file = 'C:/Users/USER/Desktop/4th_sem/Processed_Dataset_2/preictal_2^30.csv'  # Replace with the path to your CSV file
output_file = 'C:/Users/USER/Desktop/4th_sem/Processed_dataset_3/binary_output_3rd.csv'  # Replace with the desired output file

# Number of leading data rows to read and convert (e.g., 5000)
rows_to_process = 5000

# Convert each cell value and save the result to a single CSV file
convert_to_binary(input_file, output_file, rows_to_process)

# Report where the result was written
print(f"Binary representations for the first {rows_to_process} rows saved to: {output_file}")


## Representing in 2's complement form for handling negative values

In [None]:
import csv

def convert_to_24_bit_signed(binary_str):
    """Convert a signed binary literal to a 24-bit two's-complement string.

    Args:
        binary_str: Binary digits, optionally prefixed with "-" for a negative
            number (for example "101" or "-101").

    Returns:
        A 24-character bit string. Non-negative input is left-padded with
        zeros (only its last 24 characters are kept when it is longer).
        Negative input is encoded in two's complement, e.g. "-1" becomes 24
        ones.

    Raises:
        ValueError: If the digits after a leading "-" are not a valid binary
            number.
    """
    if binary_str.startswith("-"):
        magnitude = int(binary_str[1:], 2)
        # Two's complement must be taken over the full 24-bit width. Masking
        # the negated value does exactly that; inverting only the unpadded
        # digits leaves the high bits wrong.
        return format((-magnitude) & 0xFFFFFF, "024b")

    # Non-negative number: sign bit "0" followed by the zero-padded magnitude
    padded_binary = f"0{binary_str:0>23}"
    return padded_binary[-24:]

def main():
    """Rewrite the binary CSV with every cell in 24-bit signed form.

    The header row is copied unchanged; each cell of every later row is passed
    through ``convert_to_24_bit_signed``.
    """
    source_path = "C:/Users/USER/Desktop/4th_sem/Processed_Dataset_4/ictal_binary.csv"
    target_path = "C:/Users/USER/Desktop/4th_sem/Processed_Dataset_5/ictal_2s_binary.csv"

    with open(source_path, 'r') as src, open(target_path, 'w', newline='') as dst:
        rows = csv.reader(src)
        out = csv.writer(dst)

        # Column names pass through untouched
        out.writerow(next(rows))

        # Stream the remaining rows, converting cell by cell
        out.writerows(
            [convert_to_24_bit_signed(cell) for cell in record]
            for record in rows
        )

# Run the 2's complement conversion only when this code is executed directly
if __name__ == "__main__":
    main()


## Representing as six-bit codons

In [None]:
import pandas as pd
import os
from tqdm import tqdm

def split_24_bits(input_string):
    """Cut a 24-character bit string into four consecutive 6-character parts.

    Args:
        input_string: Sequence of exactly 24 characters.

    Returns:
        A list with the four 6-character slices, in order.

    Raises:
        ValueError: If the input is not exactly 24 characters long.
    """
    if len(input_string) == 24:
        # Start offsets of the four 6-bit groups
        return [input_string[start:start + 6] for start in (0, 6, 12, 18)]

    raise ValueError("Input string must be 24 bits long")

def ensure_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok avoids the race between an existence check and the creation
    os.makedirs(directory, exist_ok=True)

def process_csv(input_file, output_directory):
    """Split every 24-bit cell of a CSV into four 6-bit columns and save as Excel.

    Each input column ``col`` becomes ``col_0 .. col_3``, holding the four
    consecutive 6-character parts returned by ``split_24_bits``. The result is
    written to ``<output_directory>/<input name>_split_modified.xlsx``.

    Args:
        input_file: Path of the CSV file to read; every cell must be a
            24-character bit string.
        output_directory: Directory for the Excel file; created if missing.

    Raises:
        ValueError: If a cell is not exactly 24 characters long.
    """
    # Read everything as text so leading zeros of the bit strings survive
    df = pd.read_csv(input_file, dtype='object')

    new_columns = {}
    for column in tqdm(df.columns, desc="Processing Columns"):
        # Split each 24-bit value of the column into its four 6-bit parts
        split_values = df[column].apply(split_24_bits)

        for i in range(4):
            # zfill keeps the parts as zero-padded 6-character text. The
            # previous f'{x:06}' raises ValueError for str before Python 3.10.
            new_columns[f"{column}_{i}"] = [item[i].zfill(6) for item in split_values]

    # Keep dtype='object' so the codons stay strings
    new_df = pd.DataFrame(new_columns, dtype='object')

    # Get the input file name without extension
    input_file_name = os.path.splitext(os.path.basename(input_file))[0]

    ensure_directory(output_directory)
    output_file = os.path.join(output_directory, f"{input_file_name}_split_modified.xlsx")

    # Save the modified DataFrame to a new Excel file
    new_df.to_excel(output_file, index=False, engine='xlsxwriter')

    print("Task completed successfully!")

# Run the 6-bit codon split only when this code is executed directly
if __name__ == "__main__":
    input_file = "/content/drive/MyDrive/Seizer Detection/Process_Dataset_5/prectical_2s_binary_5000.csv"  # Replace with the actual input file path
    output_directory = "/content/drive/MyDrive/Seizer Detection/Process_Dataset_6"  # Specify the output directory
    process_csv(input_file, output_directory)


## Making 8-bit values by appending two zeros (00). The amino acid conversion formula used previously works on 8-bit values, so the 6-bit codons must be converted to 8 bits before applying it

In [None]:
import csv
import os
from tqdm import tqdm

def ensure_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok avoids the race between an existence check and the creation
    os.makedirs(directory, exist_ok=True)

def pad_to_8_bits(binary_str):
    """Left-pad a bit string with zeros to a minimum width of 8 characters.

    Input that is already 8 characters or longer is returned unchanged.
    """
    return format(binary_str, '0>8')

def convert_to_8_bits(binary_str):
    """Append two '0' bits to a bit string and pad the result to 8 bits.

    A 6-bit codon therefore becomes its value shifted left by two positions.
    """
    with_low_bits = binary_str + '00'  # two trailing zero bits
    return pad_to_8_bits(with_low_bits)

def main():
    """Convert every 6-bit codon of the input CSV to an 8-bit string.

    The header row is copied to the output unchanged, the second row is
    skipped, and every cell of the remaining rows is passed through
    ``convert_to_8_bits``.

    NOTE(review): previously the file was rewound after the two rows had been
    skipped, so the header was converted like data (gaining a "00" suffix) and
    the second row was not skipped. Copying the header verbatim mirrors the
    2's complement step and keeps column names for readers that expect a
    header. Confirm that dropping the second row is really intended.
    """
    input_file = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_6/ictal_processed/ictal_2s_binary_split_modified.csv"
    output_file = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_7/ictal_processed/ictal_8_bit_codons.csv"

    ensure_directory(os.path.dirname(output_file))

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        # First pass: count rows only, so the progress bar has a total
        total_rows = sum(1 for _ in csv.reader(infile))
        infile.seek(0)  # Rewind for the real pass

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Column names are not codons: copy them through without conversion
        header = next(reader, None)
        if header is not None:
            writer.writerow(header)
        next(reader, None)  # Skip the second row

        # Two rows were consumed above, so they do not count as progress
        data_rows = max(total_rows - 2, 0)
        for row in tqdm(reader, total=data_rows, desc="Processing Rows"):
            converted_row = [convert_to_8_bits(value) for value in row]
            writer.writerow(converted_row)

    print("Task completed successfully!")

# Run the 8-bit conversion only when this code is executed directly
if __name__ == "__main__":
    main()


## Converting to decimal integers for applying the amino acid formula

In [None]:
import pandas as pd
from tqdm import tqdm

# Path of the 8-bit codon CSV produced by the previous step; replace with your own path
file_path = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_7/ictal_processed/ictal_8_bit_codons.csv"

# Read the CSV file as text (dtype='object' keeps leading zeros of the bit strings).
# skiprows=0 skips nothing, so the first row of the file is taken as the header.
df = pd.read_csv(file_path, dtype='object', skiprows=0)

def binary_to_decimal(binary_str):
    """Return the integer value of a base-2 digit string."""
    decimal_value = int(binary_str, base=2)
    return decimal_value

# Create a new DataFrame for the decimal values
df_decimal = pd.DataFrame()

# Convert one column at a time; tqdm shows progress per column
for column in tqdm(df.columns, desc="Converting to Decimal"):
    # Parse each bit string of the column as a base-2 integer
    df_decimal[column] = df[column].apply(binary_to_decimal)

# Save the new DataFrame to a CSV file (without the index).
# NOTE(review): nothing here creates the output directory -- presumably it must already exist.
output_csv_path = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_8/ictal_processed/ictal_integer_codons.csv"
df_decimal.to_csv(output_csv_path, index=False)

print(f"Decimal values saved to {output_csv_path}")


## Mapping to amino acid sequence

In [None]:
import pandas as pd
from tqdm import tqdm

# Codon values that map to a class index by exact match
_PROT_EXACT_CODES = {
    0: 1, 4: 1,
    8: 2, 12: 2,
    48: 4, 52: 4,
    32: 5, 36: 5,
    40: 6,
    112: 9, 116: 9,
    120: 10, 124: 10,
    192: 12, 196: 12, 204: 12,
    240: 14, 244: 14,
    248: 15, 252: 15,
    224: 16, 228: 16,
    232: 17, 236: 17,
    176: 20, 180: 20,
    184: 21, 188: 21,
    44: 23, 56: 23, 60: 23, 200: 23,
}

# Inclusive (low, high, class index) intervals; they do not overlap each other
# or any of the exact values above.
_PROT_RANGE_CODES = (
    (16, 28, 3),
    (64, 76, 7),
    (80, 92, 8),
    (96, 108, 11),
    (208, 220, 13),
    (128, 140, 18),
    (144, 156, 19),
    (160, 172, 22),
)


def prot(p):
    """Map a decimal codon value to its class index (1-23).

    Args:
        p: Numeric codon value.

    Returns:
        The class index for ``p``, or 0 when ``p`` matches neither an exact
        value nor one of the inclusive ranges.
    """
    exact = _PROT_EXACT_CODES.get(p)
    if exact is not None:
        return exact

    for low, high, ranged in _PROT_RANGE_CODES:
        if low <= p <= high:
            return ranged

    return 0

def map_values_and_save(input_file, output_file):
    """Apply ``prot`` to every cell of a CSV file and save the mapped table.

    Errors are reported on stdout instead of being raised: a missing input
    file prints a dedicated message, and any other exception prints its text.

    Args:
        input_file: Path of the CSV file with decimal codon values.
        output_file: Path of the CSV file to write (written without the index).
    """
    try:
        codon_frame = pd.read_csv(input_file)

        # Register tqdm's progress_* methods on pandas objects, then map
        # cell by cell with a progress bar and write the result.
        tqdm.pandas(desc="Mapping progress")
        codon_frame.progress_applymap(prot).to_csv(output_file, index=False)

        print(f"Mapping completed. Results saved to {output_file}")

    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
    except Exception as error:
        print(f"An error occurred: {error}")

# Input: decimal codon CSV from the previous step; output: mapped CSV. Replace with your file paths.
input_file_path = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_8/ictal_processed/ictal_integer_codons.csv"
output_file_path = "C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_9/ictal_aminoacid/ictal_aminoacid_sequence.csv"

# Map every value through prot() and save; errors are printed, not raised
map_values_and_save(input_file_path, output_file_path)


## Assigning Target Labels

In [None]:
# Load the mapped sequence file (the path points at the preictal data -- presumably class '0' marks preictal; confirm)
csv_file_path = 'C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_9/prectal_aminoacid/prectical_aminoacid_sequence.csv'
df = pd.read_csv(csv_file_path)
# Add a constant label column; note the label is the string '0', not the integer 0
df['class'] = '0'
output_file_path = 'C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_10/prectial/prectial_data.csv'
# Write the labelled table without the index
df.to_csv(output_file_path, index=False)

print(f"Class column added and file saved at: {output_file_path}")


## Merging the preictal and ictal files

In [None]:
import sklearn
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects
tqdm.pandas()

# Labelled input files for both classes and the destination of the merged file
preictal_file_path = 'C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_10/prectial/prectial_data.csv'
ictal_file_path = 'C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_10/ictal/ictal_data.csv'
output_file_path = 'C:/Users/Administrator/Desktop/khalid/seizer Detection/Processed_Dataset_11/merge_preictal_ictal.csv'

print("Loading Preictal data...")
preictal_df = pd.read_csv(preictal_file_path)

print("Loading Ictal data...")
ictal_df = pd.read_csv(ictal_file_path)

print("Combining datasets...")
# Stack the two tables row-wise. The progress_apply call uses an identity
# function, so it leaves the data unchanged and only drives a progress bar.
combined_df = pd.concat([preictal_df, ictal_df], axis=0).progress_apply(lambda x: x)

print("Shuffling combined dataset...")
# Fixed random_state makes the row order reproducible between runs
shuffled_df = shuffle(combined_df, random_state=42)

# The original row index is dropped when writing
shuffled_df.to_csv(output_file_path, index=False)

print(f"Combined and shuffled dataset saved to {output_file_path}")
