In [32]:
import pyedflib as plib
import numpy as np
import pandas as pd
import os

In [33]:
# Example recording used for a first look at what read_edf returns
edf_file_path = 'sleep-cassette/SC4001E0-PSG.edf'


# read_edf returns three values, unpacked here as the signals, their
# per-signal headers and the file-level header (which later cells index
# with 'annotations').
signals, signals_headers, header = plib.highlevel.read_edf(edf_file=edf_file_path)

In [34]:
# Number of signals loaded from the example recording
len(signals)

7

In [35]:
# Peek at a single value: element 6 of the signal at position 6
signals[6][6]

891.0

In [None]:
# Annotations carried by the header of the example recording loaded above
header['annotations']
# NOTE(review): the value below is undocumented — presumably copied from an
# earlier output; confirm what it refers to or remove it.
# 82680.0

## Generamos los ficheros CSV para entrenar

#### Creamos las parejas de los ficheros

In [36]:
# Count the entries in the sleep-telemetry folder.
# NOTE(review): the pairing and export cells read "sleep-cassette/", not this
# folder — confirm this count is still the one of interest.
len(os.listdir("sleep-telemetry/"))

88

In [37]:
# Build the list of (PSG file, Hypnogram file) tuples for every recording in
# the folder. Files belong to the same recording when the first 7 characters
# of their names match (e.g. 'SC4001E0-PSG.edf' / 'SC4001EC-Hypnogram.edf').

# Path to the folder containing the files
folder_path = "sleep-cassette/"

# Dictionary to store file names grouped by their 7-character prefix
file_groups = {}

# os.listdir returns entries in arbitrary order; sorting makes the result
# reproducible from run to run and from machine to machine.
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    # Only .edf files take part in the pairing
    if extension != ".edf":
        continue
    prefix = stem[:7]
    file_groups.setdefault(prefix, []).append(filename)

# Keep only groups made of exactly one PSG and one Hypnogram file, and decide
# which is which from the file name rather than from the position in the
# listing, so the tuple is always (PSG, Hypnogram).
file_pairs = []
for prefix, names in file_groups.items():
    psg_files = [name for name in names if name.endswith("-PSG.edf")]
    hypnogram_files = [name for name in names if name.endswith("-Hypnogram.edf")]
    if len(names) == 2 and len(psg_files) == 1 and len(hypnogram_files) == 1:
        file_pairs.append((psg_files[0], hypnogram_files[0]))

# Print the list of tuples
print(file_pairs)

[('SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'), ('SC4002E0-PSG.edf', 'SC4002EC-Hypnogram.edf'), ('SC4011E0-PSG.edf', 'SC4011EH-Hypnogram.edf'), ('SC4012E0-PSG.edf', 'SC4012EC-Hypnogram.edf'), ('SC4021E0-PSG.edf', 'SC4021EH-Hypnogram.edf'), ('SC4022E0-PSG.edf', 'SC4022EJ-Hypnogram.edf'), ('SC4031E0-PSG.edf', 'SC4031EC-Hypnogram.edf'), ('SC4032E0-PSG.edf', 'SC4032EP-Hypnogram.edf'), ('SC4041E0-PSG.edf', 'SC4041EC-Hypnogram.edf'), ('SC4042E0-PSG.edf', 'SC4042EC-Hypnogram.edf'), ('SC4051E0-PSG.edf', 'SC4051EC-Hypnogram.edf'), ('SC4052E0-PSG.edf', 'SC4052EC-Hypnogram.edf'), ('SC4061E0-PSG.edf', 'SC4061EC-Hypnogram.edf'), ('SC4062E0-PSG.edf', 'SC4062EC-Hypnogram.edf'), ('SC4071E0-PSG.edf', 'SC4071EC-Hypnogram.edf'), ('SC4072E0-PSG.edf', 'SC4072EH-Hypnogram.edf'), ('SC4081E0-PSG.edf', 'SC4081EC-Hypnogram.edf'), ('SC4082E0-PSG.edf', 'SC4082EP-Hypnogram.edf'), ('SC4091E0-PSG.edf', 'SC4091EC-Hypnogram.edf'), ('SC4092E0-PSG.edf', 'SC4092EC-Hypnogram.edf'), ('SC4101E0-PSG.edf', 'SC4101EC-Hypnogra

#### Recorremos la lista y generamos los ficheros CSV

In [38]:
# Number of (PSG, Hypnogram) pairs found in the folder
len(file_pairs)

151

In [39]:
# Export one CSV per (PSG, Hypnogram) pair: one row per second of recording,
# labelled with the sleep stage annotated for that second.
#
# The indexing below treats the first three signals as having 100 samples per
# second (they are averaged over 100-sample windows) and the remaining four as
# having one sample per second (they are indexed directly by the second).
# NOTE(review): these rates are hard-coded — confirm against signals_headers_psg.
SAMPLES_PER_SECOND = 100
OUTPUT_DIR = 'sleep-cassette-csv'

# DataFrame.to_csv does not create missing directories
os.makedirs(OUTPUT_DIR, exist_ok=True)

for psg_file, hypnogram_file in file_pairs:
    signals_stage = []

    # Construct the full paths to the .edf files
    psg_file_path = os.path.join(folder_path, psg_file)
    hypnogram_file_path = os.path.join(folder_path, hypnogram_file)

    # Load the PSG signals; from the Hypnogram file only the header is used
    signals_psg, signals_headers_psg, header_psg = plib.highlevel.read_edf(edf_file=psg_file_path)
    _, _, header_hypnogram = plib.highlevel.read_edf(edf_file=hypnogram_file_path)

    # Characters 3-4 of the file name are used as the user id, character 5 as the night
    user_id = psg_file[3:5]
    user_night = psg_file[5]
    sex = header_hypnogram['sex'] if 'sex' in header_hypnogram else 'Unknown'
    # Age: last space-separated token of 'patientname' with 'y'/'r' characters stripped
    patient_age = header_hypnogram['patientname'].split(' ')[-1].strip('yr') if 'patientname' in header_hypnogram else 'Unknown'

    # Whole seconds available in every signal; used to avoid reading past the
    # end of the recording when an annotation outlasts it.
    seconds_available = min(
        [len(signals_psg[channel]) // SAMPLES_PER_SECOND for channel in (0, 1, 2)]
        + [len(signals_psg[channel]) for channel in (3, 4, 5, 6)]
    )

    # The last annotation is deliberately skipped, as in the original loop
    for annotation in header_hypnogram['annotations'][:-1]:
        # Each annotation is read as (start in seconds, length in seconds, text)
        start_index = int(annotation[0])
        num_signals = int(annotation[1])
        # The label is the last character of the annotation text
        label = annotation[2][-1]

        # Cover every second of the annotation: the end bound is exclusive, so
        # no "- 1" is needed (it used to drop the final second of each stage).
        stop_index = min(start_index + num_signals, seconds_available)
        for i in range(start_index, stop_index):
            window = slice(i * SAMPLES_PER_SECOND, (i + 1) * SAMPLES_PER_SECOND)

            signals_stage.append({
                'sensor_1': np.mean(signals_psg[0][window]),
                'sensor_2': np.mean(signals_psg[1][window]),
                'sensor_3': np.mean(signals_psg[2][window]),
                'sensor_4': signals_psg[3][i],
                'sensor_5': signals_psg[4][i],
                'sensor_6': signals_psg[5][i],
                'sensor_7': signals_psg[6][i],
                'label': label,
                'user_id': user_id,
                'user_night': user_night,
                'sex': sex,
                'patient_age': patient_age,
            })

    data = pd.DataFrame(signals_stage)

    # Save the DataFrame to a CSV file. The night is part of the file name so
    # that a user's second night no longer overwrites the first one.
    csv_path = os.path.join(OUTPUT_DIR, f'{user_id}-{user_night}-SC-EEG.csv')
    data.to_csv(csv_path, sep=';', index=False)

In [19]:
# Preview only the first few rows: `signals_stage` holds one dict per second
# of the last processed recording, far too many to dump in full.
signals_stage[:5]

[{'sensor_1': -8.405802197802199,
  'sensor_2': 0.4155604395604295,
  'sensor_3': -1.3502612942612942,
  'sensor_4': -482.0,
  'sensor_5': 3.552,
  'sensor_6': 37.20645161290322,
  'sensor_7': 920.0,
  'user_id': '00',
  'user_night': '1',
  'sex': 'Female',
  'patient_age': '33'},
 {'sensor_1': 9.57796336996337,
  'sensor_2': 4.73040293040292,
  'sensor_3': -24.940410256410257,
  'sensor_4': 628.0,
  'sensor_5': 3.49,
  'sensor_6': 37.22795698924731,
  'sensor_7': 899.0,
  'user_id': '00',
  'user_night': '1',
  'sex': 'Female',
  'patient_age': '33'},
 {'sensor_1': -12.39396336996337,
  'sensor_2': -8.03273992673994,
  'sensor_3': 58.88420512820513,
  'sensor_4': -95.0,
  'sensor_5': 3.476,
  'sensor_6': 37.236559139784944,
  'sensor_7': 919.0,
  'user_id': '00',
  'user_night': '1',
  'sex': 'Female',
  'patient_age': '33'},
 {'sensor_1': 14.914578754578756,
  'sensor_2': 4.62003663003662,
  'sensor_3': 47.77165323565323,
  'sensor_4': -663.0,
  'sensor_5': 3.642,
  'sensor_6': 37.2

In [24]:
import pandas as pd

# Column names are not defined explicitly: they come from the keys of the
# dicts stored in `signals_stage`.

# Create the DataFrame from the rows currently held in `signals_stage`
data = pd.DataFrame(signals_stage)

# Save the DataFrame to a semicolon-separated CSV file, without the index column
data.to_csv('prueba.csv', sep=';', index=False)

In [23]:
# Display the DataFrame built above
data

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,user_id,user_night,sex,patient_age
0,-8.405802,0.415560,-1.350261,-482.0,3.552,37.206452,920.0,00,1,Female,33
1,9.577963,4.730403,-24.940410,628.0,3.490,37.227957,899.0,00,1,Female,33
2,-12.393963,-8.032740,58.884205,-95.0,3.476,37.236559,919.0,00,1,Female,33
3,14.914579,4.620037,47.771653,-663.0,3.642,37.223656,912.0,00,1,Female,33
4,8.250139,1.687172,-179.875502,137.0,3.478,37.230108,926.0,00,1,Female,33
...,...,...,...,...,...,...,...,...,...,...,...
79342,-4.288234,0.163158,3.671331,111.0,3.628,37.481720,880.0,00,1,Female,33
79343,11.446857,2.467414,-13.551893,130.0,3.530,37.434409,904.0,00,1,Female,33
79344,4.838681,2.824425,-62.624527,128.0,3.574,37.415054,872.0,00,1,Female,33
79345,-8.468630,4.916586,9.486325,145.0,3.546,37.439785,884.0,00,1,Female,33


In [15]:
# Relative frequency of each label among the rows of `data`
data['label'].value_counts(normalize=True)

W    0.684837
2    0.167347
R    0.068672
1    0.034622
3    0.024552
4    0.015303
?    0.004227
e    0.000440
Name: label, dtype: float64

## Generamos los estadísticos de Cassette

In [None]:
import glob
import os
from matplotlib import pyplot as plt
# Collect one record per hypnogram annotation (user, night, sex, age, length
# and stage) across all Hypnogram files of the sleep-cassette folder.

# Define the pattern to match files ending with '-Hypnogram.edf' in 'sleep-cassette' folder
pattern = 'sleep-cassette/*-Hypnogram.edf'

# glob.glob returns matches in arbitrary order; sort them so the resulting
# records come out in the same order on every run.
file_paths = sorted(glob.glob(pattern))

# Initialize a list to store the extracted data
data = []

# Iterate over the list of file paths
for file_path in file_paths:
    # Characters 3-4 of the file name are used as the user id, character 5 as the night
    basename = os.path.basename(file_path)
    user_id = basename[3:5]
    user_night = basename[5]

    # Read the EDF file; only its header (which carries the annotations) is used
    _, _, header = plib.highlevel.read_edf(edf_file=file_path)

    # Extract the needed information
    sex = header['sex'] if 'sex' in header else 'Unknown'

    # Age: last space-separated token of 'patientname' with 'y'/'r' characters
    # stripped. A missing or non-numeric age becomes None (a missing value in
    # the DataFrame) instead of aborting the whole collection with ValueError.
    age_text = header['patientname'].split(' ')[-1].strip('yr') if 'patientname' in header else ''
    try:
        patient_age = int(age_text)
    except ValueError:
        patient_age = None

    # Fields shared by every annotation of this file, built once per file
    file_fields = {
        'user_id': int(user_id),
        'user_night': int(user_night),
        'sex': sex,
        'patient_age': patient_age,
    }

    # Append one record per annotation: sample[1] is stored as 'num_seq' and
    # the last word of the annotation text sample[2] as 'sleep_stage'
    for sample in header['annotations']:
        data.append({
            **file_fields,
            'num_seq': int(sample[1]),
            'sleep_stage': sample[2].split(' ')[-1]
        })


In [None]:
# Turn the collected annotation records into a DataFrame (one row per
# annotation) and display it
df = pd.DataFrame(data)
df

### Number of unique users

In [None]:
# Distinct user ids per sex, selected with a boolean mask on the 'sex' column
unique_males = df.loc[df['sex'] == 'Male', 'user_id'].nunique()
unique_females = df.loc[df['sex'] == 'Female', 'user_id'].nunique()

# Report the overall number of distinct users followed by the per-sex counts
summary_lines = [
    f"Number of unic users: {len(df['user_id'].unique())}",
    f"Unique Males: {unique_males}",
    f"Unique Females: {unique_females}",
]
print("\n".join(summary_lines))

### Distribution Age and Sex

In [None]:
# Stacked bar chart of the number of distinct users per 10-year age group,
# split by sex. Expects `df` with 'patient_age', 'sex' and 'user_id' columns.

# Step 1: Bin ages into 10-year groups starting at the youngest age.
# Bins are left-closed (right=False), so the last edge must be strictly
# greater than the oldest age: with "+ 10" an oldest age sitting exactly on
# the final edge fell outside every bin and vanished from the plot; "+ 11"
# guarantees one more edge in that case and changes nothing otherwise.
known_ages = df['patient_age'].dropna()  # ignore rows without a usable age
age_bins = list(range(int(known_ages.min()), int(known_ages.max()) + 11, 10))
df['age_group'] = pd.cut(df['patient_age'], bins=age_bins, right=False)

# Step 2: Count unique user_ids within each (age group, sex) combination.
# observed=False keeps empty age groups on the axis and makes the behaviour
# explicit instead of depending on the pandas version's default.
grouped = (
    df.groupby(['age_group', 'sex'], observed=False)['user_id']
    .nunique()
    .unstack(fill_value=0)
)

# One colour per sex column, in column order
colors = ['orange' if sex == 'Female' else 'lightblue' for sex in grouped.columns]

# Step 3: Plot
ax = grouped.plot(kind='bar', stacked=True, figsize=(10, 6), width=0.8, color=colors)
ax.set_title('Density of Unique User IDs by Age Group and Sex')
ax.set_xlabel('Age Group')
ax.set_ylabel('Unique User Count')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Sex')
plt.tight_layout()
plt.show()

In [None]:
# Sum of 'num_seq' for every (user, night, sleep stage) combination
stage_keys = ['user_id', 'user_night', 'sleep_stage']
grouped_df = df.groupby(stage_keys)[['num_seq']].sum()

grouped_df

In [None]:
# Stacked bars: one bar per (user, night), one segment per sleep stage
ax = grouped_df.unstack(level='sleep_stage').plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
)
ax.set_title('Number of Sequences by User, Night, and Sleep Stage')
ax.set_xlabel('User ID, User Night')
ax.set_ylabel('Number of Sequences')
plt.show()

In [None]:
import seaborn as sns

# Reshape to one row per (user, night) and one column per sleep stage,
# filling combinations that never occur with 0
pivot_table = (
    grouped_df
    .reset_index()
    .pivot_table(
        index=['user_id', 'user_night'],
        columns='sleep_stage',
        values='num_seq',
        fill_value=0,
    )
)

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pivot_table, annot=True, cmap='viridis', ax=ax)
ax.set_title('Heatmap of Sequences by User, Night, and Sleep Stage')
ax.set_ylabel('User ID, User Night')
ax.set_xlabel('Sleep Stage')
plt.show()

In [None]:
# One line chart per user: nights on the x axis, one line per sleep stage
for user_id in df['user_id'].unique():
    # Rows of this user only; the 'user_id' level is dropped by xs
    user_df = grouped_df.xs(user_id, level='user_id')
    ax = user_df.unstack(level='sleep_stage').plot(
        kind='line',
        marker='o',
        figsize=(10, 6),
    )
    ax.set_title(f'Sequences Over Nights for User {user_id}')
    ax.set_ylabel('Number of Sequences')
    ax.set_xlabel('User Night')
    ax.legend(title='Sleep Stage')
    plt.show()