In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

data = pd.read_csv('processed_dataset.csv')

# Verify time column (replace 'time' with the actual column name)
time_column = 'time'  # Adjust based on your dataset
patient_column = 'p_num'  # Adjust based on your dataset

unique_times = data.groupby(patient_column)[time_column].unique()

# Check if all patients have the same time intervals
reference_time = unique_times.iloc[0]  # Use the first patient's times as a reference
consistent_times = all((set(reference_time) == set(times)) for times in unique_times)

if consistent_times:
    print("All patients have consistent time intervals.")
else:
    print("Time intervals are inconsistent across patients.")


  data = pd.read_csv('processed_dataset.csv')


All patients have consistent time intervals.


In [2]:
# Identify all columns starting with 'carbs-'
carbs_cols = [col for col in data.columns if col.startswith('carbs-')]

# Drop these columns
data.drop(columns=carbs_cols, inplace=True)

# Confirmation
print(f"Dropped {len(carbs_cols)} columns related to 'carbs'.")
print(data.columns.tolist())  # Verify remaining columns


Dropped 72 columns related to 'carbs'.
['id', 'p_num', 'time', 'bg-5:55', 'bg-5:50', 'bg-5:45', 'bg-5:40', 'bg-5:35', 'bg-5:30', 'bg-5:25', 'bg-5:20', 'bg-5:15', 'bg-5:10', 'bg-5:05', 'bg-5:00', 'bg-4:55', 'bg-4:50', 'bg-4:45', 'bg-4:40', 'bg-4:35', 'bg-4:30', 'bg-4:25', 'bg-4:20', 'bg-4:15', 'bg-4:10', 'bg-4:05', 'bg-4:00', 'bg-3:55', 'bg-3:50', 'bg-3:45', 'bg-3:40', 'bg-3:35', 'bg-3:30', 'bg-3:25', 'bg-3:20', 'bg-3:15', 'bg-3:10', 'bg-3:05', 'bg-3:00', 'bg-2:55', 'bg-2:50', 'bg-2:45', 'bg-2:40', 'bg-2:35', 'bg-2:30', 'bg-2:25', 'bg-2:20', 'bg-2:15', 'bg-2:10', 'bg-2:05', 'bg-2:00', 'bg-1:55', 'bg-1:50', 'bg-1:45', 'bg-1:40', 'bg-1:35', 'bg-1:30', 'bg-1:25', 'bg-1:20', 'bg-1:15', 'bg-1:10', 'bg-1:05', 'bg-1:00', 'bg-0:55', 'bg-0:50', 'bg-0:45', 'bg-0:40', 'bg-0:35', 'bg-0:30', 'bg-0:25', 'bg-0:20', 'bg-0:15', 'bg-0:10', 'bg-0:05', 'bg-0:00', 'insulin-5:55', 'insulin-5:50', 'insulin-5:45', 'insulin-5:40', 'insulin-5:35', 'insulin-5:30', 'insulin-5:25', 'insulin-5:20', 'insulin-5:15', '

In [3]:
summary = data.groupby('p_num')['time'].apply(lambda x: sorted(x.unique()))

summary_df = summary.reset_index()
summary_df.columns = ['p_num', 'unique_times']
summary_df['time_count'] = summary_df['unique_times'].apply(len)
print(summary_df)


  p_num                                       unique_times  time_count
0   p01  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
1   p02  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
2   p03  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
3   p04  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
4   p05  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
5   p06  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
6   p10  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
7   p11  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288
8   p12  [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...         288


In [4]:
# Check for missing values
missing_summary = data.isnull().sum().reset_index()
missing_summary.columns = ['Column', 'Missing Count']
missing_summary['Missing Percentage'] = (missing_summary['Missing Count'] / len(data)) * 100
print(missing_summary.sort_values(by='Missing Percentage', ascending=False))


            Column  Missing Count  Missing Percentage
392  activity-3:30          73664           98.755899
389  activity-3:45          73663           98.754558
386  activity-4:00          73661           98.751877
395  activity-3:15          73660           98.750536
407  activity-2:15          73660           98.750536
..             ...            ...                 ...
74         bg-0:00           2233            2.993619
0               id              0            0.000000
1            p_num              0            0.000000
2             time              0            0.000000
435        bg+1:00              0            0.000000

[436 rows x 3 columns]


### Double check that times match up 

In [5]:
# Check unique times per patient
time_summary = data.groupby('p_num')['time'].apply(lambda x: sorted(x.unique()))
print(time_summary)

# Align times
common_times = set.intersection(*map(set, time_summary))  # Find common times across all patients
aligned_train = data[data['time'].isin(common_times)]  # Keep only rows with common times
print(f"Aligned dataset contains {len(aligned_train)} rows.")


p_num
p01    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p02    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p03    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p04    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p05    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p06    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p10    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p11    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
p12    [00:00:00, 00:05:00, 00:10:00, 00:15:00, 00:20...
Name: time, dtype: object
Aligned dataset contains 74592 rows.


In [6]:
data.shape

(74592, 436)

Step 3: Handle Missing Data
Continuous Features (e.g., bg-X:XX, hr-X:XX):
Use forward fill, backward fill, or linear interpolation to fill gaps in time-series data.

In [7]:
# # Identify continuous columns (e.g., bg, hr, etc.)
# continuous_cols = [col for col in data.columns if 'bg-' in col or 'hr-' in col]

# # Forward and backward fill for continuous columns
# data[continuous_cols] = data[continuous_cols].ffill().bfill()

# # Confirmation message
# print("Forward and backward filling for continuous columns is complete!")


In [8]:
# Define bg-X:XX and hr-X:XX columns
bg_cols = [col for col in data.columns if col.startswith('bg-')]
hr_cols = [col for col in data.columns if col.startswith('hr-')]

# Apply forward/backward fill first
data[bg_cols] = data[bg_cols].ffill().bfill()
data[hr_cols] = data[hr_cols].ffill().bfill()

# Use KNN for remaining missing values (if any)
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
data[bg_cols + hr_cols] = imputer.fit_transform(data[bg_cols + hr_cols])

# Confirmation
print("Missing values in bg-X:XX and hr-X:XX have been handled.")


Missing values in bg-X:XX and hr-X:XX have been handled.


Activity Columns (activity-X:XX):
Replace missing activities with 'unknown' or the most frequent activity.

In [9]:
# activity_cols = [col for col in data.columns if 'activity-' in col]
# data[activity_cols] = data[activity_cols].fillna('unknown')

# # Confirmation message
# print("Missing activity values have been filled with 'unknown'.")


In [10]:
# insulin_cols = [col for col in data.columns if 'insulin-' in col]
# carbs_cols = [col for col in data.columns if 'carbs-' in col]
# data[insulin_cols + carbs_cols] = data[insulin_cols + carbs_cols].fillna(0)

In [11]:
# Re-check missing data

In [12]:
# Count total missing values across the entire dataset
total_missing = data.isnull().sum().sum()
print(f"Total missing values in the dataset: {total_missing}")

Total missing values in the dataset: 10796047


In [13]:
# Count missing values for each column
missing_by_column = data.isnull().sum()
print(missing_by_column[missing_by_column > 0])  # Display only columns with missing values


insulin-5:55      8288
insulin-5:50      8288
insulin-5:45      8288
insulin-5:40      8288
insulin-5:35      8288
                 ...  
activity-0:20    73610
activity-0:15    73629
activity-0:10    73623
activity-0:05    73607
activity-0:00    73621
Length: 288, dtype: int64


In [14]:
# Calculate the percentage of missing values for each column
missing_percentage = (data.isnull().sum() / len(data)) * 100
print(missing_percentage[missing_percentage > 0])  # Display only columns with missing values


insulin-5:55     11.111111
insulin-5:50     11.111111
insulin-5:45     11.111111
insulin-5:40     11.111111
insulin-5:35     11.111111
                   ...    
activity-0:20    98.683505
activity-0:15    98.708977
activity-0:10    98.700933
activity-0:05    98.679483
activity-0:00    98.698252
Length: 288, dtype: float64


## do KNN on rest

In [15]:
from sklearn.impute import KNNImputer

# Columns to exclude from KNN imputation
exclude_cols = ['id', 'p_num', 'time']  # Explicit exclusions
exclude_cols += [col for col in data.columns if col.startswith('insulin-') or col.startswith('activity-')]

# Columns to include in KNN imputation
knn_cols = [col for col in data.columns if col not in exclude_cols]

# Initialize the KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Apply KNN imputation to the selected columns
data[knn_cols] = imputer.fit_transform(data[knn_cols])

# Confirmation
print(f"KNN imputation applied to {len(knn_cols)} columns.")


KNN imputation applied to 289 columns.


In [16]:
# Save the updated dataset to a CSV file
data.to_csv('imputed_dataset.csv', index=False)

print("Dataset saved to 'imputed_dataset.csv'")

Dataset saved to 'imputed_dataset.csv'
