In [3]:
import pandas as pd
# Read the combined culture log into a DataFrame and show it as the cell output.
df_combined = pd.read_csv(
    filepath_or_buffer='cyano_culture_datasets/04-15-2025 culture/combined_dataset.csv',
)
df_combined

Unnamed: 0,unixtime,datetime,temperature (C),pH,dO,red_intensity,green_intensity,blue_intensity,color_combined
0,1744741273,2025-04-15 18:21:13.543205,,7.343,0.00,,,,
1,1744741274,2025-04-15 18:21:14.532573,,7.343,0.00,,,,
2,1744741275,2025-04-15 18:21:15.519331,,7.344,0.00,,,,
3,1744741276,2025-04-15 18:21:16.507702,,7.344,0.00,,,,
4,1744741277,2025-04-15 18:21:17.494259,,7.343,0.00,,,,
...,...,...,...,...,...,...,...,...,...
693024,1745406761,2025-04-23 11:12:41.293880,28.98,7.064,7.78,,,,
693025,1745406762,2025-04-23 11:12:42.279130,28.97,7.063,7.78,,,,
693026,1745406763,2025-04-23 11:12:43.266820,28.97,7.062,7.78,,,,
693027,1745406764,2025-04-23 11:12:44.255699,28.98,7.061,7.79,,,,


In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 1. Load and examine the dataset
df = pd.read_csv('cyano_culture_datasets/04-15-2025 culture/combined_dataset.csv')
print(f"Original dataset shape: {df.shape}")

# 2. Parse the datetime strings into real timestamps.
#    The column stays a regular column; the index is left as the row position,
#    which the positional (iloc) segment logic further down relies on.
df['datetime'] = pd.to_datetime(df['datetime'])

# 3. Nearest-neighbour imputation of the sensor features
features_of_interest = ['temperature (C)', 'pH', 'dO', 'green_intensity']
df_subset = df[['unixtime'] + features_of_interest].copy()

# Record which rows carry a *measured* green_intensity before imputation fills
# the gaps. These row positions delimit the summary segments computed later.
# Taking the mask here avoids re-reading the whole CSV from disk afterwards.
original_green_intensity = df_subset['green_intensity'].copy()
valid_green_intensity = original_green_intensity.notna()
valid_indices = np.flatnonzero(valid_green_intensity.to_numpy())

# Before imputation, let's count the missing values
print("Missing values before imputation:")
print(df_subset[features_of_interest].isna().sum())

# Standardise unixtime so its raw magnitude (~1.7e9) does not dominate the
# neighbour distance calculation.
scaler = StandardScaler()
unixtime_scaled = scaler.fit_transform(df_subset[['unixtime']])

# Imputation input: scaled time plus the (still gappy) features.
# The index is pinned to the source frame so the write-back below aligns row
# for row even if the frame does not use a default RangeIndex.
imputation_df = pd.DataFrame(
    unixtime_scaled, columns=['unixtime_scaled'], index=df_subset.index
)
for col in features_of_interest:
    imputation_df[col] = df_subset[col]

# NOTE(review): KNNImputer measures distance over *all* columns of
# imputation_df that are present in both rows, so neighbours are chosen by
# scaled time together with the unscaled sensor values, not by time alone.
# Its cost also grows with (rows needing imputation x rows fitted), which may
# be heavy for a log of roughly 690k rows. Confirm both are intended.
imputer = KNNImputer(n_neighbors=5)
imputed_array = imputer.fit_transform(imputation_df)

# Convert back to a dataframe with the same labels as the input
imputed_df = pd.DataFrame(
    imputed_array, columns=imputation_df.columns, index=imputation_df.index
)

# Write the imputed features (not the scaled unixtime) back to both the
# working subset and the full dataframe.
for col in features_of_interest:
    df_subset[col] = imputed_df[col]
    df[col] = imputed_df[col]

# Check missing values after imputation
print("\nMissing values after imputation:")
print(df[features_of_interest].isna().sum())

# Function to compute summary statistics for a segment of data
def compute_segment_stats(segment, feature):
    """Summarise one feature over a segment of consecutive readings.

    Parameters
    ----------
    segment : pandas.Series
        Values of a single feature, in sample order.
    feature : str
        Feature name, used as the prefix of every key in the result.

    Returns
    -------
    dict
        Keys ``{feature}_mean``, ``{feature}_min``, ``{feature}_max``,
        ``{feature}_std``, ``{feature}_range`` and
        ``{feature}_rate_of_change``, in that order.

        * Empty segment: every value is NaN.
        * ``rate_of_change`` is the least-squares slope of the values against
          their position in the segment (change per sample, not per second).
          It is 0 when fewer than two values are observed or the observed
          values are constant, and NaN when no value is observed at all.
    """
    stat_names = ('mean', 'min', 'max', 'std', 'range', 'rate_of_change')
    if len(segment) == 0:
        return {f'{feature}_{name}': np.nan for name in stat_names}

    # Basic statistics (pandas reductions skip NaN by default)
    lowest = segment.min()
    highest = segment.max()
    stats = {
        f'{feature}_mean': segment.mean(),
        f'{feature}_min': lowest,
        f'{feature}_max': highest,
        f'{feature}_std': segment.std(),
        f'{feature}_range': highest - lowest,
    }

    # Slope is fitted on observed points only, keeping their original
    # positions as x. Without this, a single NaN makes np.std() NaN and the
    # slope would be silently reported as 0 while the other statistics
    # ignore the gap.
    y = np.asarray(segment, dtype=float)
    x = np.arange(len(y))
    observed = ~np.isnan(y)
    x_obs, y_obs = x[observed], y[observed]

    if y_obs.size == 0:
        slope = np.nan  # nothing measured: consistent with the empty case
    elif y_obs.size > 1 and np.std(y_obs) > 0:
        slope = np.polyfit(x_obs, y_obs, 1)[0]
    else:
        slope = 0.0  # single point or flat signal

    stats[f'{feature}_rate_of_change'] = slope
    return stats

# Build one summary record per measured green_intensity reading.
# Each segment runs from the row just after the previous measured reading
# (row 0 for the first one) through the row of the current reading, inclusive.
# NOTE(review): the "hourly" naming presumes readings arrive about once an
# hour; nothing here enforces that spacing.
segment_ends = list(valid_indices)
segment_starts = [0] + [previous_end + 1 for previous_end in segment_ends[:-1]]

hourly_summary = []
for start_idx, current_idx in zip(segment_starts, segment_ends):
    segment = df.iloc[start_idx:current_idx + 1]
    marker_row = df.iloc[current_idx]

    # Identify the record by the row that closes the segment
    summary_row = {
        key: marker_row[key]
        for key in ('unixtime', 'datetime', 'green_intensity')
    }

    # Append the per-feature statistics for this segment
    for feature in ['temperature (C)', 'pH', 'dO']:
        summary_row.update(compute_segment_stats(segment[feature], feature))

    hourly_summary.append(summary_row)

# One row per segment
hourly_df = pd.DataFrame(hourly_summary)

# Show a preview and the overall size of the summary
print("\nHourly summary dataframe:")
print(hourly_df.head())
print(f"Hourly summary shape: {hourly_df.shape}")

# Persist both the imputed readings and the per-segment summary
df.to_csv('imputed_dataset.csv', index=False)
hourly_df.to_csv('hourly_summary.csv', index=False)

# 5. Visualization to verify imputation and summary
# One stacked panel per series: (column, panel title, y-axis label)
imputed_panels = [
    ('temperature (C)', 'Imputed Temperature', 'Temperature (C)'),
    ('pH', 'Imputed pH', 'pH'),
    ('dO', 'Imputed dO', 'dO'),
    ('green_intensity', 'Green Intensity', 'Green Intensity'),
]

fig, axes = plt.subplots(len(imputed_panels), 1, figsize=(15, 10))
for ax, (column, panel_title, y_label) in zip(axes, imputed_panels):
    ax.plot(df['datetime'], df[column])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)

fig.tight_layout()
fig.savefig('imputed_data_visualization.png')

# Plot summary statistics: a 3x3 grid with one row per feature and
# columns for mean, min/max envelope, and rate of change.
plt.figure(figsize=(15, 15))

# (column prefix in hourly_df, label used in the panel titles)
summary_rows = [
    ('temperature (C)', 'Temperature'),
    ('pH', 'pH'),
    ('dO', 'dO'),
]
timestamps = hourly_df['datetime']

for row_number, (prefix, label) in enumerate(summary_rows):
    first_panel = 3 * row_number + 1

    # Mean
    plt.subplot(3, 3, first_panel)
    plt.plot(timestamps, hourly_df[f'{prefix}_mean'])
    plt.title(f'Hourly Mean {label}')
    plt.xticks(rotation=45)

    # Min (green) and max (red) on the same axes
    plt.subplot(3, 3, first_panel + 1)
    plt.plot(timestamps, hourly_df[f'{prefix}_min'], 'g-',
             timestamps, hourly_df[f'{prefix}_max'], 'r-')
    plt.title(f'Hourly Min/Max {label}')
    plt.xticks(rotation=45)
    plt.legend(['Min', 'Max'])

    # Rate of change
    plt.subplot(3, 3, first_panel + 2)
    plt.plot(timestamps, hourly_df[f'{prefix}_rate_of_change'])
    plt.title(f'{label} Rate of Change')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig('hourly_summary_visualization.png')

print("\nProcessing complete. Files saved: imputed_dataset.csv, hourly_summary.csv")

ModuleNotFoundError: No module named 'sklearn'

# The big open question is whether we want to do data imputation at all.

# We will probably also want to apply some form of normalization.

In [4]:
# Standardise the sensor columns (zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit and write back over the SAME set of columns. The first three columns
# (row counter, unixtime, datetime) are left untouched. Fitting on
# iloc[:, 1:] instead would pull in unixtime and the datetime strings and
# return two more columns than the iloc[:, 3:] target can hold.
# Only numeric columns are scaled so a non-numeric column cannot break the fit.
sensor_cols = df_combined.iloc[:, 3:].select_dtypes(include='number').columns
df_combined[sensor_cols] = scaler.fit_transform(df_combined[sensor_cols])
df_combined

ModuleNotFoundError: No module named 'sklearn'