In [1]:
import sys

# Repository root path: a relative path pointing two directory levels up.
# Defined once and reused both for imports and for building data/result paths.
root = 2 * "../"

# Put the repository root on the import path so that the `src` package resolves.
sys.path.append(root)

# Packages
import pandas as pd
import numpy as np

# Project helpers used throughout this notebook
from src.utils import (
    calculate_1d_statistic,
    calculate_all_statistic,
    convert_statistic_to_csv,
)

### Minute Data Statistical Analysis

In [2]:
# Load the minute-resolution dataset from the repository's data directory.
minute_data = pd.read_csv(root + "data/ppm/ori/minute.csv")

# Report the first and last timestamps. Positional access (.iloc) is used so
# the lookup does not depend on the frame carrying a default 0..n-1 index.
print(f"Minute Data Range: {minute_data['DateTime'].iloc[0]} ~ {minute_data['DateTime'].iloc[-1]}")
print(f"Minute Data Shape: {minute_data.shape}")

# Preview the first rows.
minute_data.head()

Minute Data Range: 2016-01-01 12:04:00 ~ 2021-12-31 23:59:00
Minute Data Shape: (3086701, 13)


Unnamed: 0,DateTime,Raw_Flow,Raw_temperature,Raw_electrical conductivity,Raw_turbidity,Raw_alkalinity,Raw_pH,Raw_TOC,PAC,Pre_chlorine,Sed_chlorine,Sed_turbidity,Mid_chlorine
0,2016-01-01 12:04:00,2375.0,8.9,188.053097,2.025,33.25,7.175,1.668937,13.721414,2.3,1.03,0.27,0.5
1,2016-01-01 12:05:00,2375.0,8.875,188.191882,2.05,33.25,7.175,1.668937,13.236815,2.28,1.04,0.27,0.5
2,2016-01-01 12:06:00,2383.75,8.875,188.191882,2.05,33.25,7.1785,1.668937,12.605561,2.28,1.04,0.27,0.5
3,2016-01-01 12:07:00,2376.25,8.875,188.560886,2.05,33.25,7.1785,1.668937,11.576227,2.29,1.04,0.27,0.5
4,2016-01-01 12:08:00,2382.5,8.875,188.191882,2.0,33.25,7.175,1.668937,11.600207,2.31,1.04,0.27,0.5


In [3]:
# Calculate the statistics for one feature (the 'Raw_Flow' column) as an
# example; the returned dictionary is inspected in the next cell.
stats = calculate_1d_statistic(minute_data["Raw_Flow"])

In [4]:
# The statistics are stored in a dictionary. Its keys cover descriptive
# statistics ('mean', 'std', 'variance', 'min', 'max', 'range', 'median',
# 'q25', 'q75', 'iqr'), lag/serial correlations ('lag_1_correlation', ...,
# 'serial_correlation', 'autocorr_lag1', ...), distribution shape
# ('skewness', 'kurtosis', 'is_normal'), and 'trend' / rolling-window diffs.
stats.keys()

dict_keys(['mean', 'std', 'variance', 'min', 'max', 'range', 'median', 'q25', 'q75', 'iqr', 'lag_1_correlation', 'lag_2_correlation', 'lag_3_correlation', 'lag_4_correlation', 'lag_5_correlation', 'serial_correlation', 'skewness', 'kurtosis', 'is_normal', 'autocorr_lag1', 'autocorr_lag2', 'autocorr_lag3', 'trend', 'rolling_mean_diff', 'rolling_std_diff'])

In [5]:
# Compute the statistics for every column of the minute data except 'DateTime'.
stats = calculate_all_statistic(minute_data.drop("DateTime", axis=1))

# Write the resulting statistics to a CSV file in the results directory.
convert_statistic_to_csv(stats, f"{root}results/ppm/02.minute_data_stats.csv")

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:09<00:00,  1.27it/s]


In [None]:
# Load and check the statistics written by the previous cell. The file name
# must match the one passed to convert_statistic_to_csv
# ("02.minute_data_stats.csv"); the former "03." prefix pointed at a file this
# notebook never writes, so a fresh Restart & Run All would not read back its
# own results.
stats = pd.read_csv(root + "results/ppm/02.minute_data_stats.csv")
stats.head()

Unnamed: 0,feature,mean,std,variance,min,max,range,median,q25,q75,...,serial_correlation,skewness,kurtosis,is_normal,autocorr_lag1,autocorr_lag2,autocorr_lag3,trend,rolling_mean_diff,rolling_std_diff
0,Raw_Flow,2686.633547,232.938633,54260.406864,1553.272,3822.524,2269.252,2703.665,2513.75,2838.75,...,0.992294,0.006401,1.080627,0.0,0.992294,0.98975,0.986423,4.424926e-05,-60.200216,-0.149648
1,Raw_temperature,13.467853,5.054064,25.54356,5.1,23.33,18.23,12.5,8.981,18.244,...,0.999981,0.232551,-1.301252,0.0,0.99998,0.999976,0.999977,3.449876e-07,0.371469,0.58331
2,Raw_electrical conductivity,188.994969,35.594298,1266.95408,91.827615,276.218002,184.390387,193.621134,164.496036,215.545079,...,0.999312,-0.408299,-0.533709,0.0,0.999312,0.998762,0.998274,-3.613121e-06,4.908232,4.213572
3,Raw_turbidity,2.62653,4.396237,19.326903,0.206,60.206,60.0,1.25,0.98,1.931,...,0.999709,5.175664,36.098684,0.0,0.999709,0.999612,0.999585,-7.733148e-08,-0.01427,-1.730959
4,Raw_alkalinity,35.515682,8.151473,66.446506,10.319,57.5,47.181,35.75,30.9,41.73,...,0.999807,-0.70453,0.530925,0.0,0.999807,0.999621,0.999436,-4.667195e-06,8.305092,-0.896938


### Hour Data Statistical Analysis

In [7]:
# Load the hour-resolution dataset from the repository's processed-data directory.
hour_data = pd.read_csv(root + "data/ppm/processed/hour.csv")

# Report the first and last timestamps. Positional access (.iloc) is used so
# the lookup does not depend on the frame carrying a default 0..n-1 index.
print(f"Hour Data Range: {hour_data['DateTime'].iloc[0]} ~ {hour_data['DateTime'].iloc[-1]}")
print(f"Hour Data Shape: {hour_data.shape}")

# Preview the first rows.
hour_data.head()

Hour Data Range: 2016-01-01 12:00:00 ~ 2021-12-31 23:00:00
Hour Data Shape: (52596, 13)


Unnamed: 0,DateTime,Raw_Flow,Raw_temperature,Raw_electrical conductivity,Raw_turbidity,Raw_alkalinity,Raw_pH,Raw_TOC,PAC,Pre_chlorine,Sed_chlorine,Sed_turbidity,Mid_chlorine
0,2016-01-01 12:00:00,2375.0,8.907143,186.921133,1.928125,33.250446,7.175125,1.668937,10.961077,2.291607,1.03875,0.27,0.513036
1,2016-01-01 13:00:00,2576.25,8.935417,185.874105,1.697917,35.263333,7.170042,1.668937,10.942969,2.291667,1.050667,0.27,0.53
2,2016-01-01 14:00:00,2586.25,8.954167,187.624267,1.859167,32.96375,7.162517,1.668937,10.876959,2.291167,1.064333,0.27,0.533
3,2016-01-01 15:00:00,2525.0,8.957083,185.638802,2.089167,33.22875,7.173308,1.668937,10.99469,2.291167,1.03,0.270333,0.5255
4,2016-01-01 16:00:00,2455.0,8.956667,183.033574,2.204167,32.404167,7.182467,1.668937,11.109072,2.293167,1.0165,0.2715,0.5345


In [8]:
# Compute the statistics for every column of the hour data except 'DateTime'.
stats = calculate_all_statistic(hour_data.drop("DateTime", axis=1))

# Write the resulting statistics to a CSV file in the results directory.
convert_statistic_to_csv(stats, f"{root}results/ppm/02.hour_data_stats.csv")

100%|██████████| 12/12 [00:00<00:00, 150.40it/s]


In [None]:
# Load and check the statistics written by the previous cell. The file name
# must match the one passed to convert_statistic_to_csv
# ("02.hour_data_stats.csv"); the former "03." prefix pointed at a file this
# notebook never writes, so a fresh Restart & Run All would not read back its
# own results.
stats = pd.read_csv(root + "results/ppm/02.hour_data_stats.csv")
stats.head()

Unnamed: 0,feature,mean,std,variance,min,max,range,median,q25,q75,...,serial_correlation,skewness,kurtosis,is_normal,autocorr_lag1,autocorr_lag2,autocorr_lag3,trend,rolling_mean_diff,rolling_std_diff
0,Raw_Flow,2676.946544,239.32333,57275.656215,1558.584,3812.262,2253.678,2696.7005,2502.5,2836.25,...,0.784837,-0.013714,0.838294,7.304834e-173,0.784822,0.629759,0.534031,0.003187,-78.771685,13.194652
1,Raw_temperature,13.395478,5.02304,25.230935,5.129167,23.111167,17.982,12.315,9.056917,18.086667,...,0.99994,0.267597,-1.277441,0.0,0.999927,0.999808,0.999669,2.3e-05,0.438393,0.519881
2,Raw_electrical conductivity,189.460705,35.276507,1244.431962,94.004339,272.291355,178.287016,194.761261,165.23424,216.744606,...,0.995959,-0.446657,-0.492601,0.0,0.995937,0.992694,0.990598,-0.000223,4.22544,3.57347
3,Raw_turbidity,2.61344,4.347092,18.89721,0.325,53.01005,52.68505,1.2525,0.985,1.985275,...,0.999064,5.23824,36.988696,0.0,0.999063,0.998078,0.997166,-5e-06,0.023222,-1.722119
4,Raw_alkalinity,35.698678,8.149534,66.41491,10.34405,55.527917,45.183867,35.920833,31.073792,41.789062,...,0.994052,-0.729998,0.544332,0.0,0.994051,0.990462,0.989395,-0.000275,8.494863,-0.870694
