In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# Load data

In [2]:
# Experiment date tag; it names both the data folder and every file inside it.
date = "09-20-24"
data_location = rf"E:\Project 6 - Temperature\Experiments\data_analysis\{date}"

# Combined feature table that every later cell reads from.
combined_features_csv = data_location + rf"\{date}_all_features_combined_renumbered.csv"
all_data = pd.read_csv(combined_features_csv)

# Calculate full period

In [3]:
def _consecutive_periods(events):
    """Return time differences between consecutive events and their cycle labels.

    Parameters
    ----------
    events : pd.DataFrame
        Rows of a single TYPE for one track; must contain "TIME" and "CYCLE".

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``period_values[i]`` is the TIME difference between event ``i + 1`` and
        event ``i`` (in TIME order); ``cycle_values[i]`` is the CYCLE of event ``i``.
        Both arrays always have the same length.
    """
    # Rows without a TIME cannot produce a period. Dropping them up front keeps the
    # period and cycle arrays the same length (diff().dropna() alone would not).
    ordered = events.dropna(subset=["TIME"]).sort_values("TIME")
    # diff() has no predecessor for the first row, so its first entry is skipped.
    period_values = ordered["TIME"].diff().to_numpy()[1:]
    # Each period is labelled with the cycle of the earlier of its two events.
    cycle_values = ordered["CYCLE"].to_numpy()[:-1]
    return period_values, cycle_values


def _pad_with_nan(values, length):
    """Return ``values`` as an array extended with NaN up to ``length`` entries."""
    return pd.Series(values).reindex(range(length)).to_numpy()


# One frame per track, concatenated once at the end (avoids quadratic re-copying).
full_period_frames = []

for position in all_data["POSITION"].unique():
    position_data = all_data[all_data["POSITION"] == position]
    for track_id in position_data["TRACK_ID"].unique():
        track_data = position_data[position_data["TRACK_ID"] == track_id]
        # Peak-to-peak time differences
        peak_to_peak_period, peak_cycle_numbers = _consecutive_periods(
            track_data[track_data["TYPE"] == "PEAK"]
        )
        # Trough-to-trough time differences
        trough_to_trough_period, trough_cycle_numbers = _consecutive_periods(
            track_data[track_data["TYPE"] == "TROUGH"]
        )
        # Peak and trough columns are paired by row position only; the shorter side
        # is filled with NaN so both fit in the same frame.
        n_rows = max(len(peak_to_peak_period), len(trough_to_trough_period))
        if n_rows == 0:
            # Fewer than two peaks and fewer than two troughs: nothing to report.
            continue
        full_period_frames.append(
            pd.DataFrame(
                {
                    "PEAK_TO_PEAK_PERIOD": _pad_with_nan(peak_to_peak_period, n_rows),
                    "PEAK_TO_PEAK_CYCLE_NUMBER": _pad_with_nan(peak_cycle_numbers, n_rows),
                    "TROUGH_TO_TROUGH_PERIOD": _pad_with_nan(trough_to_trough_period, n_rows),
                    "TROUGH_TO_TROUGH_CYCLE_NUMBER": _pad_with_nan(trough_cycle_numbers, n_rows),
                    # Scalars are broadcast to every row of this track
                    "TRACK_ID": track_id,
                    "POSITION": position,
                }
            )
        )

if full_period_frames:
    periods = pd.concat(full_period_frames, ignore_index=True)
else:
    # Keep the expected columns so the later merge on TRACK_ID/POSITION still works.
    periods = pd.DataFrame(
        columns=[
            "PEAK_TO_PEAK_PERIOD",
            "PEAK_TO_PEAK_CYCLE_NUMBER",
            "TROUGH_TO_TROUGH_PERIOD",
            "TROUGH_TO_TROUGH_CYCLE_NUMBER",
            "TRACK_ID",
            "POSITION",
        ]
    )

## Add temperature information

In [4]:
# Mean of MEAN_TEMPERATURE per (TRACK_ID, POSITION), attached to every period row of that track.
temperature = all_data.groupby(["TRACK_ID", "POSITION"])["MEAN_TEMPERATURE"].mean().reset_index()
# Drop a MEAN_TEMPERATURE column left over from an earlier run of this cell; otherwise a
# re-run would produce MEAN_TEMPERATURE_x / MEAN_TEMPERATURE_y instead of one column.
periods = pd.merge(
    periods.drop(columns="MEAN_TEMPERATURE", errors="ignore"),
    temperature,
    on=["TRACK_ID", "POSITION"],
    how="left",
)

# Save result

In [5]:
# Write the full-period table next to the input data, without the row index.
full_periods_csv = data_location + rf"\{date}_full_periods.csv"
periods.to_csv(full_periods_csv, index=False)

## Check result

In [None]:
# String copy of the cycle number, used as the colour key of the scatter plot
# (presumably so plotly assigns one discrete colour per cycle — confirm).
peak_cycle_labels = periods["PEAK_TO_PEAK_CYCLE_NUMBER"].astype(str)
periods["PEAK_TO_PEAK_CYCLE_NUMBER_STR"] = peak_cycle_labels
px.scatter(
    periods,
    x="MEAN_TEMPERATURE",
    y="PEAK_TO_PEAK_PERIOD",
    color="PEAK_TO_PEAK_CYCLE_NUMBER_STR",
)

In [None]:
# String copy of the cycle number, used as the colour key of the scatter plot
# (presumably so plotly assigns one discrete colour per cycle — confirm).
trough_cycle_labels = periods["TROUGH_TO_TROUGH_CYCLE_NUMBER"].astype(str)
periods["TROUGH_TO_TROUGH_CYCLE_NUMBER_STR"] = trough_cycle_labels
px.scatter(
    periods,
    x="MEAN_TEMPERATURE",
    y="TROUGH_TO_TROUGH_PERIOD",
    color="TROUGH_TO_TROUGH_CYCLE_NUMBER_STR",
)

# Calculate peak-to-trough and trough-to-peak periods

In [6]:
# Position to condition mapping
def position_to_condition_mapping(position):
    """Map a POSITION value to its condition index.

    Returns 0 for ``position <= 13``, 1 for ``14 <= position <= 27`` and 2 for
    everything else (including values that fall strictly between 13 and 14).
    """
    if 14 <= position <= 27:
        return 1
    return 0 if position <= 13 else 2


# Condition to peak-trough order mapping: which event type is treated as the
# first one within a cycle for each condition index.
condition_to_peak_trough_order_mapping = {
    0: "TROUGH-PEAK",
    1: "PEAK-TROUGH",
    2: "PEAK-TROUGH",
}

def _event_time(events, cycle):
    """Return the TIME of the first row of ``events`` whose CYCLE equals ``cycle``.

    ``events`` is expected to be sorted by TIME, so the first match is the earliest
    one. Returns ``None`` when no row has that cycle number.
    """
    matching_times = events.loc[events["CYCLE"] == cycle, "TIME"].values
    return matching_times[0] if len(matching_times) else None


def _extend_with_nan(values, length):
    """Return ``values`` as an array extended with NaN up to ``length`` entries."""
    return pd.Series(values).reindex(range(length)).to_numpy()


# Cycle offsets for the *second* event of each half period, per peak/trough order:
# (trough offset for peak-to-trough, peak offset for trough-to-peak).
# "PEAK-TROUGH": the trough of cycle c follows the peak of cycle c, and the next
# peak belongs to cycle c + 1. "TROUGH-PEAK": the other way round.
half_period_cycle_offsets = {
    "PEAK-TROUGH": (0, 1),
    "TROUGH-PEAK": (1, 0),
}

# One frame per track, concatenated once at the end (avoids quadratic re-copying).
partial_period_frames = []

for position in all_data["POSITION"].unique():
    condition = position_to_condition_mapping(position)
    peak_trough_order = condition_to_peak_trough_order_mapping[condition]
    trough_cycle_offset, peak_cycle_offset = half_period_cycle_offsets[peak_trough_order]
    position_data = all_data[all_data["POSITION"] == position]
    for track_id in position_data["TRACK_ID"].unique():
        track_data = position_data[position_data["TRACK_ID"] == track_id]
        peak_data = track_data[track_data["TYPE"] == "PEAK"].sort_values("TIME")
        trough_data = track_data[track_data["TYPE"] == "TROUGH"].sort_values("TIME")
        # Only cycles that have both a peak and a trough are considered
        common_cycles = np.intersect1d(peak_data["CYCLE"].values, trough_data["CYCLE"].values)
        if len(common_cycles) == 0:
            continue

        # Peak-to-trough: peak of this cycle -> trough of this cycle (or of the next
        # one, depending on the order). The cycle number is recorded together with
        # each period so that a skipped cycle cannot shift the labels of later ones.
        peak_to_trough_period = []
        peak_to_trough_cycle_numbers = []
        for cycle in common_cycles:
            peak_time = _event_time(peak_data, cycle)
            trough_time = _event_time(trough_data, cycle + trough_cycle_offset)
            if peak_time is None or trough_time is None:
                continue
            peak_to_trough_period.append(trough_time - peak_time)
            peak_to_trough_cycle_numbers.append(cycle)

        # Trough-to-peak: trough of this cycle -> peak of this cycle (or of the next
        # one, depending on the order).
        trough_to_peak_period = []
        trough_to_peak_cycle_numbers = []
        for cycle in common_cycles:
            trough_time = _event_time(trough_data, cycle)
            peak_time = _event_time(peak_data, cycle + peak_cycle_offset)
            if peak_time is None or trough_time is None:
                continue
            trough_to_peak_period.append(peak_time - trough_time)
            trough_to_peak_cycle_numbers.append(cycle)

        # A track is kept only when it yields at least one period of each kind.
        if len(peak_to_trough_period) == 0 or len(trough_to_peak_period) == 0:
            continue

        # The two halves are paired by row position only; the shorter side (periods
        # and cycle numbers alike) is filled with NaN.
        n_rows = max(len(peak_to_trough_period), len(trough_to_peak_period))
        partial_period_frames.append(
            pd.DataFrame(
                {
                    "PEAK_TO_TROUGH_PERIOD": _extend_with_nan(peak_to_trough_period, n_rows),
                    "TROUGH_TO_PEAK_PERIOD": _extend_with_nan(trough_to_peak_period, n_rows),
                    "PEAK_TO_TROUGH_CYCLE_NUMBER": _extend_with_nan(peak_to_trough_cycle_numbers, n_rows),
                    "TROUGH_TO_PEAK_CYCLE_NUMBER": _extend_with_nan(trough_to_peak_cycle_numbers, n_rows),
                    # Scalars are broadcast to every row of this track
                    "TRACK_ID": track_id,
                    "POSITION": position,
                }
            )
        )

if partial_period_frames:
    partial_periods = pd.concat(partial_period_frames, ignore_index=True)
else:
    # Keep the expected columns so the later merge on TRACK_ID/POSITION still works.
    partial_periods = pd.DataFrame(
        columns=[
            "PEAK_TO_TROUGH_PERIOD",
            "TROUGH_TO_PEAK_PERIOD",
            "PEAK_TO_TROUGH_CYCLE_NUMBER",
            "TROUGH_TO_PEAK_CYCLE_NUMBER",
            "TRACK_ID",
            "POSITION",
        ]
    )

## Add temperature information

In [7]:
# Mean of MEAN_TEMPERATURE per (TRACK_ID, POSITION), attached to every partial-period row of that track.
temperature = all_data.groupby(["TRACK_ID", "POSITION"])["MEAN_TEMPERATURE"].mean().reset_index()
# Drop a MEAN_TEMPERATURE column left over from an earlier run of this cell; otherwise a
# re-run would produce MEAN_TEMPERATURE_x / MEAN_TEMPERATURE_y instead of one column.
partial_periods = pd.merge(
    partial_periods.drop(columns="MEAN_TEMPERATURE", errors="ignore"),
    temperature,
    on=["TRACK_ID", "POSITION"],
    how="left",
)

# Clean outliers

In [8]:
# Remove any data points where the peak-to-trough or trough-to-peak period is negative
# These correspond to mislabelled cycle numbers most likely
negative_peak_to_trough = partial_periods[partial_periods["PEAK_TO_TROUGH_PERIOD"] < 0]
negative_trough_to_peak = partial_periods[partial_periods["TROUGH_TO_PEAK_PERIOD"] < 0]
# Filter with one combined mask. Dropping the two index sets one after the other raises
# KeyError when a row is negative in both columns, because the first drop has already
# removed a label the second drop asks for.
has_negative_period = (
    (partial_periods["PEAK_TO_TROUGH_PERIOD"] < 0)
    | (partial_periods["TROUGH_TO_PEAK_PERIOD"] < 0)
)
# NaN periods compare as False and are therefore kept, as before.
partial_periods = partial_periods[~has_negative_period].reset_index(drop=True)

# Save result

In [9]:
# Write the cleaned partial-period table next to the input data, without the row index.
partial_periods_csv = data_location + rf"\{date}_partial_periods.csv"
partial_periods.to_csv(partial_periods_csv, index=False)

## Check result

In [None]:
# String copy of the cycle number, used as the colour key of the scatter plot
# (presumably so plotly assigns one discrete colour per cycle — confirm).
peak_to_trough_labels = partial_periods["PEAK_TO_TROUGH_CYCLE_NUMBER"].astype(str)
partial_periods["PEAK_TO_TROUGH_CYCLE_NUMBER_STR"] = peak_to_trough_labels
px.scatter(
    partial_periods,
    x="MEAN_TEMPERATURE",
    y="PEAK_TO_TROUGH_PERIOD",
    color="PEAK_TO_TROUGH_CYCLE_NUMBER_STR",
)

In [None]:
# String copy of the cycle number, used as the colour key of the scatter plot
# (presumably so plotly assigns one discrete colour per cycle — confirm).
trough_to_peak_labels = partial_periods["TROUGH_TO_PEAK_CYCLE_NUMBER"].astype(str)
partial_periods["TROUGH_TO_PEAK_CYCLE_NUMBER_STR"] = trough_to_peak_labels
px.scatter(
    partial_periods,
    x="MEAN_TEMPERATURE",
    y="TROUGH_TO_PEAK_PERIOD",
    color="TROUGH_TO_PEAK_CYCLE_NUMBER_STR",
)