In [1]:
import pandas as pd
import numpy as np

import json

In [2]:
# load data
df = pd.read_hdf('sat-data.h5')
df

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
0,41469,2024-03-29 23:14:42,42162.865,0.0005158,3.2631,122.0675,309.1062,234.7614
1,22787,2024-03-29 22:56:42,42164.272,0.0005063,11.6348,7.7743,18.0968,174.8006
2,22787,2024-03-29 22:56:42,42164.272,0.0005063,11.6348,7.7743,18.0968,174.8006
3,41469,2024-03-29 21:49:27,42162.856,0.0005158,3.263,122.0702,309.1057,213.3868
4,38704,2024-03-29 21:39:33,41632.950,0.0210129,2.2449,62.3939,214.7218,144.2019
...,...,...,...,...,...,...,...,...
1551,33436,2024-03-01 01:37:31,42165.891,0.0001378,0.0211,124.6008,281.0132,157.3407
1552,33373,2024-03-01 01:08:34,42164.289,0.0001905,0.007,141.2321,196.4168,116.7912
1553,44457,2024-03-01 00:49:42,42164.775,0.0000261,0.0133,127.4057,303.8112,112.5159
1554,22787,2024-03-01 00:41:42,42163.703,0.0003622,11.6257,7.9197,345.0296,205.4834


In [3]:
convert_dict = {
    "NORAD_CAT_ID": "object",
    "EPOCH": "datetime64",
    "SEMIMAJOR_AXIS": "float64",
    "ECCENTRICITY": "float64",
    "INCLINATION": "float64",
    "RA_OF_ASC_NODE": "float64",
    "ARG_OF_PERICENTER": "float64",
    "MEAN_ANOMALY": "float64"
}

# Convert columns using the dictionary
df = df.astype(convert_dict, errors='ignore')

# Print the data types of all columns
print(df.dtypes)

NORAD_CAT_ID                 object
EPOCH                datetime64[ns]
SEMIMAJOR_AXIS              float64
ECCENTRICITY                float64
INCLINATION                 float64
RA_OF_ASC_NODE              float64
ARG_OF_PERICENTER           float64
MEAN_ANOMALY                float64
dtype: object


In [4]:
df[df['NORAD_CAT_ID'] == '8832']

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
8,8832,2024-03-29 21:34:21,42738.661,0.013807,20.1599,52.9431,74.0489,287.1113
43,8832,2024-03-29 09:34:28,42738.651,0.013807,20.1588,52.9495,74.0388,110.272
44,8832,2024-03-29 09:34:28,42738.651,0.013807,20.1588,52.9495,74.0388,110.272
69,8832,2024-03-28 21:32:56,42738.632,0.013805,20.1577,52.9558,74.0287,293.0323
105,8832,2024-03-28 08:33:27,42738.6,0.013805,20.1565,52.9626,74.0186,101.5567
154,8832,2024-03-27 08:31:42,42738.543,0.013802,20.1542,52.9753,74.0003,107.3985
155,8832,2024-03-27 08:31:42,42738.543,0.013802,20.1542,52.9753,74.0003,107.3985
193,8832,2024-03-26 19:47:34,42738.505,0.0138,20.1531,52.982,73.9858,279.6966
218,8832,2024-03-26 08:16:12,42738.457,0.013799,20.152,52.988,73.9786,109.8658
242,8832,2024-03-25 19:46:07,42738.416,0.013796,20.1508,52.9946,73.9679,285.6073


In [5]:
def deduplicate_by_date(
    data: pd.DataFrame,
    group_by_id_col: str,
    date_col: str,
    sort_ascending: bool = False,
) -> pd.DataFrame:
    """
    Deduplicates data by keeping the most recent entry for each group based on date.

    Args:
        data: A pandas DataFrame containing the data.
        group_by_id_col: The column to group the data by (e.g., satellite ID).
        date_col: The column containing the date information for sorting and deduplication.
        sort_ascending: Whether to sort in ascending order (False for most recent first).

    Returns:
        A pandas DataFrame containing the deduplicated data (copy of the input).
    """

    # Operate on a copy to avoid modifying original data
    data_copy = data.copy()

    # Extract the date part from the date column (assuming format YYYY-MM-DD)
    data_copy['date'] = pd.to_datetime(data_copy[date_col]).dt.date

    return (
        data_copy.groupby([group_by_id_col, 'date'])
        .apply(lambda x: x.sort_values(date_col, ascending=sort_ascending).head(1))
        .drop(columns='date')
        .reset_index(drop=True)  # Restructure for consistency
    )

# Example usage
deduplicated_data = deduplicate_by_date(df.copy(), 'NORAD_CAT_ID', 'EPOCH', sort_ascending=False)


In [6]:
deduplicated_data[deduplicated_data['NORAD_CAT_ID'] == '8832']

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
678,8832,2024-03-01 21:29:42,42738.513,0.013743,20.0977,53.2991,73.4786,101.5024
679,8832,2024-03-02 22:41:13,42738.631,0.013744,20.1,53.2856,73.4944,112.8025
680,8832,2024-03-03 20:48:48,42738.713,0.013744,20.1021,53.2737,73.5083,78.9187
681,8832,2024-03-04 10:42:45,42738.744,0.013744,20.1034,53.2663,73.5184,283.7773
682,8832,2024-03-05 18:25:01,42738.779,0.013747,20.1063,53.2493,73.5321,31.0683
683,8832,2024-03-06 12:28:24,42738.766,0.013748,20.108,53.2397,73.5446,297.1962
684,8832,2024-03-07 12:30:00,42738.741,0.01375,20.1102,53.2269,73.5558,291.3244
685,8832,2024-03-08 12:31:25,42738.667,0.013753,20.1124,53.2142,73.5675,285.4088
686,8832,2024-03-09 12:32:58,42738.575,0.013755,20.1146,53.2015,73.5817,279.5215
687,8832,2024-03-10 00:34:34,42738.525,0.013756,20.1156,53.1951,73.5868,96.7832


In [7]:
deduplicated_data

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
0,20253,2024-03-01 17:36:02,42163.595,0.000675,12.6674,356.2155,349.1208,204.5277
1,20253,2024-03-02 16:55:42,42163.733,0.000677,12.6671,356.2095,349.4270,195.1141
2,20253,2024-03-03 19:50:04,42163.895,0.000681,12.6668,356.2024,350.1068,239.1494
3,20253,2024-03-04 16:39:42,42164.019,0.000682,12.6666,356.1970,350.5596,191.9705
4,20253,2024-03-05 04:48:42,42164.093,0.000683,12.6664,356.1939,350.8786,14.4059
...,...,...,...,...,...,...,...,...
697,8832,2024-03-25 19:46:07,42738.416,0.013796,20.1508,52.9946,73.9679,285.6073
698,8832,2024-03-26 19:47:34,42738.505,0.013800,20.1531,52.9820,73.9858,279.6966
699,8832,2024-03-27 08:31:42,42738.543,0.013802,20.1542,52.9753,74.0003,107.3985
700,8832,2024-03-28 21:32:56,42738.632,0.013805,20.1577,52.9558,74.0287,293.0323


In [8]:
def reshape_by_ws(data: pd.DataFrame, w: int = 20, step: int = 1) -> pd.DataFrame:
  """
  Reshapes time series data into segments using the Windowing Sliding (WS) method.

  Args:
    data: A pandas DataFrame containing the time series data with columns:
      - NORAD_CAT_ID: The NORAD Catalog ID (string).
      - EPOCH: The epoch time (datetime).
      - SEMIMAJOR_AXIS, ECCENTRICITY, INCLINATION, RA_OF_ASC_NODE, ARG_OF_PERICENTER, MEAN_ANOMALY: Orbital elements (numerical).
    w: The slicing width (window size) of each segment.
    step: The slicing step length for the window sliding (default=1).

  Returns:
    A pandas DataFrame containing the reshaped data with segments as rows. Each segment
    contains columns for NORAD_CAT_ID, all orbital elements, and a segment ID.

  Raises:
    ValueError: If the slicing width (w) is less than 1.
  """

  if w < 1:
      raise ValueError("Slicing width (w) must be greater than or equal to 1")

  def segment_data(group):
    """Segments data for a single NORAD_CAT_ID group."""
    # Sort by epoch (ensure data is ordered chronologically)
    group = group.sort_values(by='EPOCH')

    n = len(group)
    # Calculate number of segments (s) using integer division
    s = n // w

    # Create list to store segments with segment ID
    segments = []
    for i in range(s):
      start_idx = i * step
      end_idx = min(start_idx + w, n)

      # Extract segment data and create segment ID
      segment_id = i
      segment = group[['NORAD_CAT_ID'] + list(group.columns[1:])].iloc[start_idx:end_idx]
      segment['segment_id'] = segment_id

      segments.append(segment)

    segment.groupby('segment_id')
    
    return pd.concat(segments)

  # 1. Group data by NORAD_CAT_ID
  grouped_data = data.groupby('NORAD_CAT_ID')
  # 1.1. Filter groups with less than window size (w) data points
  filtered_data = grouped_data.filter(lambda x: len(x) >= w)

  # 2. group the filtered data
  grouped_filtered_data = filtered_data.groupby('NORAD_CAT_ID')
  # 3. Apply segment_data function to each filtered group, grouping by segment ID
  reshaped_data = grouped_filtered_data.apply(segment_data).reset_index(drop=True)

  # drop segment_id column
  return reshaped_data

reshaped_data = reshape_by_ws(deduplicated_data, w=5, step=1)


In [9]:
reshaped_data.dtypes

NORAD_CAT_ID                 object
EPOCH                datetime64[ns]
SEMIMAJOR_AXIS              float64
ECCENTRICITY                float64
INCLINATION                 float64
RA_OF_ASC_NODE              float64
ARG_OF_PERICENTER           float64
MEAN_ANOMALY                float64
segment_id                    int64
dtype: object

In [10]:
def normalize_data(data: pd.DataFrame) -> pd.DataFrame:
  """
  Normalizes the segment_vector column in a DataFrame using z-score normalization.

  Args:
    data: A pandas DataFrame containing columns:
      - segment_id: The segment ID (string).
      - segment_vector: A vector containing the six orbital elements (numpy array).

  Returns:
    A pandas DataFrame with the same columns but the segment_vector normalized.
  """

  # Get the segment vectors
  segment_vectors = data[['SEMIMAJOR_AXIS', 'ECCENTRICITY', 'INCLINATION', 'RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY']]

  # Normalize each vector using z-score (mean 0, standard deviation 1)
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler()
  normalized_vectors = scaler.fit_transform(segment_vectors)
  normalized_vectors = pd.DataFrame(normalized_vectors, columns=segment_vectors.columns)
  print(normalized_vectors)

  # Replace the segment_vector column with the normalized vectors
  # data = data.join(normalized_vectors[normalized_vectors.columns.difference(data.columns)])
  data = pd.concat([normalized_vectors, data[data.columns.difference(normalized_vectors.columns)]], axis=1)


  return data

normalized_data = normalize_data(reshaped_data.copy())

     SEMIMAJOR_AXIS  ECCENTRICITY  INCLINATION  RA_OF_ASC_NODE  \
0          0.252410     -0.410347     1.529528        1.503390   
1          0.252763     -0.410206     1.529473        1.503340   
2          0.253177     -0.410010     1.529418        1.503281   
3          0.253493     -0.409912     1.529382        1.503237   
4          0.253682     -0.409863     1.529345        1.503211   
..              ...           ...          ...             ...   
630        1.721118      0.389131     2.889564       -0.999866   
631        1.721207      0.389290     2.890094       -1.000006   
632        1.721174      0.389339     2.890405       -1.000086   
633        1.721110      0.389498     2.890808       -1.000191   
634        1.720921      0.389645     2.891210       -1.000296   

     ARG_OF_PERICENTER  MEAN_ANOMALY  
0             1.301334      0.377643  
1             1.303986      0.279845  
2             1.309872      0.737328  
3             1.313793      0.247186  
4           

In [11]:
def label_data(data: pd.DataFrame) -> pd.DataFrame:
  """
  Labels data segments with one-hot encoded maneuver categories based on NORAD_CAT_ID.

  Args:
    data: A pandas DataFrame containing columns:
      - NORAD_CAT_ID: The NORAD Catalog ID (string).
      - segment_id: The segment ID (string).
      - segment_vector: A vector containing the six normalized orbital elements (numpy array).

  Returns:
    A pandas DataFrame with additional columns:
      - maneuver_category: The maneuver category string (e.g., "EW").
      - maneuver_label: The one-hot encoded maneuver category (list of 3 integers).
  """
  # import json
  # Load the maneuver categories from a JSON file
  with open('category-map.json', 'r') as f:
    MANEUVER_CATEGORIES = json.load(f)

  category_map = {}
  for key, value in MANEUVER_CATEGORIES.items():
    for cat_id in value:
      category_map[cat_id] = key
    
  # Assuming you have a function (or external data) to get maneuver categories by NORAD_CAT_ID
  def get_maneuver_category(cat_id):
    return category_map.get(cat_id, 'Unknown')

  data['maneuver_category'] = data['NORAD_CAT_ID'].apply(get_maneuver_category)

  return data

labeled_data = label_data(normalized_data.copy())

In [38]:
labeled_data

Unnamed: 0,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPOCH,NORAD_CAT_ID,segment_id,maneuver_category
0,0.252410,-0.410347,1.529528,1.503390,1.301334,0.377643,2024-03-01 17:36:02,20253,0,EW
1,0.252763,-0.410206,1.529473,1.503340,1.303986,0.279845,2024-03-02 16:55:42,20253,0,EW
2,0.253177,-0.410010,1.529418,1.503281,1.309872,0.737328,2024-03-03 19:50:04,20253,0,EW
3,0.253493,-0.409912,1.529382,1.503237,1.313793,0.247186,2024-03-04 16:39:42,20253,0,EW
4,0.253682,-0.409863,1.529345,1.503211,1.316556,-1.597536,2024-03-05 04:48:42,20253,0,EW
...,...,...,...,...,...,...,...,...,...,...
630,1.721118,0.389131,2.889564,-0.999866,-1.085180,1.200968,2024-03-04 10:42:45,8832,3,NM
631,1.721207,0.389290,2.890094,-1.000006,-1.085061,-1.424430,2024-03-05 18:25:01,8832,3,NM
632,1.721174,0.389339,2.890405,-1.000086,-1.084953,1.340378,2024-03-06 12:28:24,8832,3,NM
633,1.721110,0.389498,2.890808,-1.000191,-1.084856,1.279375,2024-03-07 12:30:00,8832,3,NM


In [89]:
labeled_data[(labeled_data['NORAD_CAT_ID'] == '8832') & (labeled_data['segment_id'] == 0)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category']).to_numpy()

array([[1.720527930744215, 0.3890636837445691, 2.8885213488367287,
        -0.9995948614526118, -1.0855245206809843, -0.6926888840264416,
        Timestamp('2024-03-01 21:29:42')],
       [1.7208292569566974, 0.3891187395595195, 2.8889420161306156,
        -0.9997064113420899, -1.0853877042955506, -0.5752919423429591,
        Timestamp('2024-03-02 22:41:13')],
       [1.7210386531382547, 0.3891493261233808, 2.889326103659816,
        -0.9998047405039262, -1.0852673405134285, -0.9273113485973092,
        Timestamp('2024-03-03 20:48:48')],
       [1.7211178151093174, 0.38913097418506404, 2.8895638721302737,
        -0.9998658863692697, -1.08517988193793, 1.200968323048062,
        Timestamp('2024-03-04 10:42:45')],
       [1.7212071915282814, 0.38929002431714294, 2.890094278718218,
        -1.0000063566004642, -1.085061250008788, -1.4244300128862366,
        Timestamp('2024-03-05 18:25:01')]], dtype=object)

In [115]:
fdf = labeled_data
fnp = []
fnpy = []
# fnp = np.array([])
id_list = fdf['NORAD_CAT_ID'].unique()


for id in id_list:
    fdf[fdf['NORAD_CAT_ID'] == id]
    segment_list = fdf[fdf['NORAD_CAT_ID'] == id]['segment_id'].unique()
    for segment in segment_list:
        fnp.append(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category', 'EPOCH']).to_numpy())
        fnpy.append(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)]['maneuver_category'].to_numpy())
        # fnp = np.append(fnp, fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category', 'EPOCH']).to_numpy())

# print(fnp)

In [119]:
fnp = np.array(fnp)
np.save('sat_data.npy', fnp)
fnpy = np.array(fnpy)
np.save('sat_labels.npy', fnpy)
