In [16]:
import pandas as pd
import numpy as np

import json

In [17]:
# load data
df = pd.read_hdf('sat-data.h5')
df

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
0,41469,2024-03-29 23:14:42,42162.865,0.0005158,3.2631,122.0675,309.1062,234.7614
1,22787,2024-03-29 22:56:42,42164.272,0.0005063,11.6348,7.7743,18.0968,174.8006
2,22787,2024-03-29 22:56:42,42164.272,0.0005063,11.6348,7.7743,18.0968,174.8006
3,41469,2024-03-29 21:49:27,42162.856,0.0005158,3.263,122.0702,309.1057,213.3868
4,38704,2024-03-29 21:39:33,41632.950,0.0210129,2.2449,62.3939,214.7218,144.2019
...,...,...,...,...,...,...,...,...
5574,44479,2024-01-01 04:10:42,42163.982,0.0001534,0.0196,29.8835,262.5128,247.6115
5575,22787,2024-01-01 04:02:42,42164.504,0.0003773,11.6084,8.2266,273.8876,267.586
5576,22988,2024-01-01 02:59:17,42163.851,0.000334,16.2451,37.2131,254.7122,123.1842
5577,38699,2024-01-01 01:20:33,42002.568,0.0098973,1.4331,334.06,343.2008,183.0781


In [18]:
convert_dict = {
    "NORAD_CAT_ID": "object",
    "EPOCH": "datetime64",
    "SEMIMAJOR_AXIS": "float64",
    "ECCENTRICITY": "float64",
    "INCLINATION": "float64",
    "RA_OF_ASC_NODE": "float64",
    "ARG_OF_PERICENTER": "float64",
    "MEAN_ANOMALY": "float64"
}

# Convert columns using the dictionary
df = df.astype(convert_dict, errors='ignore')

# Print the data types of all columns
print(df.dtypes)

NORAD_CAT_ID                 object
EPOCH                datetime64[ns]
SEMIMAJOR_AXIS              float64
ECCENTRICITY                float64
INCLINATION                 float64
RA_OF_ASC_NODE              float64
ARG_OF_PERICENTER           float64
MEAN_ANOMALY                float64
dtype: object


In [19]:
df[df['NORAD_CAT_ID'] == '8832']

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
8,8832,2024-03-29 21:34:21,42738.661,0.013807,20.1599,52.9431,74.0489,287.1113
43,8832,2024-03-29 09:34:28,42738.651,0.013807,20.1588,52.9495,74.0388,110.2720
44,8832,2024-03-29 09:34:28,42738.651,0.013807,20.1588,52.9495,74.0388,110.2720
69,8832,2024-03-28 21:32:56,42738.632,0.013805,20.1577,52.9558,74.0287,293.0323
105,8832,2024-03-28 08:33:27,42738.600,0.013805,20.1565,52.9626,74.0186,101.5567
...,...,...,...,...,...,...,...,...
5469,8832,2024-01-03 09:45:43,42737.785,0.013755,19.9694,54.0533,72.5120,291.8799
5470,8832,2024-01-03 09:45:43,42737.785,0.013755,19.9694,54.0533,72.5120,291.8799
5502,8832,2024-01-02 10:03:10,42737.837,0.013758,19.9672,54.0659,72.4907,302.4314
5562,8832,2024-01-01 08:37:28,42737.907,0.013760,19.9649,54.0796,72.4644,287.6459


In [20]:
def deduplicate_by_date(
    data: pd.DataFrame,
    group_by_id_col: str,
    date_col: str,
    sort_ascending: bool = False,
) -> pd.DataFrame:
    """
    Deduplicates data by keeping the most recent entry for each group based on date.

    Args:
        data: A pandas DataFrame containing the data.
        group_by_id_col: The column to group the data by (e.g., satellite ID).
        date_col: The column containing the date information for sorting and deduplication.
        sort_ascending: Whether to sort in ascending order (False for most recent first).

    Returns:
        A pandas DataFrame containing the deduplicated data (copy of the input).
    """

    # Operate on a copy to avoid modifying original data
    data_copy = data.copy()

    # Extract the date part from the date column (assuming format YYYY-MM-DD)
    data_copy['date'] = pd.to_datetime(data_copy[date_col]).dt.date

    return (
        data_copy.groupby([group_by_id_col, 'date'])
        .apply(lambda x: x.sort_values(date_col, ascending=sort_ascending).head(1))
        .drop(columns='date')
        .reset_index(drop=True)  # Restructure for consistency
    )

# Example usage
deduplicated_data = deduplicate_by_date(df.copy(), 'NORAD_CAT_ID', 'EPOCH', sort_ascending=False)


In [21]:
deduplicated_data[deduplicated_data['NORAD_CAT_ID'] == '8832']

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
2236,8832,2024-01-01 08:37:28,42737.907,0.013760,19.9649,54.0796,72.4644,287.6459
2237,8832,2024-01-02 10:03:10,42737.837,0.013758,19.9672,54.0659,72.4907,302.4314
2238,8832,2024-01-03 09:45:43,42737.785,0.013755,19.9694,54.0533,72.5120,291.8799
2239,8832,2024-01-04 11:02:49,42737.772,0.013752,19.9717,54.0397,72.5332,304.5577
2240,8832,2024-01-05 09:53:06,42737.781,0.013749,19.9738,54.0275,72.5494,281.1741
...,...,...,...,...,...,...,...,...
2310,8832,2024-03-25 19:46:07,42738.416,0.013796,20.1508,52.9946,73.9679,285.6073
2311,8832,2024-03-26 19:47:34,42738.505,0.013800,20.1531,52.9820,73.9858,279.6966
2312,8832,2024-03-27 08:31:42,42738.543,0.013802,20.1542,52.9753,74.0003,107.3985
2313,8832,2024-03-28 21:32:56,42738.632,0.013805,20.1577,52.9558,74.0287,293.0323


In [22]:
deduplicated_data

Unnamed: 0,NORAD_CAT_ID,EPOCH,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY
0,20253,2024-01-01 17:28:57,42166.644,0.000549,12.6863,356.5900,292.6967,199.4844
1,20253,2024-01-02 16:54:16,42166.784,0.000552,12.6860,356.5841,293.1786,191.2716
2,20253,2024-01-03 12:20:46,42166.901,0.000555,12.6857,356.5792,293.7078,123.1443
3,20253,2024-01-04 15:39:35,42164.496,0.000573,12.6853,356.5726,309.2955,158.3972
4,20253,2024-01-05 14:01:16,42162.399,0.000599,12.6849,356.5665,305.9920,138.0717
...,...,...,...,...,...,...,...,...
2310,8832,2024-03-25 19:46:07,42738.416,0.013796,20.1508,52.9946,73.9679,285.6073
2311,8832,2024-03-26 19:47:34,42738.505,0.013800,20.1531,52.9820,73.9858,279.6966
2312,8832,2024-03-27 08:31:42,42738.543,0.013802,20.1542,52.9753,74.0003,107.3985
2313,8832,2024-03-28 21:32:56,42738.632,0.013805,20.1577,52.9558,74.0287,293.0323


In [23]:
def reshape_by_ws(data: pd.DataFrame, w: int = 20, step: int = 1) -> pd.DataFrame:
  """
  Reshapes time series data into segments using the Windowing Sliding (WS) method.

  Args:
    data: A pandas DataFrame containing the time series data with columns:
      - NORAD_CAT_ID: The NORAD Catalog ID (string).
      - EPOCH: The epoch time (datetime).
      - SEMIMAJOR_AXIS, ECCENTRICITY, INCLINATION, RA_OF_ASC_NODE, ARG_OF_PERICENTER, MEAN_ANOMALY: Orbital elements (numerical).
    w: The slicing width (window size) of each segment.
    step: The slicing step length for the window sliding (default=1).

  Returns:
    A pandas DataFrame containing the reshaped data with segments as rows. Each segment
    contains columns for NORAD_CAT_ID, all orbital elements, and a segment ID.

  Raises:
    ValueError: If the slicing width (w) is less than 1.
  """

  if w < 1:
      raise ValueError("Slicing width (w) must be greater than or equal to 1")

  def segment_data(group):
    """Segments data for a single NORAD_CAT_ID group."""
    # Sort by epoch (ensure data is ordered chronologically)
    group = group.sort_values(by='EPOCH')

    n = len(group)
    # Calculate number of segments (s) using integer division
    s = n // w

    # Create list to store segments with segment ID
    segments = []
    for i in range(s):
      start_idx = i * step
      end_idx = min(start_idx + w, n)

      # Extract segment data and create segment ID
      segment_id = i
      segment = group[['NORAD_CAT_ID'] + list(group.columns[1:])].iloc[start_idx:end_idx]
      segment['segment_id'] = segment_id

      segments.append(segment)

    segment.groupby('segment_id')
    
    return pd.concat(segments)

  # 1. Group data by NORAD_CAT_ID
  grouped_data = data.groupby('NORAD_CAT_ID')
  # 1.1. Filter groups with less than window size (w) data points
  filtered_data = grouped_data.filter(lambda x: len(x) >= w)

  # 2. group the filtered data
  grouped_filtered_data = filtered_data.groupby('NORAD_CAT_ID')
  # 3. Apply segment_data function to each filtered group, grouping by segment ID
  reshaped_data = grouped_filtered_data.apply(segment_data).reset_index(drop=True)

  # drop segment_id column
  return reshaped_data

reshaped_data = reshape_by_ws(deduplicated_data, w=20, step=1)


In [24]:
reshaped_data.dtypes

NORAD_CAT_ID                 object
EPOCH                datetime64[ns]
SEMIMAJOR_AXIS              float64
ECCENTRICITY                float64
INCLINATION                 float64
RA_OF_ASC_NODE              float64
ARG_OF_PERICENTER           float64
MEAN_ANOMALY                float64
segment_id                    int64
dtype: object

In [25]:
def normalize_data(data: pd.DataFrame) -> pd.DataFrame:
  """
  Normalizes the segment_vector column in a DataFrame using z-score normalization.

  Args:
    data: A pandas DataFrame containing columns:
      - segment_id: The segment ID (string).
      - segment_vector: A vector containing the six orbital elements (numpy array).

  Returns:
    A pandas DataFrame with the same columns but the segment_vector normalized.
  """

  # Get the segment vectors
  segment_vectors = data[['SEMIMAJOR_AXIS', 'ECCENTRICITY', 'INCLINATION', 'RA_OF_ASC_NODE', 'ARG_OF_PERICENTER', 'MEAN_ANOMALY']]

  # Normalize each vector using z-score (mean 0, standard deviation 1)
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler()
  normalized_vectors = scaler.fit_transform(segment_vectors)
  normalized_vectors = pd.DataFrame(normalized_vectors, columns=segment_vectors.columns)
  print(normalized_vectors)

  # Replace the segment_vector column with the normalized vectors
  # data = data.join(normalized_vectors[normalized_vectors.columns.difference(data.columns)])
  data = pd.concat([normalized_vectors, data[data.columns.difference(normalized_vectors.columns)]], axis=1)


  return data

normalized_data = normalize_data(reshaped_data.copy())

      SEMIMAJOR_AXIS  ECCENTRICITY  INCLINATION  RA_OF_ASC_NODE  \
0           0.265107     -0.414681     1.536091        1.521150   
1           0.265474     -0.414472     1.536035        1.521102   
2           0.265780     -0.414277     1.535980        1.521063   
3           0.259482     -0.412929     1.535906        1.521010   
4           0.253990     -0.411084     1.535832        1.520962   
...              ...           ...          ...             ...   
2075        1.762742      0.535540     2.886073       -0.900753   
2076        1.762593      0.535489     2.886276       -0.900805   
2077        1.762105      0.535518     2.886903       -0.900966   
2078        1.761796      0.535489     2.887309       -0.901072   
2079        1.761532      0.535453     2.887714       -0.901176   

      ARG_OF_PERICENTER  MEAN_ANOMALY  
0              0.775620      0.193786  
1              0.781192      0.097033  
2              0.787310     -0.705556  
3              0.967533     -0.2902

In [26]:
def label_data(data: pd.DataFrame) -> pd.DataFrame:
  """
  Labels data segments with one-hot encoded maneuver categories based on NORAD_CAT_ID.

  Args:
    data: A pandas DataFrame containing columns:
      - NORAD_CAT_ID: The NORAD Catalog ID (string).
      - segment_id: The segment ID (string).
      - segment_vector: A vector containing the six normalized orbital elements (numpy array).

  Returns:
    A pandas DataFrame with additional columns:
      - maneuver_category: The maneuver category string (e.g., "EW").
      - maneuver_label: The one-hot encoded maneuver category (list of 3 integers).
  """
  # import json
  # Load the maneuver categories from a JSON file
  with open('category-map.json', 'r') as f:
    MANEUVER_CATEGORIES = json.load(f)
  cat_to_np = {
    "EW": np.array([1, 0, 0]),
    "EW-NS": np.array([0, 1, 0]),
    "NM": np.array([0, 0, 1]),
  }
  category_map = {}
  
  for key, value in MANEUVER_CATEGORIES.items():
    for cat_id in value:
      category_map[cat_id] = cat_to_np[key]
    
  # Assuming you have a function (or external data) to get maneuver categories by NORAD_CAT_ID
  def get_maneuver_category(cat_id):



    return category_map.get(cat_id, 'Unknown')

  data['maneuver_category'] = data['NORAD_CAT_ID'].apply(get_maneuver_category)

  return data

labeled_data = label_data(normalized_data.copy())

In [27]:
labeled_data

Unnamed: 0,SEMIMAJOR_AXIS,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPOCH,NORAD_CAT_ID,segment_id,maneuver_category
0,0.265107,-0.414681,1.536091,1.521150,0.775620,0.193786,2024-01-01 17:28:57,20253,0,"[1, 0, 0]"
1,0.265474,-0.414472,1.536035,1.521102,0.781192,0.097033,2024-01-02 16:54:16,20253,0,"[1, 0, 0]"
2,0.265780,-0.414277,1.535980,1.521063,0.787310,-0.705556,2024-01-03 12:20:46,20253,0,"[1, 0, 0]"
3,0.259482,-0.412929,1.535906,1.521010,0.967533,-0.290252,2024-01-04 15:39:35,20253,0,"[1, 0, 0]"
4,0.253990,-0.411084,1.535832,1.520962,0.929338,-0.529701,2024-01-05 14:01:16,20253,0,"[1, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...
2075,1.762742,0.535540,2.886073,-0.900753,-1.767476,1.184962,2024-01-20 16:25:26,8832,2,"[0, 0, 1]"
2076,1.762593,0.535489,2.886276,-0.900805,-1.767380,-0.967684,2024-01-21 04:27:06,8832,2,"[0, 0, 1]"
2077,1.762105,0.535518,2.886903,-0.900966,-1.766965,1.260792,2024-01-22 17:42:42,8832,2,"[0, 0, 1]"
2078,1.761796,0.535489,2.887309,-0.901072,-1.766726,1.275462,2024-01-23 18:13:17,8832,2,"[0, 0, 1]"


In [28]:
labeled_data[(labeled_data['NORAD_CAT_ID'] == '8832') & (labeled_data['segment_id'] == 0)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category']).to_numpy()

array([[1.7611914653071163, 0.5376158029995421, 2.878419627563918,
        -0.8987623996582759, -1.7706739661808288, 1.232392139241306,
        Timestamp('2024-01-01 08:37:28')],
       [1.7610081418847534, 0.5374283898950263, 2.8788437965845404,
        -0.8988719919050209, -1.7703698894006352, 1.4065760902309579,
        Timestamp('2024-01-02 10:03:10')],
       [1.760871958771008, 0.5372265603978553, 2.8792495234738325,
        -0.89897278477429, -1.7701236218942424, 1.2822717407759305,
        Timestamp('2024-01-03 09:45:43')],
       [1.7608379129925524, 0.5370175227043569, 2.8796736924944555,
        -0.8990815770776281, -1.769878510573326, 1.4316254517668747,
        Timestamp('2024-01-04 11:02:49')],
       [1.7608614831468707, 0.5367940686182034, 2.880060977252416,
        -0.8991791701732699, -1.7696912085262102, 1.1561496250202905,
        Timestamp('2024-01-05 09:53:06')],
       [1.7609924284485476, 0.536585030924705, 2.8804667041417074,
        -0.8992863625897942, -1.769

In [29]:
fdf = labeled_data
fnp = []
fnpy = []
# fnp = np.array([])
id_list = fdf['NORAD_CAT_ID'].unique()


for id in id_list:
    fdf[fdf['NORAD_CAT_ID'] == id]
    segment_list = fdf[fdf['NORAD_CAT_ID'] == id]['segment_id'].unique()
    for segment in segment_list:
        fnp.append(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category', 'EPOCH']).to_numpy())
        fnpy.append(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)]['maneuver_category'].to_numpy() )
        print(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)]['maneuver_category'].to_numpy())
        
        
        # print(fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)]['maneuver_category'])
        # fnp = np.append(fnp, fdf[(fdf['NORAD_CAT_ID'] == id) & (fdf['segment_id'] == segment)].drop(columns=['NORAD_CAT_ID', 'segment_id', 'maneuver_category', 'EPOCH']).to_numpy())

# print(fnp)

[array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])]
[array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])]
[array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0]) array([1, 0, 0])
 array([1, 0, 0]) array([1, 0, 0

In [31]:
fnp = np.array(fnp)
np.save('sat-data.npy', fnp)
fnpy = np.array(fnpy)
np.save('sat-labels.npy', fnpy)
