In [1]:
from astropy.coordinates.angle_utilities import angular_separation
from astropy.coordinates import SkyCoord

from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import astropy.units as u

In [2]:
def linear_interpolate_with_gaps(df, time_column, position_columns, time_threshold):
    """
    Linearly interpolate the position data in a DataFrame, handling gaps based on the previous segment's linear spacing.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing time and position data.
    time_column (str): The column name for time.
    position_columns (list): The list of column names for position (e.g., ['position_x', 'position_y', 'position_z']).
    gap_threshold (float): The threshold to identify gaps in time data.
    
    Returns:
    pd.DataFrame: DataFrame with interpolated position data.
    """
    
    # Sort the DataFrame by time
    df = df.sort_values(by=time_column).reset_index(drop=True)
    
    # Calculate time differences between consecutive rows
    df['time_diff'] = df[time_column].diff().fillna(pd.Timedelta(seconds=0))
    
    # Identify groups based on time threshold
    df['group'] = (df['time_diff'] > pd.Timedelta(seconds=time_threshold)).cumsum()
    
    # List to store interpolated DataFrames
    interpolated_dfs = []
    
    # Iterate over each group
    for group, group_df in df.groupby('group'):
        # Interpolate each position column within the group
        for pos_col in position_columns:
            group_df[pos_col] = group_df[pos_col].interpolate()
        
        # Append the interpolated DataFrame for the current group
        interpolated_dfs.append(group_df)
    
    # Concatenate all interpolated DataFrames
    interpolated_df = pd.concat(interpolated_dfs).reset_index(drop=True)
    
    # Drop the helper columns
    interpolated_df = interpolated_df.drop(columns=['time_diff', 'group'])
    
    # Replace NaN values with 0
    interpolated_df = interpolated_df.fillna(0)
    
    return interpolated_df

In [3]:
# Load the file to inspect its contents
directory = r'C:\Users\adamf\Documents\PhD\Diffraction\RV-DAT\\'#the first coarse sample wasnt read in!
csv_files = glob.glob(os.path.join(directory, '*.csv'))
rawData_list = [pd.read_csv(file) for file in csv_files]
rawData = pd.concat(rawData_list, ignore_index=True)
data = rawData
# Forward fill missing values to ensure each timestamp has the most recent value for each position
#posCols = ['position_x','position_y','position_z','earth_unit_vector_x','earth_unit_vector_y','earth_unit_vector_z','right_ascension','declination']
#for posCol in posCols:
    #data[posCol] = rawData[posCol].ffill()
#instead of forward fill I'm going to linearly interpolate
data['time'] = pd.to_datetime(data['time'])
#data = linear_interpolate_with_gaps(data,'time',['position_x','position_y','position_z'],30)

In [4]:
data = data[data['frequency_band']!=0]
data.sort_values('time',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.sort_values('time',inplace=True)


In [5]:
dataFill = data.copy()

In [6]:
dataFill[['position_x', 'position_y', 'position_z','earth_unit_vector_x','earth_unit_vector_y',
     'earth_unit_vector_z','right_ascension'
    ,'declination']]=data[['position_x', 'position_y', 'position_z','earth_unit_vector_x','earth_unit_vector_y',
     'earth_unit_vector_z','right_ascension'
    ,'declination']].fillna(0)

In [7]:
print(len(data))
bounds = {
    'position_x': {'lower': -4000, 'upper': 4000},
    'position_y': {'lower': -4000, 'upper': 4000},
    'position_z': {'lower': -3000, 'upper': 3000}
}
mask = (dataFill['position_x'] >= bounds['position_x']['lower']) & (dataFill['position_x'] <= bounds['position_x']['upper']) & \
       (dataFill['position_y'] >= bounds['position_y']['lower']) & (dataFill['position_y'] <= bounds['position_y']['upper']) & \
       (dataFill['position_z'] >= bounds['position_z']['lower']) & (dataFill['position_z'] <= bounds['position_z']['upper'])
data = data[mask]
print(len(data))

16006688
16006684


In [8]:
interpolated_df = linear_interpolate_with_gaps(data, 'time', ['position_x', 'position_y', 'position_z','earth_unit_vector_x'
                                                              ,'earth_unit_vector_y','earth_unit_vector_z','right_ascension'
                                                             ,'declination'], 120)

In [10]:
print(len(interpolated_df))
df_Clipped = pd.DataFrame()
df_Clipped2 = pd.DataFrame()
mask = interpolated_df['rv1_coarse']<1e10
df_Limited = interpolated_df[mask]
mask = df_Limited['rv1_coarse']!=0
df_Limited = df_Limited[mask]
mode = []
modeCount = []
for freq in df_Limited['frequency_band'].unique():
    freq_df = df_Limited[df_Limited['frequency_band'] == freq]
    
    # Calculate mean and standard deviation for rv1_coarse
    mean_rv1 = freq_df['rv1_coarse'].mean()
    std_rv1 = freq_df['rv1_coarse'].std()
    # Filter out values outside 4 standard deviations from the mean
    filtered_freq_df = freq_df[(freq_df['rv1_coarse'] >= mean_rv1 - 3 * std_rv1) & (freq_df['rv1_coarse'] <= mean_rv1 + 3 * std_rv1)]
    
    # Append the filtered data to the final DataFrame
    df_Clipped = pd.concat([df_Clipped, filtered_freq_df])
print(len(df_Clipped))
for freq in df_Limited['frequency_band'].unique():
    freq_df = df_Clipped[df_Clipped['frequency_band'] == freq]
    
    # Calculate mean and standard deviation for rv1_coarse
    mean_rv1 = freq_df['rv1_coarse'].mean()
    std_rv1 = freq_df['rv1_coarse'].std()
    # Filter out values outside 4 standard deviations from the mean
    filtered_freq_df = freq_df[(freq_df['rv1_coarse'] >= mean_rv1 - 3 * std_rv1) & (freq_df['rv1_coarse'] <= mean_rv1 + 3 * std_rv1)]
    
    # Append the filtered data to the final DataFrame
    df_Clipped2 = pd.concat([df_Clipped2, filtered_freq_df])
#df_orbitsClipped.head()
df_Clipped2.sort_values('time',inplace=True)
print(len(df_Clipped2))

16006684
13601034
13423395


In [12]:
df_Clipped2.head(15)

Unnamed: 0,time,frequency_band,position_x,position_y,position_z,earth_unit_vector_x,earth_unit_vector_y,earth_unit_vector_z,right_ascension,declination,rv1_coarse,rv2_coarse,rv1_fine,rv2_fine,rv_temp
1,1973-07-12 00:00:12.304,1,2545.709503,762.504211,-893.179993,0.117721,0.901642,0.416154,0.860565,-10.151745,4044755.0,246994100.0,0.0,0.0,1092343562
2,1973-07-12 00:00:14.229,1,2544.656067,764.381226,-894.525269,0.117716,0.901642,0.416153,0.863424,-10.181622,4197954.0,324162600.0,0.0,0.0,1092343562
3,1973-07-12 00:00:16.154,1,2543.602631,766.25824,-895.870544,0.117712,0.901643,0.416153,0.866283,-10.211499,4120635.0,246994100.0,0.0,0.0,1092343562
4,1973-07-12 00:00:18.079,1,2542.549194,768.135254,-897.21582,0.117707,0.901644,0.416153,0.869142,-10.241377,4120635.0,324162600.0,1475682000000.0,370742528.0,1092343562
5,1973-07-12 00:00:20.004,1,2541.495758,770.012268,-898.561096,0.117702,0.901644,0.416153,0.872001,-10.271254,4197954.0,558899200.0,0.0,0.0,1092343562
6,1973-07-12 00:00:21.929,1,2540.442322,771.889282,-899.906372,0.117698,0.901645,0.416153,0.87486,-10.301132,4197954.0,372705000.0,0.0,0.0,1092343562
7,1973-07-12 00:00:23.854,1,2539.388885,773.766296,-901.251648,0.117693,0.901646,0.416153,0.877719,-10.331009,4276719.0,431903500.0,0.0,0.0,1092343562
9,1973-07-12 00:00:27.663,9,2537.282013,777.520355,-903.942169,0.117684,0.901647,0.416152,0.883437,-10.390764,257156.1,145438.2,0.0,0.0,1092343562
10,1973-07-12 00:00:29.588,9,2536.228577,779.3974,-905.287415,0.11768,0.901648,0.416152,0.886296,-10.420641,239034.5,145438.2,0.0,0.0,1092343562
11,1973-07-12 00:00:31.513,9,2535.17514,781.274445,-906.63266,0.117675,0.901648,0.416152,0.889155,-10.450518,223919.4,183599.7,0.0,0.0,1092343562


In [13]:
df_Clipped2.to_csv('cleanedRAE2MasterFile.csv')

In [12]:
print(mode)
print(modeCount)

[0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64, 0    0.0
dtype: float64]
[1, 1, 1, 1, 1, 1, 1, 1, 1]


In [17]:
frequencies = df_Clipped['frequency_band'].unique()

In [20]:
%matplotlib qt
fig, axes = plt.subplots(len(frequencies), 1, figsize=(10, 5 * len(frequencies)), sharex=True)
for ax, freq in zip(axes, frequencies):
    freq_df = df_Clipped[df_Clipped['frequency_band'] == freq]
    
    ax.scatter(freq_df['time'], freq_df['rv1_coarse'], marker='o', linestyle='-')
    ax.set_title(f'Frequency: {freq}')
    ax.set_xlabel('Time since start of period (s)')
    
    #ax.set_xlabel('Time Difference modulo orbital period (seconds)')
    ax.set_ylabel('rv1_coarse')
    ax.grid(True)
    #zero_window_diff_rows = freq_df[freq_df['window_diff'] == 0.0]
    #for _, row in zero_window_diff_rows.iterrows():
        #ax.axvline(x=row['angularSepFornaxA'], color='r', linestyle='--', linewidth=1)
#     diffraction_true_rows = freq_df[freq_df['diffraction'] == True]
#     for _, row in diffraction_true_rows.iterrows():
#         first_time = row['first_time_in_group']
#         color = color_map[first_time]
#         ax.axvspan(row['period_diff'] - 2, row['period_diff'] + 2, color=color, alpha=0.1)
    #avgFreq = average_values.loc[average_values['frequency_band'] == freq, 'avg_rv1_coarse'].values[0]
    #ax.axhline(y=avgFreq,linewidth=2)
    #ax.grid(True)
    #ax.set_yscale('log')
plt.tight_layout()
plt.subplots_adjust(hspace=0.4)
