### Part 3 of OCB Modeling
Here we create the unified dataset: the geomagnetic/solar-wind indices are brought to 1 min/5 min resolution, and all the dayside and nightside files are combined into a single dataframe for analysis.
- Author: Arnav Singh

In [2]:
"""
Imports for Data Preprocessing
"""
import pandas as pd
import os
from datetime import datetime

### Data Extraction
Older nightside data from 1983 to 2009, provided by Tom Sotirelis (JHU APL).

In [3]:
# Load the older nightside boundary crossings (OCB and B2I) from CSV.
df = pd.read_csv('data/allOCB_and_b2i.csv')

# Give the generic "1"/"2" columns descriptive OCB / B2I names.
column_renames = {
    'Date1(UTC)': 'Date(UTC)_OCB',
    'MagLat1': 'MagLat_OCB',
    'Date2(UTC)': 'Date(UTC)_B2I',
    'MagLat2': 'MagLat_B2I',
    'MLT1': 'MLT_OCB',
    'MLT2': 'MLT_B2I',
}
df = df.rename(columns=column_renames)

# Move MLT_OCB to column position 2 so it sits next to MagLat_OCB.
df.insert(2, 'MLT_OCB', df.pop('MLT_OCB'))

# Parse the OCB timestamp and use it as the index; the raw date strings
# (OCB and B2I) are no longer needed afterwards.
df['Date_UTC'] = pd.to_datetime(df['Date(UTC)_OCB'], format='%Y-%m-%d %H:%M:%S')
df = df.drop(columns=['Date(UTC)_OCB', 'Date(UTC)_B2I']).set_index('Date_UTC')

print(df.shape)
df.head()

(399528, 4)


Unnamed: 0_level_0,MagLat_OCB,MLT_OCB,MagLat_B2I,MLT_B2I
Date_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1983-12-01 00:30:42,-78.71,18.37,-67.17,18.26
1983-12-01 00:38:35,-75.03,5.9,-69.69,5.97
1983-12-01 01:20:21,78.5,4.38,67.15,5.15
1983-12-01 01:28:52,73.51,18.94,65.74,18.58
1983-12-01 02:10:01,-75.96,19.53,-65.79,18.99


Data from the OMNI 1-hour index file. The goal is to replace this with 1 min/5 min resolution data.

In [4]:
# Column names for the OMNI hourly file, in file order.
omni_columns = [
    'Date', 'TIME_AT_CENTER_OF_HOUR',
    '1AU_IP_BX,_GSE', '1AU_IP_BY,_GSE', '1AU_IP_BZ,_GSE',
    '1AU_IP_PLASMA_SPEED', '3-H_KP*10', '1-H_DST', '1-H_AE',
    '1-H_AL-INDEX', 'AU-INDEX', 'PROTON_QI',
]

# Read the data into a DataFrame.
df3 = pd.read_csv(
    'OMNI2_H0_MRG1HR_729923.txt',
    sep=r'\s+',        # Any run of whitespace; raw string avoids the invalid '\s' escape warning
    comment='#',       # Skip rows that start with #
    low_memory=False,
    names=omni_columns,
)

# Drop the first two parsed rows.
# NOTE(review): presumably these are the file's own header lines, which are
# read as data because `names=` is supplied — confirm against the file. If so,
# the remaining columns may be loaded as strings rather than numbers.
df3 = df3.drop(index=[0, 1])

# Combine the date and time columns into a single string and parse it.
df3['datetime_str'] = df3['Date'] + ' ' + df3['TIME_AT_CENTER_OF_HOUR']
df3['Date_UTC'] = pd.to_datetime(df3['datetime_str'], format='%d-%m-%Y %H:%M:%S.%f')

# Drop the raw date/time columns and index by the parsed timestamp.
df3 = df3.drop(columns=['Date', 'TIME_AT_CENTER_OF_HOUR', 'datetime_str'])
df3.set_index('Date_UTC', inplace=True)

# NOTE(review): values such as 999.9 / 9999.0 / 9.9999 in the preview look like
# missing-data fill values and are NOT converted to NaN here — confirm before
# using these columns in any statistics.
print(df3.shape)
df3.head()

(228671, 10)


Unnamed: 0_level_0,"1AU_IP_BX,_GSE","1AU_IP_BY,_GSE","1AU_IP_BZ,_GSE",1AU_IP_PLASMA_SPEED,3-H_KP*10,1-H_DST,1-H_AE,1-H_AL-INDEX,AU-INDEX,PROTON_QI
Date_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1983-12-01 00:30:00,999.9,999.9,999.9,9999.0,33,-17,206,-126,79,9.9999
1983-12-01 01:30:00,999.9,999.9,999.9,9999.0,33,-19,309,-178,130,9.9999
1983-12-01 02:30:00,999.9,999.9,999.9,9999.0,33,-21,293,-139,154,9.9999
1983-12-01 03:30:00,999.9,999.9,999.9,9999.0,33,-20,205,-82,121,9.9999
1983-12-01 04:30:00,999.9,999.9,999.9,9999.0,33,-16,187,-89,97,9.9999


Dayside data from 1983-2012. Provided by Tom Sotirelis (JHU APL)

In [5]:
# Define folder paths
folder_paths = ["dayside"]

# Function to convert date columns to datetime
def convert_to_datetime(df, year, date_columns=('EQB_Date', 'SUB_Date', 'OCB_Date')):
    """Convert seconds-offset columns into absolute timestamps, in place.

    Each column in ``date_columns`` is interpreted as a number of seconds and
    added to ``year`` (negative offsets therefore land before ``year``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to convert. It is modified in place.
    year : datetime-like
        Reference instant the offsets are added to; anything accepted by
        ``pd.Timestamp``.
    date_columns : iterable of str, optional
        Columns to convert. Defaults to the three dayside date columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    year_start = pd.Timestamp(year)  # build once instead of once per column
    for col in date_columns:
        df[col] = pd.to_timedelta(df[col], unit='s') + year_start
    return df

# List to accumulate DataFrames
all_dataframes = []

# Column layout of a dayside file: three groups of seven columns
# (prefixes EQB, SUB and OCB), each starting with a date column.
dayside_columns = [
    'EQB_Date', 'EQB_glat', 'EQB_glon', 'EQB_mlat', 'EQB_mlon', 'EQB_mlt', 'sc1',
    'SUB_Date', 'SUB_glat', 'SUB_glon', 'SUB_mlat', 'SUB_mlon', 'SUB_mlt', 'sc2',
    'OCB_Date', 'OCB_glat', 'OCB_glon', 'OCB_mlat', 'OCB_mlon', 'OCB_mlt', 'sc3',
]

for folder_path in folder_paths:
    # os.listdir() returns entries in arbitrary order; sort so that the
    # concatenation order (and therefore the row labels) is reproducible.
    files = sorted(os.listdir(folder_path))

    for file_name in files:
        # Only files named like d_<year>.txt are dayside data files.
        if not (file_name.startswith("d_") and file_name.endswith(".txt")):
            continue
        year_str = file_name.split("_")[1].split(".")[0]
        if not year_str.isdigit():
            continue

        # Date columns are converted relative to Jan 1 of the file's year.
        year = datetime(int(year_str), 1, 1)

        # Read the file using whitespace as a delimiter.
        # sep=r'\s+' replaces delim_whitespace=True, which is deprecated
        # in recent pandas releases.
        file_path = os.path.join(folder_path, file_name)
        dayside_df = pd.read_csv(
            file_path,
            sep=r'\s+',
            low_memory=False,
            names=dayside_columns,
        )

        # Convert the date columns to datetime
        dayside_df = convert_to_datetime(dayside_df, year)

        # Append the DataFrame to the list
        all_dataframes.append(dayside_df)

# Concatenate all DataFrames into a single DataFrame
# (pd.concat raises ValueError if no matching file was found).
combined_dayside_df = pd.concat(all_dataframes, ignore_index=True)

combined_dayside_df.sort_values(by='EQB_Date', inplace=True)
combined_dayside_df['Dayside'] = 1  # flag every row as a dayside observation
print(combined_dayside_df.shape)
combined_dayside_df.describe()

(631219, 22)


Unnamed: 0,EQB_Date,EQB_glat,EQB_glon,EQB_mlat,EQB_mlon,EQB_mlt,sc1,SUB_Date,SUB_glat,SUB_glon,...,SUB_mlt,sc2,OCB_Date,OCB_glat,OCB_glon,OCB_mlat,OCB_mlon,OCB_mlt,sc3,Dayside
count,631219,631219.0,631219.0,631219.0,631219.0,631219.0,631219.0,631219,631219.0,631219.0,...,631219.0,631219.0,631219,631219.0,631219.0,631219.0,631219.0,631219.0,631219.0,631219.0
mean,2000-09-22 06:25:29.615132288,5.745049,118.062761,5.224109,-8.692862,7.560731,12.902218,2000-10-03 20:16:05.524951040,4.716075,125.773612,...,8.438406,12.902218,2000-10-31 11:45:18.831916928,5.003322,158.737878,4.075263,-4.473903,10.377233,12.902218,1.0
min,1982-12-31 23:59:59,-81.45,0.0,-87.92,-180.0,0.0,6.0,1982-12-31 23:59:59,-81.52,0.0,...,0.0,6.0,1982-12-31 23:59:59,-81.52,0.0,-89.83,-180.0,0.0,6.0,1.0
25%,1994-12-31 23:59:59,-51.58,0.0,-62.07,-66.35,0.0,11.0,1994-12-31 23:59:59,-60.3,0.0,...,0.0,11.0,1995-01-05 20:57:25,-71.84,77.66,-77.17,-81.54,6.46,11.0,1.0
50%,2001-12-31 23:59:59,0.0,106.73,0.0,0.0,7.16,13.0,2001-12-31 23:59:59,0.0,123.52,...,7.91,13.0,2001-12-31 23:59:59,0.0,163.32,0.0,0.0,9.65,13.0,1.0
75%,2006-12-31 23:59:59,62.03,210.64,65.06,30.13,10.42,15.0,2007-02-04 22:26:03.500000,68.53,210.37,...,13.6,15.0,2007-04-09 23:05:38,76.22,232.54,77.43,69.38,15.96,15.0,1.0
max,2012-12-31 23:27:06,81.52,360.0,87.38,180.0,23.91,18.0,2012-12-31 23:24:08,81.52,360.0,...,23.91,18.0,2012-12-31 23:22:24,81.52,360.0,89.9,180.0,23.97,18.0,1.0
std,,52.681822,111.279379,54.521206,86.603911,6.592322,3.204782,,60.011571,108.036961,...,6.500874,3.204782,,70.502675,101.655735,73.572601,92.370944,5.757561,3.204782,0.0


In [6]:
# Keep only the OCB timestamp, magnetic latitude and MLT columns.
ocb_dayside_df = combined_dayside_df[['OCB_Date', 'OCB_mlat', 'OCB_mlt']].copy()
print(ocb_dayside_df.shape)

# Drop every row whose OCB timestamp is exactly Dec 31 at 23:59:59.
# NOTE(review): these appear to be missing-value placeholders (one second
# before the start of the file's year) rather than real NaN values — confirm.
ocb_stamp = ocb_dayside_df['OCB_Date'].dt
is_placeholder = (
    (ocb_stamp.month == 12)
    & (ocb_stamp.day == 31)
    & (ocb_stamp.time == pd.Timestamp('23:59:59').time())
)
ocb_dayside_df = ocb_dayside_df[~is_placeholder]

print(ocb_dayside_df.shape)
ocb_dayside_df.describe()

(631219, 3)
(567277, 3)


Unnamed: 0,OCB_Date,OCB_mlat,OCB_mlt
count,567277,567277.0,567277.0
mean,2000-09-30 19:48:15.702638976,4.534616,11.546928
min,1983-12-01 00:38:06,-89.83,0.03
25%,1994-11-01 20:32:28,-77.73,7.54
50%,2001-11-19 10:43:16,71.78,10.29
75%,2007-04-13 15:39:54,77.92,16.46
max,2012-12-31 23:22:24,89.9,23.97
std,,77.594954,4.835249


New nightside data 

In [7]:
'''
Extracting the new nightside data for analysis
'''
folder_paths = ["nightside"]

# Function to convert date column(s) to datetime.
# NOTE: this redefines the name convert_to_datetime for the rest of the
# kernel session, replacing any earlier definition.
def convert_to_datetime(df, year, date_columns=('date',)):
    """Convert seconds-offset columns into absolute timestamps, in place.

    Each column in ``date_columns`` is interpreted as a number of seconds and
    added to ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to convert. It is modified in place.
    year : datetime-like
        Reference instant the offsets are added to; anything accepted by
        ``pd.Timestamp``.
    date_columns : iterable of str, optional
        Columns to convert. Defaults to the single column ``'date'``; pass
        e.g. ``[f'date{i}' for i in range(1, 8)]`` for the nightside layout.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    year_start = pd.Timestamp(year)
    for col in date_columns:
        df[col] = pd.to_timedelta(df[col], unit='s') + year_start
    return df

# List to accumulate DataFrames
all_dataframes = []

# A nightside file holds seven groups of seven columns; group i is
# date{i}, geo_lat{i}, geo_long{i}, mag_lat{i}, mag_long{i}, mlt{i}, index{i}.
nightside_fields = ['date', 'geo_lat', 'geo_long', 'mag_lat', 'mag_long', 'mlt', 'index']
nightside_columns = [
    f'{field}{group}' for group in range(1, 8) for field in nightside_fields
]

for folder_path in folder_paths:
    # os.listdir() returns entries in arbitrary order; sort so that the
    # concatenation order (and therefore the row labels) is reproducible.
    files = sorted(os.listdir(folder_path))

    for file_name in files:
        # Only files named like n_<year>.txt are nightside data files.
        if not (file_name.startswith("n_") and file_name.endswith(".txt")):
            continue
        year_str = file_name.split("_")[1].split(".")[0]
        if not year_str.isdigit():
            continue

        # Date columns are converted relative to Jan 1 of the file's year.
        year = datetime(int(year_str), 1, 1)
        year_start = pd.Timestamp(year)

        # Read the file using whitespace as a delimiter.
        # sep=r'\s+' replaces delim_whitespace=True, which is deprecated
        # in recent pandas releases.
        file_path = os.path.join(folder_path, file_name)
        nightside_df = pd.read_csv(
            file_path,
            sep=r'\s+',
            low_memory=False,
            names=nightside_columns,
        )

        # Convert all seven date columns from seconds offsets to datetime
        for i in range(1, 8):
            nightside_df[f'date{i}'] = (
                pd.to_timedelta(nightside_df[f'date{i}'], unit='s') + year_start
            )

        # Append the DataFrame to the list
        all_dataframes.append(nightside_df)

# Concatenate all DataFrames into a single DataFrame
# (pd.concat raises ValueError if no matching file was found).
combined_nightside_df = pd.concat(all_dataframes, ignore_index=True)

# Sort by the first date column
combined_nightside_df.sort_values(by='date1', inplace=True)

# Show the first rows of the combined frame
combined_nightside_df.head()

Unnamed: 0,date1,geo_lat1,geo_long1,mag_lat1,mag_long1,mlt1,index1,date2,geo_lat2,geo_long2,...,mag_long6,mlt6,index6,date7,geo_lat7,geo_long7,mag_lat7,mag_long7,mlt7,index7
76599,2009-01-01 00:41:23,-53.07,93.34,-67.03,150.1,4.85,16,2009-01-01 00:41:23,-53.07,93.34,...,134.33,3.8,16,2009-01-01 00:43:52,-61.48,88.12,-73.45,132.67,3.7,16
76600,2009-01-01 00:53:52,-78.21,328.96,-66.19,25.48,20.55,16,2009-01-01 00:53:52,-78.21,328.96,...,37.72,21.36,16,2009-01-01 00:49:34,-78.76,52.78,-75.67,57.7,22.69,16
76601,2009-01-01 01:34:38,60.76,255.92,70.46,-42.91,17.02,16,2009-01-01 01:34:55,61.71,255.19,...,-41.54,17.11,16,2009-01-01 01:38:11,72.29,242.28,78.38,-72.46,15.04,16
76602,2009-01-01 02:23:45,-54.51,67.1,-64.08,116.44,4.73,16,2009-01-01 02:23:45,-54.51,67.1,...,64.47,1.25,16,2009-01-01 02:30:38,-76.59,37.92,-72.47,58.14,0.84,16
76603,2009-01-01 02:35:42,-78.47,304.97,-65.57,15.28,21.98,16,2009-01-01 02:35:42,-78.47,304.97,...,37.18,23.44,16,2009-01-01 02:33:29,-81.31,348.41,-69.79,30.86,23.02,16


Selecting the OCB and B2I boundaries, which in these files are b6 (column group 7) and b2i (column group 3).

In [23]:
# Map the new-nightside columns onto the names used by the older nightside
# frame: column group 7 supplies the OCB values and timestamp, group 3 the
# B2I values. An explicit mapping is used instead of copying another frame's
# column labels by position, so this cell does not depend on that frame's
# column order.
ocb_b2i_columns = {
    'date7': 'Date_UTC',
    'mag_lat7': 'MagLat_OCB',
    'mlt7': 'MLT_OCB',
    'mag_lat3': 'MagLat_B2I',
    'mlt3': 'MLT_B2I',
}
new_nightside_df = (
    combined_nightside_df[list(ocb_b2i_columns)]
    .rename(columns=ocb_b2i_columns)
)

# Make sure the timestamp column is datetime before indexing by it.
new_nightside_df['Date_UTC'] = pd.to_datetime(
    new_nightside_df['Date_UTC'], format='%Y-%m-%d %H:%M:%S'
)

# Index by the OCB timestamp, named Date_UTC like the older nightside frame.
new_nightside_df = new_nightside_df.set_index('Date_UTC')
print(new_nightside_df.shape)
new_nightside_df.head()

(90755, 4)


Unnamed: 0_level_0,MagLat_OCB,MLT_OCB,MagLat_B2I,MLT_B2I
date7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-01 00:43:52,-73.45,3.7,-68.64,4.65
2009-01-01 00:49:34,-75.67,22.69,-67.32,20.68
2009-01-01 01:38:11,78.38,15.04,71.29,16.92
2009-01-01 02:30:38,-72.47,0.84,-66.21,4.35
2009-01-01 02:33:29,-69.79,23.02,-66.81,22.21


This adds an extra 90,000+ data points.
For data cleaning, we can drop the `date3` column, on the assumption that the OCB boundary timestamp (`date7`) is the correct one to use.

In [28]:
# TODO: add flag columns before merging with the dayside data, e.g.
#   new_nightside_df['Nightside'] = 1  (and 'Dayside' = 0)

# Stack the new and the older nightside observations into one frame.
# The new rows are listed first, so the result is sorted by timestamp to get a
# chronological index; a stable sort keeps the relative order of rows that
# share a timestamp. The index is named explicitly because concat drops the
# name when the two inputs' index names differ.
# NOTE(review): the two sources may overlap in time — check for duplicate
# timestamps before modeling.
total_nightside_df = (
    pd.concat([new_nightside_df, df])
    .sort_index(kind='stable')
    .rename_axis('Date_UTC')
)
total_nightside_df.describe()

Unnamed: 0,MagLat_OCB,MLT_OCB,MagLat_B2I,MLT_B2I
2009-01-01 00:43:52,-73.45,3.7,-68.64,4.65
2009-01-01 00:49:34,-75.67,22.69,-67.32,20.68
2009-01-01 01:38:11,78.38,15.04,71.29,16.92
2009-01-01 02:30:38,-72.47,0.84,-66.21,4.35
2009-01-01 02:33:29,-69.79,23.02,-66.81,22.21


In [29]:
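# NOTE(review): disabled experiment — running this raised
# "CDFError: NO_SUCH_CDF". Presumably cdf.CDF expects a local file path rather
# than a URL, so the file would need to be downloaded first — confirm.
# import spacepy.pycdf as cdf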
# # Access the remote URL for the CDF data 
# remote_cdf_url = "https://cdaweb.sci.gsfc.nasa.gov/pub/data/omni/omni_cdaweb/omni2_1min_avg.cdf"
# 
# # Open the CDF file directly from the URL
# with cdf.CDF(remote_cdf_url) as cdf_file:
#     # Access a specific variable (e.g., IMF Bx component)
#     bx_data = cdf_file['BX_GSE'][:]
#     print(bx_data)

CDFError: NO_SUCH_CDF: The specified CDF does not exist.

In [1]:
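# NOTE(review): disabled experiment — running this raised
# "ParserError: Unknown string format: BX_GSE,BY_GSE,BZ_GSE,Vx", i.e. the
# variable list was parsed as a time. The arguments to get_data are probably in
# the wrong order (variables likely come before the start/end times) — confirm
# against the cdasws documentation.
# from cdasws import CdasWs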
# 
# # Initialize the CDAWeb service
# service = CdasWs()
# 
# # Query a specific dataset remotely, here we are using an example dataset ID
# dataset_id = "OMNI_HRO_1MIN"  # Example dataset for OMNI solar wind data
# start_time = '2023-01-01T00:00:00Z'
# end_time = '2023-01-31T23:59:59Z'
# parameters = ['BX_GSE', 'BY_GSE', 'BZ_GSE', 'Vx']
# parameters_string = ','.join(parameters)
# # Query data remotely (selecting key parameters)
# result = service.get_data(dataset_id, start_time, end_time, parameters_string)
# 
# # Accessing the data as pandas DataFrame
# result['data']
# 
# # Display the first few rows
# print(result.head())

ParserError: Unknown string format: BX_GSE,BY_GSE,BZ_GSE,Vx

### Notes
1. Try with a decision tree (sense of is there a pattern) (~65% accuracy, hopefully better than random)
2. Try to fit a neural network (CNNs for sphere or globe, find way to fit 3D to 2D, ignore time for now, then account for later)
3. *(TBD)*