In [1]:
from rich.progress import track
import os
os.getcwd()

'/home/ajivani/time_series_transformers'

In [2]:
import os
import requests
from urllib.parse import urljoin

base_url = "https://spdf.gsfc.nasa.gov/pub/data/omni/high_res_omni/modified/monthly_1min/"

save_dir = "./solar_wind_data/omni/high_res_1min/"

# Check and create directory if not exists
os.makedirs(save_dir, exist_ok=True)

for year in track(range(2014, 2018), description="Processing Year by Year..."):
# for year in track(range(1995, 2023), description="Processing Year by Year..."):
    for month in range(1, 13):
        # Construct the filename and its URL
        filename = f"omni_min{year}{str(month).zfill(2)}.asc"
        file_url = urljoin(base_url, filename)

        # Attempt to download the file
        response = requests.get(file_url, stream=True)

        # If the download is successful, save the file
        if response.status_code == 200:
            with open(os.path.join(save_dir, filename), 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        else:
            print(f"Failed to download {filename}.")

print("Download complete.")

Output()

Download complete.


In [4]:
import time
import pandas as pd
import geopack
import datetime
import geopack.geopack as gp
from datetime import datetime
import datetime
import numpy as np

Load IGRF coefficients ...


In [6]:
# Define the time range
start_year, start_month = 2014, 1
end_year, end_month = 2017, 12

# Prepare an empty DataFrame to hold the data
df = pd.DataFrame()

# Loop through each year and month
for year in track(range(start_year, end_year+1)):
    for month in range(1, 13):
        if year == start_year and month < start_month:
            continue
        if year == end_year and month > end_month:
            break
        # Construct the filename
        filename = f"omni_min{year}{str(month).zfill(2)}.asc"
        file_path = os.path.join(save_dir, filename)

        # Check if the file exists
        if os.path.isfile(file_path):
            # Read the file into a DataFrame
            df_temp = np.loadtxt(file_path)  # Add delimiter or other arguments if needed
            df = pd.concat([df, pd.DataFrame(df_temp)], ignore_index=True)
        else:
            print(f"{filename} not found.")

# Function to convert year, day of year, hour, and minute into a datetime object
def convert_to_datetime(row):
    return datetime.datetime(int(row[0]), 1, 1, int(row[2]), int(row[3])) + pd.Timedelta(days=int(row[1])-1)

# Apply the function to each row in the DataFrame
df['Datetime'] = df.apply(convert_to_datetime, axis=1)

# Set the new datetime column as the index
df.set_index('Datetime', inplace=True)
# Rename columns
df.drop(columns=df.columns[:4], axis=1, inplace=True)
df.columns = ['ID for IMF spacecraft', 'ID for SW Plasma spacecraft', '# of points in IMF averages', 
              '# of points in Plasma averages', 'Percent interp', 'Timeshift, sec', 'RMS, Timeshift', 
              'RMS, Phase front normal', 'Time btwn observations, sec', 'Field magnitude average', 
              'Bx, nT (GSE, GSM)', 'By, nT (GSE)', 'Bz, nT (GSE)', 'By, nT (GSM)', 'Bz, nT (GSM)', 'RMS SD B scalar',
              'RMS SD field vector', 'Flow speed, km/s', 'Vx Velocity, km/s, GSE', 'Vy Velocity, km/s, GSE', 
              'Vz Velocity, km/s, GSE', 'Proton Density, n/cc', 'Temperature, K', 'Flow pressure, nPa', 
              'Electric field, mV/m', 'Plasma beta', 'Alfven mach number', 'X(s/c), GSE, Re', 'Y(s/c), GSE, Re', 
              'Z(s/c), GSE, Re', 'BSN location, Xgse, Re', 'BSN location, Ygse, Re', 'BSN location, Zgse, Re',
              'AE-index, nT', 'AL-index, nT', 'AU-index, nT', 'SYM/D index, nT', 'SYM/H index, nT', 
              'ASY/D index, nT', 'ASY/H index, nT', 'Na/Np Ratio', 'Magnetosonic mach number']
df.to_pickle("./solar_wind_data/processed_data/OMNI_raw.pkl")
print("Data loading complete.")

Output()

Data loading complete.


In [7]:
df = pd.read_pickle("./solar_wind_data/processed_data/OMNI_raw.pkl")
df = df[['Bx, nT (GSE, GSM)', 'By, nT (GSM)', 'Bz, nT (GSM)', 'Vx Velocity, km/s, GSE',
         'Proton Density, n/cc', 'Temperature, K', 'Flow speed, km/s', 'Alfven mach number', 'Magnetosonic mach number']]

df = df.replace(99.9, np.nan)
df = df.replace(999.99, np.nan)
df = df.replace(9999.99, np.nan)
df = df.replace(99999.9, np.nan)
df = df.replace(9999999., np.nan)
for col in df.columns:
    mask = df[col].notna()
    a = mask.ne(mask.shift()).cumsum()
    df = df[(a.groupby(a).transform('size') < 15) | mask]
df = df.ffill()

df_final = df.dropna()

df_final.loc[:, 'B'] = np.sqrt(df_final['Bx, nT (GSE, GSM)']**2 + df_final['By, nT (GSM)']**2 + df_final['Bz, nT (GSM)']**2)

df_final.to_pickle("./solar_wind_data/processed_data/OMNI_final.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.loc[:, 'B'] = np.sqrt(df_final['Bx, nT (GSE, GSM)']**2 + df_final['By, nT (GSM)']**2 + df_final['Bz, nT (GSM)']**2)
