In [1]:
# Harsh code comments:
# - we should break out the get_events_df_from_goes() and get_events_df_for_date_range() functions to a new class,
#   maybe FileReadWrite
# - we need to clean up some columns so that in an events-df we just have two time columns: start_datetime and end_datetime
# - let's double-check that we're not throwing out any rows when reading the files, since the initial
#   discard row count is hardcoded to 13

# to-do for vava:
# - add get_events_df_for_date_range() to FileReadWrite class

In [2]:
import glob
import os
import pandas as pd
from datetime import datetime

# import custom libraries
import sys
sys.path.append('.')
from file_read_write import FileReadWrite


def get_df_from_from_date_range(path, start_date, end_date, file_extension=".txt"):
    """Combine the event files in ``path`` dated within a range into one DataFrame.

    A file's date is taken from the first eight characters of its base name,
    parsed as ``YYYYMMDD``. Files whose names do not start with such a date
    are skipped instead of aborting the whole load.

    Args:
        path: Directory that holds the event files.
        start_date: First day to include, as a ``"YYYYMMDD"`` string (inclusive).
        end_date: Last day to include, as a ``"YYYYMMDD"`` string (inclusive).
        file_extension: Suffix a file name must end with to be considered.
            Defaults to ``".txt"``.

    Returns:
        A DataFrame holding the rows of every matching file plus a ``"Date"``
        column containing that file's ``YYYYMMDD`` string, sorted by ``"Date"``
        and then ``"Event"``, with a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not a ``YYYYMMDD``
            string, or if no file in ``path`` falls inside the range.
    """
    date_format = "%Y%m%d"

    # Convert the range bounds once, up front, so malformed input fails early.
    start_datetime = datetime.strptime(start_date, date_format)
    end_datetime = datetime.strptime(end_date, date_format)

    pattern = os.path.join(path, f"*{file_extension}")

    dfs = []
    # glob returns paths in arbitrary order; sorting keeps the row order of
    # ties in the final sort independent of the filesystem.
    for file_path in sorted(glob.glob(pattern)):
        file_date = os.path.basename(file_path)[:8]

        try:
            file_datetime = datetime.strptime(file_date, date_format)
        except ValueError:
            # Not a date-prefixed file (e.g. a README) - ignore it.
            continue

        if not (start_datetime <= file_datetime <= end_datetime):
            continue

        df = FileReadWrite.get_df_from_goes_file(file_path)

        # assign() returns a new frame, so the reader's object is not mutated.
        dfs.append(df.assign(Date=file_date))

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No '{file_extension}' files dated between {start_date} and "
            f"{end_date} were found in {path!r}"
        )

    result_df = pd.concat(dfs, ignore_index=True)

    return result_df.sort_values(by=["Date", "Event"]).reset_index(drop=True)


# Directory holding the 2015 event files. Set the GOES_EVENTS_DIR environment
# variable to point at a different location; when it is unset, the original
# local path below is used, so existing runs behave exactly as before.
directory_path = os.environ.get(
    "GOES_EVENTS_DIR",
    "/Users/aishwarya/Library/CloudStorage/GoogleDrive-aishsk6@vt.edu/My Drive/ML_project/2015/2015_events/",
)

# Combine the event files dated 20150101 through 20150105 (both ends inclusive).
df_within_daterange = get_df_from_from_date_range(
    directory_path, "20150101", "20150105"
)
print(df_within_daterange)

    Event Begin   Max   End  Obs  Q Type Loc/Freq Particulars_a      Date
0    3690  0138  0139  0141  LEA  3  FLA   S08E54            SF  20150101
1    3700  0225  0227  0229  LEA  3  FLA   S08E54            SF  20150101
2    3710  0316  0316  0320  LEA  3  FLA   S08E54            SF  20150101
3    3720  0340  0341  0344  LEA  3  FLA   S08E54            SF  20150101
4    3730  0351  ////  0417  LEA  C  RSP  109-171         CTM/1  20150101
..    ...   ...   ...   ...  ... ..  ...      ...           ...       ...
168  4980  2200  2212  2241  HOL  3  FLA   S06W15            1F  20150105
169  4980  2203  2213  2220  G15  5  XRA     1-8A          C4.1  20150105
170  4980  2204  ////  2359  LEA  C  RSP  113-180         CTM/1  20150105
171  4990  2307  2320  2336  G15  5  XRA     1-8A          C4.4  20150105
172  4990  2332  2334  2338  HOL  3  FLA   S11W64            SF  20150105

[173 rows x 10 columns]


In [3]:
# Display the combined events DataFrame built in the previous cell
# (a bare last-line expression shows the notebook's rich table output).
df_within_daterange

Unnamed: 0,Event,Begin,Max,End,Obs,Q,Type,Loc/Freq,Particulars_a,Date
0,3690,0138,0139,0141,LEA,3,FLA,S08E54,SF,20150101
1,3700,0225,0227,0229,LEA,3,FLA,S08E54,SF,20150101
2,3710,0316,0316,0320,LEA,3,FLA,S08E54,SF,20150101
3,3720,0340,0341,0344,LEA,3,FLA,S08E54,SF,20150101
4,3730,0351,////,0417,LEA,C,RSP,109-171,CTM/1,20150101
...,...,...,...,...,...,...,...,...,...,...
168,4980,2200,2212,2241,HOL,3,FLA,S06W15,1F,20150105
169,4980,2203,2213,2220,G15,5,XRA,1-8A,C4.1,20150105
170,4980,2204,////,2359,LEA,C,RSP,113-180,CTM/1,20150105
171,4990,2307,2320,2336,G15,5,XRA,1-8A,C4.4,20150105
