## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import glob 
import os

## Reading multiple station Data (+Extracting Meta Data)

In [2]:
def get_csv_files():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    raw_data_dir = os.path.join(parent_dir, "Raw Station Datasets")
    csv_files = glob.glob(os.path.join(raw_data_dir, '*.csv'))
    filtered_files = sorted([file for file in csv_files if os.path.basename(file)[:3].isdigit()],
                            key=lambda x: int(os.path.basename(x)[:3]))
    return filtered_files

def load_csv_files(file_list):
    dfs = []
    for file in file_list:
        try:
            # Extract Metadata + Header data
            with open(file, 'r') as f:
                meta_lines = [f.readline().strip() for _ in range(12)]

            # Extract only metadata from meta_lines
            metadata = {}
            for line in meta_lines[4:10]:  # Lines 5-10
                key, value = line.split(': ')
                metadata[key.strip('# ')] = value

            # Combine the Parameter + it's Units (lines 11 and 12)
            headers = meta_lines[10].split(',')  # Line 11
            units = meta_lines[11].split(',')  # Line 12

            new_headers = []
            for header, unit in zip(headers, units):
                if unit.strip():
                    new_headers.append(f'{header.strip()}({unit.strip()})')
                else:
                    new_headers.append(header.strip())  # Account for no units, first two rows (Station_ID and Date_time)

            # Add data starting from Line 13 for every csv file
            df = pd.read_csv(file, skiprows=12, names=new_headers)
            
            # Create new columns hosting meta data (Will be converted to Categorical Variable later)
            for key, value in metadata.items():
                df[key] = value

            # Appending each DataFrame to the list
            dfs.append(df)
            print(f"Successfully loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"Error loading {os.path.basename(file)}: {e}")
    return dfs

# Loading Step
csv_files = get_csv_files()
dfs = load_csv_files(csv_files)
print(f"\nNumber of DataFrames loaded: {len(dfs)}")

Successfully loaded: 001HI.2024-10-07.csv
Successfully loaded: 002HI.2024-10-07.csv
Successfully loaded: 003HI.2024-10-13.csv
Successfully loaded: 004HI.2024-10-13.csv
Successfully loaded: 005HI.2024-10-13.csv
Successfully loaded: 006HI.2024-10-13.csv
Successfully loaded: 007HI.2024-10-13.csv
Successfully loaded: 008HI.2024-10-13.csv
Successfully loaded: 009HI.2024-10-13.csv
Successfully loaded: 010HI.2024-10-13.csv
Successfully loaded: 011HI.2024-10-13.csv


  df = pd.read_csv(file, skiprows=12, names=new_headers)


Successfully loaded: 012HI.2024-10-13.csv
Successfully loaded: 013HI.2024-10-13.csv
Successfully loaded: 014HI.2024-10-13.csv
Successfully loaded: 015HI.2024-10-13.csv
Successfully loaded: 016HI.2024-10-13.csv
Successfully loaded: 017HI.2024-10-07.csv
Successfully loaded: 018HI.2024-10-20.csv
Successfully loaded: 019HI.2024-10-20.csv
Successfully loaded: 020HI.2024-10-20.csv
Successfully loaded: 021HI.2024-10-20.csv
Successfully loaded: 022HI.2024-10-20.csv
Successfully loaded: 023HI.2024-05-16.csv


  df = pd.read_csv(file, skiprows=12, names=new_headers)


Successfully loaded: 024HI.2024-10-20.csv
Successfully loaded: 025HI.2024-10-20.csv
Successfully loaded: 026HI.2024-10-20.csv
Successfully loaded: 027HI.2024-10-20.csv
Successfully loaded: 028HI.2024-09-03.csv
Successfully loaded: 029HI.2024-10-20.csv
Successfully loaded: 030HI.2024-10-20.csv
Successfully loaded: 031HI.2024-10-20.csv
Successfully loaded: 032HI.2024-11-10.csv
Successfully loaded: 033HI.2024-11-10.csv
Successfully loaded: 034HI.2024-11-10.csv
Successfully loaded: 035HI.2024-11-10.csv
Successfully loaded: 036HI.2024-11-10.csv
Successfully loaded: 037HI.2024-11-10.csv
Successfully loaded: 038HI.2024-11-10.csv
Successfully loaded: 039HI.2024-11-10.csv
Successfully loaded: 040HI.2024-11-10.csv
Successfully loaded: 041HI.2024-11-10.csv
Successfully loaded: 042HI.2024-11-10.csv
Successfully loaded: 043HI.2024-06-16.csv
Successfully loaded: 044HI.2024-06-16.csv
Successfully loaded: 045HI.2024-06-16.csv
Successfully loaded: 046HI.2024-11-18.csv
Successfully loaded: 047HI.2024-11

In [5]:
dfs[5] #Accessing, only the first Station_ID file! 

Unnamed: 0,Station_ID,Date_Time,air_temp_set_1(Celsius),relative_humidity_set_1(%),wind_speed_set_1(m/s),wind_direction_set_1(Degrees),solar_radiation_set_1(W/m**2),soil_temp_set_1(Celsius),precip_accum_five_minute_set_1(Millimeters),soil_moisture_set_1(%),...,wind_chill_set_1d(Celsius),wind_cardinal_direction_set_1d(code),heat_index_set_1d(Celsius),dew_point_temperature_set_1d(Celsius),STATION,STATION NAME,LATITUDE,LONGITUDE,ELEVATION [ft],STATE
0,006HI,2022-04-25T00:00:00Z,19.82,92.54,3.47,94.5,80.71,21.79,0.0,76.2,...,,E,,18.57,006HI,Spencer,19.96400,-155.25000,1539.0,HI
1,006HI,2022-04-25T00:05:00Z,19.78,93.15,3.33,106.8,116.51,21.80,0.0,76.2,...,,ESE,,18.63,006HI,Spencer,19.96400,-155.25000,1539.0,HI
2,006HI,2022-04-25T00:10:00Z,19.94,93.73,2.94,105.2,169.84,21.79,0.0,76.2,...,,ESE,,18.89,006HI,Spencer,19.96400,-155.25000,1539.0,HI
3,006HI,2022-04-25T00:15:00Z,20.09,93.62,3.88,102.8,164.95,21.76,0.0,76.2,...,,ESE,,19.02,006HI,Spencer,19.96400,-155.25000,1539.0,HI
4,006HI,2022-04-25T00:20:00Z,19.98,93.86,3.58,99.4,73.22,21.74,0.0,76.1,...,,E,,18.95,006HI,Spencer,19.96400,-155.25000,1539.0,HI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255536,006HI,2024-10-13T11:40:00Z,18.65,70.92,2.46,187.0,0.00,21.96,0.0,58.5,...,,S,,13.24,006HI,Spencer,19.96400,-155.25000,1539.0,HI
255537,006HI,2024-10-13T11:45:00Z,18.70,71.36,2.46,189.8,0.00,21.93,0.0,58.5,...,,S,,13.38,006HI,Spencer,19.96400,-155.25000,1539.0,HI
255538,006HI,2024-10-13T11:50:00Z,18.60,72.25,1.87,193.9,0.00,21.91,0.0,58.5,...,,SSW,,13.47,006HI,Spencer,19.96400,-155.25000,1539.0,HI
255539,006HI,2024-10-13T11:55:00Z,18.48,73.31,2.10,194.7,0.00,21.89,0.0,58.5,...,,SSW,,13.58,006HI,Spencer,19.96400,-155.25000,1539.0,HI


In [6]:
dfs[5].to_csv("006HI.csv", index=False)

## Concatenate all Station Data into a single Dataframe

In [4]:
# Concatenate all DataFrames in the list
combined_df = pd.concat(dfs, ignore_index=True)

# Converting the columns which merely get replicated to categorical variable to save space!
categorical_columns = ['Station_ID', 'STATION', 'STATION NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION [ft]', 'STATE']

for col in categorical_columns:
    if col in combined_df.columns:
        combined_df[col] = combined_df[col].astype('category')

KeyboardInterrupt: 

In [5]:
combined_df[combined_df['Station_ID'] == '017HI'] #Accessing specific Station's Data

Unnamed: 0,Station_ID,Date_Time,pressure_set_1(Pascals),air_temp_set_1(Celsius),relative_humidity_set_1(%),wind_speed_set_1(m/s),wind_direction_set_1(Degrees),wind_gust_set_1(m/s),solar_radiation_set_1(W/m**2),soil_temp_set_1(Celsius),...,STATION,STATION NAME,LATITUDE,LONGITUDE,ELEVATION [ft],STATE,soil_temp_set_2(Celsius),soil_temp_set_3(Celsius),soil_temp_set_4(Celsius),soil_moisture_set_3(%)
508043,017HI,2022-12-05T00:00:00Z,,21.85,72.28,0.81,116.1,,232.55,20.53,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,19.97,20.60,,43.9
508044,017HI,2022-12-05T00:05:00Z,,21.62,72.28,2.40,90.8,,381.14,20.56,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,19.99,20.58,,43.9
508045,017HI,2022-12-05T00:10:00Z,,21.75,73.15,2.20,99.3,,467.97,20.60,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.03,20.61,,43.9
508046,017HI,2022-12-05T00:15:00Z,,22.06,71.56,3.11,91.1,,755.71,20.63,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.06,20.59,,43.9
508047,017HI,2022-12-05T00:20:00Z,,21.88,70.89,1.76,89.5,,271.18,20.68,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.05,20.58,,43.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700826,017HI,2024-10-07T11:40:00Z,94450.0,16.96,71.69,1.40,95.7,2.17,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.96,21.37,20.58,37.7
700827,017HI,2024-10-07T11:45:00Z,94450.0,16.99,72.43,1.32,151.1,2.28,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.99,21.38,20.57,37.7
700828,017HI,2024-10-07T11:50:00Z,94450.0,16.80,73.51,1.29,151.7,1.89,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.95,21.37,20.56,37.7
700829,017HI,2024-10-07T11:55:00Z,94440.0,16.90,74.96,1.12,156.5,2.28,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.92,21.39,20.58,37.7


## Saving the Reformatted + Cleaned Dataframe as .CSV

In [46]:
csv_filename = "combined_data.csv"
combined_df.to_csv(csv_filename, index=False)