## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import glob 
import os

## Reading Multiple Station Datasets (and Extracting Metadata)

In [2]:
def get_csv_files(raw_data_dir=None):
    """Return the station CSV paths, ordered by their numeric station prefix.

    Only ``*.csv`` files whose basename starts with three decimal digits
    (for example ``001HI.2024-10-07.csv``) are kept.

    Parameters
    ----------
    raw_data_dir : str or os.PathLike, optional
        Directory to search. When omitted, the "Raw Station Datasets" folder
        inside the parent of the current working directory is used.

    Returns
    -------
    list of str
        Matching paths sorted by the integer value of the three-character
        prefix; ties are broken by basename so the order is reproducible.
    """
    if raw_data_dir is None:
        parent_dir = os.path.dirname(os.getcwd())
        raw_data_dir = os.path.join(parent_dir, "Raw Station Datasets")

    def station_prefix(path):
        return os.path.basename(path)[:3]

    # Escape the directory part so characters such as '[' or '*' in the path
    # are matched literally instead of being treated as glob wildcards.
    pattern = os.path.join(glob.escape(os.fspath(raw_data_dir)), '*.csv')
    candidates = glob.glob(pattern)

    # isdecimal() only accepts characters that int() can parse; isdigit() also
    # accepts e.g. superscript digits, which would make the sort key raise.
    station_files = [path for path in candidates if station_prefix(path).isdecimal()]
    return sorted(
        station_files,
        key=lambda path: (int(station_prefix(path)), os.path.basename(path)),
    )

def _parse_station_header(file):
    """Parse the metadata and column names from the first 12 lines of a file.

    The slicing below treats lines 5-10 as ``# KEY: value`` metadata, line 11
    as the column names and line 12 as the matching units.

    Returns
    -------
    tuple of (dict, list of str)
        The metadata mapping and the column names, where each name with a
        non-empty unit is rendered as ``name(unit)``.

    Raises
    ------
    ValueError
        If a metadata line does not contain the ``': '`` separator.
    """
    with open(file, 'r') as f:
        meta_lines = [f.readline().strip() for _ in range(12)]

    metadata = {}
    for line in meta_lines[4:10]:  # Lines 5-10
        # maxsplit=1: only the first ': ' separates key from value, so a value
        # that itself contains ': ' is kept intact instead of raising.
        key, value = line.split(': ', 1)
        metadata[key.strip('# ')] = value

    # Combine each parameter with its units (lines 11 and 12)
    headers = meta_lines[10].split(',')  # Line 11
    units = meta_lines[11].split(',')  # Line 12
    # Pad a short units row so zip() cannot silently drop trailing columns,
    # which would make read_csv treat the leading data columns as the index.
    units += [''] * (len(headers) - len(units))

    column_names = []
    for header, unit in zip(headers, units):
        header, unit = header.strip(), unit.strip()
        # Columns without units (e.g. Station_ID and Date_Time) keep the bare name
        column_names.append(f'{header}({unit})' if unit else header)
    return metadata, column_names


def load_csv_files(file_list):
    """Load each station CSV into a DataFrame with unit-qualified column names.

    For every path the 12-line header is parsed, the remaining rows are read
    with ``pd.read_csv``, and one constant column per metadata entry is added.
    A file that fails to load is reported with ``print`` and skipped.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the station CSV files to load.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per successfully loaded file, in input order.
    """
    dfs = []
    for file in file_list:
        try:
            metadata, new_headers = _parse_station_header(file)

            # Data rows start at line 13 of every CSV file
            df = pd.read_csv(file, skiprows=12, names=new_headers)

            # Metadata becomes constant columns (converted to categoricals later)
            for key, value in metadata.items():
                df[key] = value

            dfs.append(df)
            print(f"Successfully loaded: {os.path.basename(file)}")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole batch
            print(f"Error loading {os.path.basename(file)}: {e}")
    return dfs

# Loading Step: discover the station files, then parse each one into its own DataFrame
csv_files = get_csv_files()
dfs = load_csv_files(csv_files)
# Files that failed to load are skipped by load_csv_files, so this count can be lower than len(csv_files)
print(f"\nNumber of DataFrames loaded: {len(dfs)}")

Successfully loaded: 001HI.2024-10-07.csv
Successfully loaded: 002HI.2024-10-07.csv
Successfully loaded: 017HI.2024-10-07.csv

Number of DataFrames loaded: 3


In [3]:
dfs[0] # Inspect the DataFrame of the first loaded station file only

Unnamed: 0,Station_ID,Date_Time,pressure_set_1(Pascals),air_temp_set_1(Celsius),relative_humidity_set_1(%),wind_speed_set_1(m/s),wind_direction_set_1(Degrees),wind_gust_set_1(m/s),solar_radiation_set_1(W/m**2),soil_temp_set_1(Celsius),...,dew_point_temperature_set_1d(Celsius),altimeter_set_1d(Pascals),sea_level_pressure_set_1d(Pascals),wet_bulb_temp_set_1d(Celsius),STATION,STATION NAME,LATITUDE,LONGITUDE,ELEVATION [ft],STATE
0,001HI,2022-04-25T00:00:00Z,,20.46,73.61,2.60,204.0,,705.95,21.61,...,15.56,,,,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
1,001HI,2022-04-25T00:05:00Z,,20.93,71.86,2.33,202.3,,927.95,21.58,...,15.64,,,,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
2,001HI,2022-04-25T00:10:00Z,,21.38,70.77,2.08,203.0,,840.02,21.47,...,15.83,,,,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
3,001HI,2022-04-25T00:15:00Z,,20.95,71.31,2.70,196.4,,548.92,21.48,...,15.53,,,,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
4,001HI,2022-04-25T00:20:00Z,,20.69,73.02,2.25,201.9,,550.24,21.62,...,15.66,,,,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254005,001HI,2024-10-07T11:40:00Z,90940.0,15.56,79.44,0.82,110.3,1.34,0.00,19.68,...,11.99,102087.37,101803.64,13.39,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
254006,001HI,2024-10-07T11:45:00Z,90940.0,15.49,74.91,0.75,117.4,1.37,0.00,19.67,...,11.02,102087.37,101806.53,12.81,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
254007,001HI,2024-10-07T11:50:00Z,90940.0,15.50,76.07,0.88,109.4,1.28,0.00,19.67,...,11.27,102087.37,101806.08,12.95,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI
254008,001HI,2024-10-07T11:55:00Z,90930.0,15.43,75.95,0.84,121.9,1.37,0.00,19.65,...,11.17,102076.14,101797.73,12.87,001HI,Kula Ag,20.75790,-156.32000,3163.0,HI


## Concatenating All Station Data into a Single DataFrame

In [4]:
# Stack the per-station frames row-wise under one fresh 0..N-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# These columns just repeat the same value for every row of a station,
# so storing them as categoricals saves space
categorical_columns = ['Station_ID', 'STATION', 'STATION NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION [ft]', 'STATE']

# Cast only the listed columns that are actually present in the combined frame
combined_df = combined_df.astype(
    {name: 'category' for name in categorical_columns if name in combined_df.columns}
)

In [5]:
# Select the rows of one specific station with a boolean mask
combined_df.loc[combined_df['Station_ID'].eq('017HI')]

Unnamed: 0,Station_ID,Date_Time,pressure_set_1(Pascals),air_temp_set_1(Celsius),relative_humidity_set_1(%),wind_speed_set_1(m/s),wind_direction_set_1(Degrees),wind_gust_set_1(m/s),solar_radiation_set_1(W/m**2),soil_temp_set_1(Celsius),...,STATION,STATION NAME,LATITUDE,LONGITUDE,ELEVATION [ft],STATE,soil_temp_set_2(Celsius),soil_temp_set_3(Celsius),soil_temp_set_4(Celsius),soil_moisture_set_3(%)
508043,017HI,2022-12-05T00:00:00Z,,21.85,72.28,0.81,116.1,,232.55,20.53,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,19.97,20.60,,43.9
508044,017HI,2022-12-05T00:05:00Z,,21.62,72.28,2.40,90.8,,381.14,20.56,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,19.99,20.58,,43.9
508045,017HI,2022-12-05T00:10:00Z,,21.75,73.15,2.20,99.3,,467.97,20.60,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.03,20.61,,43.9
508046,017HI,2022-12-05T00:15:00Z,,22.06,71.56,3.11,91.1,,755.71,20.63,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.06,20.59,,43.9
508047,017HI,2022-12-05T00:20:00Z,,21.88,70.89,1.76,89.5,,271.18,20.68,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.05,20.58,,43.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700826,017HI,2024-10-07T11:40:00Z,94450.0,16.96,71.69,1.40,95.7,2.17,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.96,21.37,20.58,37.7
700827,017HI,2024-10-07T11:45:00Z,94450.0,16.99,72.43,1.32,151.1,2.28,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.99,21.38,20.57,37.7
700828,017HI,2024-10-07T11:50:00Z,94450.0,16.80,73.51,1.29,151.7,1.89,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.95,21.37,20.56,37.7
700829,017HI,2024-10-07T11:55:00Z,94440.0,16.90,74.96,1.12,156.5,2.28,0.00,,...,017HI,Piiholo,20.8415,-156.2948,2090.0,HI,20.92,21.39,20.58,37.7


## Saving the Reformatted and Cleaned DataFrame as a CSV File

In [46]:
# Relative filename, so the file is written to the current working directory
csv_filename = "combined_data.csv"
# index=False leaves the DataFrame's row index out of the written file
combined_df.to_csv(csv_filename, index=False)