# Before

## Pip installs and imports

In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

## Constants

In [2]:
# Directory that is scanned for the input CSV files; the processed DataFrames
# are written back to this same directory by the save step.
# The path is relative, so it resolves against the current working directory.
DATA_DIRECTORY = "../data/"

## load csv files:

In [3]:
# Collect the names of the CSV files that sit directly in the data directory.
data_files = []
for entry in os.listdir(DATA_DIRECTORY):
    if entry.endswith('.csv'):
        data_files.append(entry)

# Read every CSV into its own DataFrame.
dataframes = {}
loading_progress = tqdm(data_files, desc="Loading CSV files")
for file in loading_progress:
    # Each DataFrame is keyed by its file name without the extension.
    df_name = os.path.splitext(file)[0]
    file_path = os.path.join(DATA_DIRECTORY, file)
    # low_memory=False makes pandas parse the file in one pass rather than in
    # chunks, which avoids per-chunk mixed dtype inference.
    dataframes[df_name] = pd.read_csv(file_path, low_memory=False)


Loading CSV files: 100%|██████████| 70/70 [03:31<00:00,  3.02s/it]


# Preprocessing

## Remove unnecessary columns

### remove 'Time' column:

In [13]:
# Discard the 'Time' column from every loaded DataFrame that has one.
for df_name, df in dataframes.items():
    # errors='ignore' turns the drop into a no-op when the column is absent.
    df.drop(columns=['Time'], inplace=True, errors='ignore')

### remove radiation columns:

In [16]:
# Radiation columns to discard from every DataFrame.
columns_to_remove = ['Grad (w/m^2)', 'DiffR (w/m^2)', 'NIP (w/m^2)']

for df_name, df in dataframes.items():
    # Only ask pandas to drop the columns this DataFrame actually contains.
    present_columns = [name for name in columns_to_remove if name in df.columns]
    df.drop(columns=present_columns, inplace=True)


### remove 'BP' column:

In [18]:
# Discard the 'BP (hPa)' column from every DataFrame that has one.
for df_name, df in dataframes.items():
    if 'BP (hPa)' not in df.columns:
        # Nothing to remove for this DataFrame.
        continue
    df.drop(columns=['BP (hPa)'], inplace=True)

### remove the 'Ws10mm (m/s)', 'Ws1mm (m/s)' columns:

In [4]:
# Columns to discard from every DataFrame.
columns_to_remove = ['Ws10mm (m/s)', 'Ws1mm (m/s)']

for df_name, df in dataframes.items():
    # errors='ignore' skips any listed column that this DataFrame lacks.
    df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

## Time

### format the time:

In [22]:
# Convert the 'Date Time' column to pandas datetimes (day/month/year hour:minute).
for df_name, df in dataframes.items():
    if 'Date Time' not in df.columns:
        continue
    # pop() removes the raw column; re-inserting the parsed version under the
    # same name places it as the last column of the DataFrame.
    raw_timestamps = df.pop('Date Time')
    df['Date Time'] = pd.to_datetime(raw_timestamps, format="%d/%m/%Y %H:%M")

### add column with the year:

In [24]:
# Add a 'Year' column holding the calendar year of each 'Date Time' value.
for df_name, df in dataframes.items():
    if 'Date Time' not in df.columns:
        continue
    # The .dt accessor requires 'Date Time' to already hold datetime values.
    timestamps = df['Date Time']
    df['Year'] = timestamps.dt.year

### Time cycles (days and years):

In [29]:
# Period lengths in seconds: one day, and one year of 365.2425 days.
day = 24*60*60
year = (365.2425)*day

# Factors that turn a timestamp in seconds into a phase angle in radians.
day_rate = 2 * np.pi / day
year_rate = 2 * np.pi / year

# Replace 'Date Time' with periodic sin/cos encodings of the daily and yearly cycle.
for df_name, df in dataframes.items():
    if 'Date Time' not in df.columns:
        continue

    # POSIX timestamp (seconds, as float) for every row.
    timestamp_s = df['Date Time'].map(pd.Timestamp.timestamp)
    day_phase = timestamp_s * day_rate
    year_phase = timestamp_s * year_rate

    df['Day sin'] = np.sin(day_phase)
    df['Day cos'] = np.cos(day_phase)
    df['Year sin'] = np.sin(year_phase)
    df['Year cos'] = np.cos(year_phase)

    # The raw timestamp column is dropped once the encodings are stored.
    df.drop(columns=['Date Time'], inplace=True)


## Data Imputation

### 1 missing value:

In [None]:
# Fill isolated missing 'TD (degC)' readings with the mean of their two
# direct neighbours.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if 'TD (degC)' not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df['TD (degC)'] = df['TD (degC)'].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    td_values = df['TD (degC)'].to_numpy(copy=True)

    # Fill NaN values wrapped with two non-NaN values:
    nan_wrapped_count = 0
    for i in range(1, len(td_values) - 1):
        if pd.isna(td_values[i]) and not pd.isna(td_values[i - 1]) and not pd.isna(td_values[i + 1]):
            try:
                td_values[i] = (float(td_values[i - 1]) + float(td_values[i + 1])) / 2
            except ValueError as e:
                # A neighbour that is not numeric text: report it and leave the gap.
                print(f"ValueError encountered in {df_name} at index {i}: {e}")
            else:
                nan_wrapped_count += 1

    # Store the filled values back into the DataFrame column.
    df['TD (degC)'] = td_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with two non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent}% of the data")

In [None]:
# Fill isolated missing 'TDmax (degC)' readings with the mean of their two
# direct neighbours.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if 'TDmax (degC)' not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df['TDmax (degC)'] = df['TDmax (degC)'].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    td_values = df['TDmax (degC)'].to_numpy(copy=True)

    # Fill NaN values wrapped with two non-NaN values:
    nan_wrapped_count = 0
    for i in range(1, len(td_values) - 1):
        if pd.isna(td_values[i]) and not pd.isna(td_values[i - 1]) and not pd.isna(td_values[i + 1]):
            try:
                td_values[i] = (float(td_values[i - 1]) + float(td_values[i + 1])) / 2
            except ValueError as e:
                # A neighbour that is not numeric text: report it and leave the gap.
                print(f"ValueError encountered in {df_name} at index {i}: {e}")
            else:
                nan_wrapped_count += 1

    # Store the filled values back into the DataFrame column.
    df['TDmax (degC)'] = td_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with two non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent}% of the data")

In [None]:
# Fill isolated missing 'TDmin (degC)' readings with the mean of their two
# direct neighbours.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if 'TDmin (degC)' not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df['TDmin (degC)'] = df['TDmin (degC)'].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    td_values = df['TDmin (degC)'].to_numpy(copy=True)

    # Fill NaN values wrapped with two non-NaN values:
    nan_wrapped_count = 0
    for i in range(1, len(td_values) - 1):
        if pd.isna(td_values[i]) and not pd.isna(td_values[i - 1]) and not pd.isna(td_values[i + 1]):
            try:
                td_values[i] = (float(td_values[i - 1]) + float(td_values[i + 1])) / 2
            except ValueError as e:
                # A neighbour that is not numeric text: report it and leave the gap.
                print(f"ValueError encountered in {df_name} at index {i}: {e}")
            else:
                nan_wrapped_count += 1

    # Store the filled values back into the DataFrame column.
    df['TDmin (degC)'] = td_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with two non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent}% of the data")

In [None]:
# Fill isolated missing 'RH (%)' readings with the mean of their two direct
# neighbours, but only where the series moves in the same direction before
# and after the gap.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if 'RH (%)' not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df['RH (%)'] = df['RH (%)'].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    rh_values = df['RH (%)'].to_numpy(copy=True)

    # Fill NaN values wrapped with two non-NaN values on each side:
    nan_wrapped_count = 0
    # Run from 2 to len - 3 so that i - 2 and i + 2 are always real positions:
    # a negative index would silently wrap around to the end of the array and
    # i + 2 == len would raise IndexError.
    for i in range(2, len(rh_values) - 2):
        if not pd.isna(rh_values[i]):
            continue
        neighbours = (rh_values[i - 2], rh_values[i - 1], rh_values[i + 1], rh_values[i + 2])
        if any(pd.isna(value) for value in neighbours):
            continue

        # Convert before comparing so that readings stored as text are ordered
        # numerically rather than lexicographically.
        try:
            before_far, before_near, after_near, after_far = (float(value) for value in neighbours)
        except ValueError as e:
            print(f"ValueError encountered in {df_name} at index {i}: {e}")
            continue

        # Both flags mean "the series is falling", so equal flags mean the
        # trend keeps its direction across the gap.
        trend_before = before_near < before_far
        trend_after = after_far < after_near
        if trend_before == trend_after:
            rh_values[i] = (before_near + after_near) / 2
            nan_wrapped_count += 1

    # Store the filled values back into the DataFrame column.
    df['RH (%)'] = rh_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with two non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent}% of the data")

### 2 missing values together:

In [None]:
# Fill runs of exactly two missing readings by linear interpolation between
# the values on either side, provided two valid readings exist on each side
# and the series moves in the same direction before and after the gap.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']
column_to_fill = 'RH (%)'

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if column_to_fill not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df[column_to_fill] = df[column_to_fill].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    td_values = df[column_to_fill].to_numpy(copy=True)

    # Fill two consecutive NaN values wrapped with two non-NaN values:
    nan_wrapped_count = 0
    i = 2  # Start from index 2 to ensure i-2 is valid
    # Stop at len - 3 so that i + 3, the furthest index read below, stays
    # inside the array.
    while i < len(td_values) - 3:
        if (pd.isna(td_values[i]) and pd.isna(td_values[i + 1]) and
                not pd.isna(td_values[i - 2]) and not pd.isna(td_values[i - 1]) and
                not pd.isna(td_values[i + 2]) and not pd.isna(td_values[i + 3])):
            # The float() conversions are the only statements that can raise
            # ValueError, so they are the ones guarded.
            try:
                val1 = float(td_values[i - 2])
                val2 = float(td_values[i - 1])
                val3 = float(td_values[i + 2])
                val4 = float(td_values[i + 3])
            except ValueError as e:
                print(f"ValueError encountered in {df_name} at indices {i} and {i+1}: {e}")
            else:
                # Determine trends (True means the series is falling)
                trend_before = val2 < val1
                trend_after = val4 < val3
                if trend_before == trend_after:
                    # Two evenly spaced points between val2 and val3.
                    diff = val3 - val2
                    td_values[i] = val2 + diff / 3
                    td_values[i + 1] = val2 + diff * 2 / 3
                    nan_wrapped_count += 2
                    i += 2  # Skip the next index as it's already processed
                    continue
        i += 1

    # Store the filled values back into the DataFrame column.
    df[column_to_fill] = td_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with two non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent:.2f}% of the data")

### 3 missing values together:

In [None]:
# Fill runs of exactly three missing readings by linear interpolation between
# the values on either side, provided two valid readings exist on each side
# and the series moves in the same direction before and after the gap.

# Placeholder strings that stand for a missing reading.
na_values = ['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN']
column_to_fill = 'RH (%)'

for df_name, df in tqdm(dataframes.items(), desc="Processing DataFrames"):
    if column_to_fill not in df.columns:
        continue

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form acts on an intermediate object and
    # does not update the DataFrame when pandas Copy-on-Write is active.
    df[column_to_fill] = df[column_to_fill].replace(na_values, np.nan)

    # Edit a private, writable copy and store it back afterwards; the array
    # behind `.values` can be a copy or read-only, so writes made through it
    # are not guaranteed to reach the DataFrame.
    td_values = df[column_to_fill].to_numpy(copy=True)

    # Fill three consecutive NaN values wrapped with two non-NaN values:
    nan_wrapped_count = 0
    i = 2  # Start from index 2 to ensure i-2 is valid
    # Stopping at len - 4 keeps i + 4, the furthest index read below, in range.
    while i < len(td_values) - 4:
        if (pd.isna(td_values[i]) and pd.isna(td_values[i + 1]) and pd.isna(td_values[i + 2]) and
                not pd.isna(td_values[i - 2]) and not pd.isna(td_values[i - 1]) and
                not pd.isna(td_values[i + 3]) and not pd.isna(td_values[i + 4])):
            # The float() conversions are the only statements that can raise
            # ValueError, so they are the ones guarded.
            try:
                val1 = float(td_values[i - 2])
                val2 = float(td_values[i - 1])
                val3 = float(td_values[i + 3])
                val4 = float(td_values[i + 4])
            except ValueError as e:
                print(f"ValueError encountered in {df_name} at indices {i}, {i+1}, and {i+2}: {e}")
            else:
                # Determine trends (True means the series is falling)
                trend_before = val2 < val1
                trend_after = val4 < val3
                if trend_before == trend_after:
                    # Three evenly spaced points between val2 and val3.
                    diff = val3 - val2
                    td_values[i] = val2 + diff / 4
                    td_values[i + 1] = val2 + (diff * 2) / 4
                    td_values[i + 2] = val2 + (diff * 3) / 4
                    nan_wrapped_count += 3
                    i += 3  # Skip the next indices as they're already processed
                    continue
        i += 1

    # Store the filled values back into the DataFrame column.
    df[column_to_fill] = td_values

    # An empty DataFrame would otherwise raise ZeroDivisionError here.
    filled_percent = nan_wrapped_count / len(df) * 100 if len(df) else 0.0
    print(f"Number of NaN values wrapped with three non-NaN values and filled in {df_name} station: {nan_wrapped_count} which is {filled_percent:.2f}% of the data")

## Wind

# Show and Save

## Display

### heads:

In [None]:
# Display the head of all dataframes in the dataframes dictionary
for df_name, df in dataframes.items():
    print(f"DataFrame: {df_name}")
    display(df.head())

### time:

In [None]:
# Plot the daily and yearly sin/cos time encodings of one DataFrame.

# Columns this figure needs.
signal_columns = ['Day sin', 'Day cos', 'Year sin', 'Year cos']

# Choose the DataFrame explicitly instead of reading a `df` variable left
# behind by whichever loop happened to run last. The last loaded DataFrame
# that carries all four columns is used, which is the same frame a preceding
# loop over `dataframes` would have left in `df`.
signal_df = next(
    (frame for frame in reversed(list(dataframes.values()))
     if set(signal_columns).issubset(frame.columns)),
    None,
)
if signal_df is None:
    raise ValueError("No DataFrame contains the time signal columns: " + ", ".join(signal_columns))

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Plot the Day sin and cos on the first subplot
# NOTE(review): the x axis is the row index, not hours; the 48*6 slice only
# corresponds to 48 hours if the data has 6 rows per hour. Confirm.
ax[0].plot(np.array(signal_df['Day sin'])[:48*6])
ax[0].plot(np.array(signal_df['Day cos'])[:48*6])
ax[0].legend(['Day sin', 'Day cos'])
ax[0].set_xlabel('Time [h]')
ax[0].set_title('Time of 48 hours (2 days) signal')

# Plot the Year sin and cos on the second subplot
ax[1].plot(np.array(signal_df['Year sin'])[:365*2*24*6])
ax[1].plot(np.array(signal_df['Year cos'])[:365*2*24*6])
ax[1].legend(['Year sin', 'Year cos'])
ax[1].set_xlabel('Time [h]')
ax[1].set_title('Time of 2 years signal')
plt.show()

## Save Changes

In [24]:
# Save the dataframes back to CSV files with the changes made
for df_name, df in tqdm(dataframes.items(), desc="Saving DataFrames"):
    file_path = os.path.join(DATA_DIRECTORY, f"{df_name}.csv")
    df.to_csv(file_path, index=False)

Saving DataFrames: 100%|██████████| 70/70 [12:30<00:00, 10.72s/it]
