In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Location of the raw observations, relative to the notebook's working directory.
DATA_DIR = "../../data/"
DATA_NAME = "bandar_abbas_1_year.csv"

# Load data. The path is relative, so a notebook started from a different
# directory cannot find the file; fail early and report the fully resolved
# path together with the current working directory.
data_path = os.path.join(DATA_DIR, DATA_NAME)
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Input CSV not found: {os.path.abspath(data_path)} "
        f"(working directory: {os.getcwd()}). Adjust DATA_DIR / DATA_NAME."
    )
df = pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/bandar_abbas_1_year.csv'

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Show the column labels of the loaded frame (the next cell keeps a subset of them).
df.columns

In [None]:
# Keep only the timestamp and the five raw weather columns used in this notebook.
# .copy() makes the subset an independent frame, so the column assignments made
# in later cells do not raise SettingWithCopyWarning.
df = df[['datetime', 't', 'p', 'u', 'dd', 'ff']].copy()

In [None]:
# Mapping from the short raw column codes to the descriptive names used below.
clmns_name = dict(
    t='temp',
    p='pressure',
    u='humidity',
    dd='wind_direction',
    ff='wind_speed',
)

In [None]:
# Apply the descriptive column names. Reassigning (instead of inplace=True)
# yields a fresh frame and keeps the cell free of in-place mutation.
df = df.rename(columns=clmns_name)

In [None]:
# Print dtypes and non-null counts per column after the rename.
df.info()

In [None]:
# Summary statistics, one row per variable.
df.describe().T

In [None]:
# Duplicate and sampling-interval diagnostics.
# Timestamps are parsed once into a standalone Series so that ordering and
# differences are chronological (not dependent on how the raw strings sort)
# and no helper column is left behind on `df`.
timestamps = pd.to_datetime(df['datetime'])

# Check for exact duplicate rows
print("Exact duplicate rows:", df.duplicated().sum())

# Check for duplicates based only on datetime (should be unique)
print("Duplicate datetime entries:", df.duplicated(subset=['datetime']).sum())

# Show duplicated datetime entries (if any), ordered by parsed timestamp
dup_mask = df.duplicated(subset=['datetime'], keep=False)
dup_order = timestamps[dup_mask].sort_values().index
duplicated_datetimes = df.loc[dup_order]
print("\nDuplicated datetime records (if any):")
print(duplicated_datetimes.head(20))

# Check for near-duplicates in datetime (e.g., same minute)
rounded_to_minute = timestamps.dt.round('min')
print("\nRows with same rounded minute:", rounded_to_minute.duplicated().sum())

# Optional: inspect time gaps for irregular sampling
time_diff = timestamps.sort_values().diff().rename('time_diff')
print("\nMost common time intervals:")
print(time_diff.value_counts().head())

In [None]:
# Report where humidity is missing. No gap filling has been applied in the
# cells above, so these are the NaNs present in the selected raw columns.
hum_nan_mask = df['humidity'].isna()

# Show count of NaNs
print("Remaining humidity NaNs:", hum_nan_mask.sum())

# Show first few rows with humidity NaN. A bare expression in the middle of a
# cell is not displayed, so the rows are printed explicitly under the header.
print("\nFirst 10 rows with humidity NaN:")
print(df[hum_nan_mask].head(10))

# Check whether the NaNs sit at the start or end of the frame
print("\nNaNs in first 100 rows:", hum_nan_mask.iloc[:100].sum())
print("NaNs in last 100 rows:", hum_nan_mask.iloc[-100:].sum())

# Print the smallest and largest 'datetime' value among the NaN rows
if hum_nan_mask.any():
    nan_datetimes = df.loc[hum_nan_mask, 'datetime']
    print("\nSample datetime range of remaining NaNs:")
    print("Earliest:", nan_datetimes.min())
    print("Latest:", nan_datetimes.max())

In [None]:
# Rich display of the first ten rows whose humidity is missing.
df.loc[hum_nan_mask].head(10)

In [None]:
# Index the frame by parsed timestamps. The guard keeps the cell re-runnable
# once 'datetime' has already been moved into the index.
if 'datetime' in df.columns:
    df = df.assign(datetime=pd.to_datetime(df['datetime'])).set_index('datetime')

# Chronological order is required for time-based interpolation
df = df.sort_index(kind='mergesort')

# Drop the helper column used earlier
df = df.drop(columns=['datetime_rounded'], errors='ignore')

# Wind direction is an angle: a plain linear fill between 350° and 10° would
# pass through 180°. Interpolate its sine and cosine instead and rebuild the
# angle afterwards. Only originally missing entries are replaced, so observed
# values stay untouched. (If the two neighbours point in exactly opposite
# directions the midpoint is undefined and arctan2 yields 0°.)
wd_missing = df['wind_direction'].isna().to_numpy()
wd_rad = np.deg2rad(df['wind_direction'])
wd_sin = np.sin(wd_rad).interpolate(method='time')
wd_cos = np.cos(wd_rad).interpolate(method='time')
wd_filled = (np.rad2deg(np.arctan2(wd_sin, wd_cos)) % 360).to_numpy()

# Time-based interpolation for remaining NaNs in all numeric columns
df = df.interpolate(method='time')
if wd_missing.any():
    df.loc[wd_missing, 'wind_direction'] = wd_filled[wd_missing]

# Optional: forward/backward fill any remaining edge NaNs
df = df.ffill().bfill()

In [None]:
# Print dtypes and non-null counts again, now that gaps have been interpolated and edge-filled.
df.info()

In [None]:
# Sanity checks on the time axis of the cleaned frame.

# 1. The datetime index must be sorted for the differences below to make sense
assert df.index.is_monotonic_increasing, "Datetime index is not sorted!"

# 2. Calculate time differences between consecutive rows
time_diffs = df.index.to_series().diff()

# 3. Show unique time intervals and their frequencies
print("Most common time intervals:")
print(time_diffs.value_counts().head(10))

# 4. Check for unexpected gaps. The threshold is defined once so the filter
#    and the printed message cannot drift apart; adjust it to the data's cadence.
gap_threshold_min = 180
large_gaps = time_diffs[time_diffs > pd.Timedelta(minutes=gap_threshold_min)]
print(f"\nNumber of large gaps (>{gap_threshold_min} min): {len(large_gaps)}")
if not large_gaps.empty:
    print("First few large gaps:")
    print(large_gaps.head())

# 5. Check for duplicate timestamps
duplicated_times = df.index.duplicated()
print(f"\nNumber of duplicate timestamps: {duplicated_times.sum()}")

In [None]:
# Summary statistics of the gap-filled frame, one row per variable.
df.describe().T

In [None]:
import matplotlib.pyplot as plt

# Plausibility report for the cleaned series: summary table, persistence,
# value distributions and two wind-specific counters.

# 1. Basic statistics
print("Basic statistics:")
print(df.describe())

# 2. Lag-1 autocorrelation (expected to be high for meteorological variables)
print("\nLag-1 autocorrelation (values >0.8 typical for temp/pressure/humidity):")
persistence_cols = ['temp', 'pressure', 'humidity', 'wind_speed']
autocorr = df[persistence_cols].apply(lambda series: series.autocorr(lag=1))
print(autocorr)

# 3. One histogram per variable on a 2x3 grid, filled row by row
cols = ['temp', 'pressure', 'humidity', 'wind_speed', 'wind_direction']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.ravel()
for ax, col in zip(panels, cols):
    df[col].hist(bins=50, ax=ax)
    ax.set_title(f'{col} distribution')
# Five variables on six panels: hide the last one
panels[-1].set_visible(False)
plt.tight_layout()
plt.show()

# 4. Share of exactly-zero wind speed readings (some calm periods are normal;
#    a very high share may indicate issues)
zero_wind_pct = df['wind_speed'].eq(0).mean() * 100
print(f"\nPercentage of zero wind speed readings: {zero_wind_pct:.2f}%")

# 5. 0° and 360° are the same direction; an excess of either may indicate
#    default/fill values
wind_dir_0 = df['wind_direction'].eq(0).sum()
wind_dir_360 = df['wind_direction'].eq(360).sum()
print(f"Wind direction = 0°: {wind_dir_0} occurrences")
print(f"Wind direction = 360°: {wind_dir_360} occurrences")

In [None]:
import numpy as np

# Split the wind into Cartesian components.
# Assuming wind_direction is in meteorological degrees (clockwise from north,
# direction the wind blows FROM), 270 - direction is the mathematical angle
# (counter-clockwise from east) of the direction the wind blows TOWARD.
# With that angle: cos -> u (east-west), sin -> v (north-south).
math_angle_rad = np.deg2rad(270 - df['wind_direction'])

for component, trig in (('u', np.cos), ('v', np.sin)):
    df[component] = df['wind_speed'] * trig(math_angle_rad)

In [None]:
# Four-panel overview of the cleaned series. plt, sns, np and pd come from the
# notebook's first cell, so they are not re-imported here.

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("tab10")

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_wind, ax_temp, ax_pressure, ax_speed = axes.ravel()

# 1. Wind vectors (u vs v) colored by row position. Rows are in chronological
#    order, so the color encodes time, but not in hours unless the data is hourly.
sc = ax_wind.scatter(df['u'], df['v'],
                     c=np.arange(len(df)), cmap='plasma', alpha=0.6, s=10)
ax_wind.set_xlabel('u (zonal, eastward)')
ax_wind.set_ylabel('v (meridional, northward)')
ax_wind.set_title('Wind Vector Distribution (colored by time)')
fig.colorbar(sc, ax=ax_wind, label='Time (observation index)')

# 2. Temperature and humidity over the last 30 days of the record.
#    DataFrame.last() is deprecated in recent pandas, so the window is cut
#    directly on the datetime index.
window_start = df.index.max() - pd.Timedelta(days=30)
last_30d = df.loc[df.index > window_start]
ax_hum = ax_temp.twinx()
ln1 = ax_temp.plot(last_30d.index, last_30d['temp'],
                   color='tab:red', label='Temperature', alpha=0.8)
ln2 = ax_hum.plot(last_30d.index, last_30d['humidity'],
                  color='tab:blue', label='Humidity', alpha=0.8)
ax_temp.set_xlabel('Date')
ax_temp.set_ylabel('Temperature (°C)', color='tab:red')
ax_hum.set_ylabel('Humidity (%)', color='tab:blue')
ax_temp.set_title('Last 30 Days: Temperature & Humidity')
lines = ln1 + ln2
labels = [line.get_label() for line in lines]
ax_temp.legend(lines, labels, loc='upper left')

# 3. Pressure trend with a 24-hour rolling mean. The window is given as a time
#    span, not a row count, so it covers 24 hours at any sampling interval
#    (needs the sorted datetime index set up earlier). The column stays on `df`
#    and is therefore part of whatever is exported afterwards.
df['pressure_24h'] = df['pressure'].rolling('24h', min_periods=1).mean()
ax_pressure.plot(df.index, df['pressure'], alpha=0.3, color='gray', label='Observed')
ax_pressure.plot(df.index, df['pressure_24h'], color='tab:green', label='24h avg')
ax_pressure.set_xlabel('Date')
ax_pressure.set_ylabel('Pressure (hPa)')
# NOTE(review): "2465m" looks like a station elevation carried over from
# another dataset; confirm it applies to this station.
ax_pressure.set_title('Pressure Trend (2465m)')
ax_pressure.legend()

# 4. Wind speed distribution recomputed from the u and v components
wind_speed_from_uv = np.sqrt(df['u']**2 + df['v']**2)
ax_speed.hist(wind_speed_from_uv, bins=50, color='tab:orange', alpha=0.7, edgecolor='k')
ax_speed.set_xlabel('Wind Speed (m/s) — derived from u/v')
ax_speed.set_ylabel('Frequency')
ax_speed.set_title('Wind Speed Distribution (from u/v components)')

plt.tight_layout()
plt.show()

In [None]:
# Preview the first five rows of the frame as it stands before export.
df.head(n=5)

In [None]:
# ===========================
# 6-d. EXPORT 3-HOURLY CLEAN CSV
# ===========================
# Writes `df` as it stands at this point (datetime index plus every column,
# including the derived ones added in earlier cells) to the cleaned-data folder.
# `os` is imported in the notebook's first cell.
out_dir = '../../data/cleaned/'
os.makedirs(out_dir, exist_ok=True)
output_path = os.path.join(out_dir, 'bandarAbas_multi_var_wind_3hourly_cleaned.csv')
df.to_csv(output_path)

# Report the path that was actually written (the previous message referenced
# an undefined name and raised NameError after the file had been saved).
print("✅ Multi-variate 3hourly clean file saved to:", output_path)
print("Shape written:", df.shape)