# Title: Best Practices and Reproducibility in Atmospheric Data Analysis

Description: Hands-on notebook to practice reproducible workflows for atmospheric data analysis.


In [None]:
# =======================
# 1. Import Libraries
# =======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

In [None]:
# =======================
# 2. Set Random Seed
# =======================
# Pin the state of NumPy's global random generator so that the np.random.*
# draws made after this cell produce the same numbers on every run.
np.random.seed(seed=42)

In [None]:
# =======================
# 3. Define Folder Structure
# =======================
# Output locations used by the notebook: raw and processed data, plus plots
# and tables under results.
folders = [
    "data/raw",
    "data/processed",
    "results/plots",
    "results/tables",
]

# exist_ok=True makes this cell safe to re-run: existing folders are kept.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

print(f"Folder structure created: {folders}")

In [None]:
# =======================
# 4. Load Raw Data (Example CSV)
# =======================
# For practice, we'll create synthetic ozone data instead of reading a file.
# NOTE: the draws below use NumPy's global random state, so the values are
# only reproducible when the seed cell (section 2) was run right before this
# cell, with no other random draws in between.
n_samples = 200
temperature = np.random.normal(25, 3, n_samples)   # normal: mean 25, std 3
humidity = np.random.uniform(40, 80, n_samples)    # uniform on [40, 80)
no2 = np.random.normal(30, 5, n_samples)           # normal: mean 30, std 5
# Ozone is a linear combination of the three drivers plus Gaussian noise
# (std 2). Nothing constrains the result to be positive.
ozone = 0.4*temperature - 0.3*humidity + 0.5*no2 + np.random.normal(0, 2, n_samples)

df_raw = pd.DataFrame({
    'Temperature': temperature,
    'Humidity': humidity,
    'NO2': no2,
    'Ozone': ozone
})

# Save raw data.
# to_csv does not create missing directories, so make sure the target folder
# exists; this keeps the cell runnable even if the folder-setup cell was
# skipped (e.g. after a kernel restart).
raw_file_path = "data/raw/ozone_data.csv"
os.makedirs(os.path.dirname(raw_file_path), exist_ok=True)
df_raw.to_csv(raw_file_path, index=False)
print(f"Raw data saved to {raw_file_path}")

In [None]:
# =======================
# 5. Preprocessing Pipeline
# =======================
# Drop missing values (if any). The explicit .copy() makes df_processed an
# independent DataFrame, so adding a column below cannot raise pandas'
# SettingWithCopyWarning and never writes back into df_raw.
df_processed = df_raw.dropna().copy()

# Log-transform Ozone (example preprocessing).
# The logarithm is only defined for strictly positive values, and the Ozone
# column is not guaranteed to be positive. Non-positive entries are therefore
# masked explicitly (Ozone_log = NaN) and reported, instead of letting np.log
# emit a RuntimeWarning and silently produce NaN / -inf.
ozone_is_positive = df_processed['Ozone'] > 0
n_non_positive = int((~ozone_is_positive).sum())
if n_non_positive:
    print(f"Warning: {n_non_positive} row(s) have non-positive Ozone; "
          "Ozone_log is set to NaN for them")
df_processed['Ozone_log'] = np.log(df_processed['Ozone'].where(ozone_is_positive))

# Save processed data (rows with NaN in Ozone_log are kept so the other
# columns remain available; NaN is written as an empty field).
processed_file_path = "data/processed/ozone_data_processed.csv"
df_processed.to_csv(processed_file_path, index=False)
print(f"Processed data saved to {processed_file_path}")

In [None]:

# =======================
# 6. Basic Analysis and Visualization
# =======================
# Summary statistics of the log-transformed ozone column.
log_ozone = df_processed['Ozone_log']
mean_ozone = log_ozone.mean()
std_ozone = log_ozone.std()
print(f"Mean log(Ozone): {mean_ozone:.3f}, Std: {std_ozone:.3f}")

# Histogram, drawn through Matplotlib's object-oriented interface.
plot_file_path = "results/plots/histogram_log_ozone.png"
fig, ax = plt.subplots(figsize=(7, 5))
ax.hist(log_ozone, bins=20, color='skyblue', edgecolor='black')
ax.set_xlabel('Log(Ozone)')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Log-transformed Ozone')

# Write the image to disk before showing it.
fig.savefig(plot_file_path)
plt.show()
print(f"Plot saved to {plot_file_path}")

In [None]:
# =======================
# 7. Metadata Storage
# =======================
# Unit recorded for each data column.
variable_units = {
    "Temperature": "deg C",
    "Humidity": "%",
    "NO2": "ppb",
    "Ozone": "ppb",
}

# Preprocessing steps, in the order they were applied.
preprocessing_steps = [
    "Dropped missing values",
    "Log-transform Ozone",
]

# Everything needed to trace the outputs back to their inputs and settings.
metadata = {
    "raw_data_file": raw_file_path,
    "processed_data_file": processed_file_path,
    "plot_file": plot_file_path,
    "variables": variable_units,
    "preprocessing": preprocessing_steps,
    "random_seed": 42,
}

# Serialize to human-readable JSON (4-space indent) and write it out.
metadata_file_path = "results/tables/metadata.json"
with open(metadata_file_path, "w") as f:
    f.write(json.dumps(metadata, indent=4))
print(f"Metadata saved to {metadata_file_path}")

In [None]:
# =======================
# 8. Summary of Reproducible Workflow
# =======================
# Print a recap of the workflow, one bullet per line. The empty first and
# last arguments reproduce the blank lines around the summary.
print(
    "",
    "Summary:",
    "- Data loaded from raw CSV and stored in 'data/raw'.",
    "- Preprocessing applied: drop missing values, log-transform Ozone.",
    "- Processed data saved in 'data/processed'.",
    "- Histogram plot saved in 'results/plots'.",
    "- Metadata recorded in 'results/tables/metadata.json' for reproducibility.",
    "- Folder structure, file naming, and random seed ensure reproducibility.",
    "",
    sep="\n",
)