# Title: Machine Learning and Statistical Modeling in Atmospheric Studies

Description: Hands-on Python notebook for predictive modeling of atmospheric variables using linear regression, random forests, SVM, and feature importance analysis.


In [None]:
# =======================
# 1. Import Required Libraries
# =======================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [None]:
# =======================
# 2. Generate Synthetic Atmospheric Data
# =======================
# Seed NumPy's global generator so every run reproduces the same dataset.
np.random.seed(42)
n_samples = 200

# The draw order (temperature, humidity, NO2, noise) determines the values
# obtained from the seeded generator, so it must not be rearranged.
temperature = np.random.normal(loc=25, scale=3, size=n_samples)
humidity = np.random.uniform(low=40, high=80, size=n_samples)
no2 = np.random.normal(loc=30, scale=5, size=n_samples)
noise = np.random.normal(loc=0, scale=2, size=n_samples)

# Ozone is a linear combination of the three predictors plus Gaussian noise.
ozone = 0.4*temperature - 0.3*humidity + 0.5*no2 + noise

# Collect predictors and target in a single table, one row per sample.
df = pd.DataFrame(
    {
        'Temperature': temperature,
        'Humidity': humidity,
        'NO2': no2,
        'Ozone': ozone,
    }
)
df.head()


In [None]:
# =======================
# 3. Prepare Data for Modeling
# =======================
# Predictor table (three columns) and the target column.
feature_cols = ['Temperature', 'Humidity', 'NO2']
X = df[feature_cols]
y = df['Ozone']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both subsets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# =======================
# 4. Linear Regression
# =======================
# Fit ordinary least squares on the scaled training features, then predict
# the held-out test rows.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_lr_pred = lr.predict(X_test_scaled)

# Report test-set error (RMSE) and explained variance (R²).
lr_rmse = np.sqrt(mean_squared_error(y_test, y_lr_pred))
lr_r2 = r2_score(y_test, y_lr_pred)
print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression R²:", lr_r2)

In [None]:
# =======================
# 5. Random Forest Regression
# =======================
# 100-tree forest with a fixed seed so the fitted model is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
y_rf_pred = rf.predict(X_test_scaled)

# Test-set metrics.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_rf_pred))
rf_r2 = r2_score(y_test, y_rf_pred)
print("Random Forest RMSE:", rf_rmse)
print("Random Forest R²:", rf_r2)

# 5-fold cross-validated R² on the training set.
scores = cross_val_score(
    rf, X_train_scaled, y_train, cv=5, scoring='r2'
)
print("Random Forest cross-validated R²:", scores)
print("Mean R²:", scores.mean())

In [None]:
# =======================
# 6. Support Vector Regression
# =======================
# RBF-kernel support vector regressor trained on the scaled features.
svr = SVR(kernel='rbf').fit(X_train_scaled, y_train)
y_svr_pred = svr.predict(X_test_scaled)

# Test-set metrics (the printed labels use "SVM" for this model).
svr_rmse = np.sqrt(mean_squared_error(y_test, y_svr_pred))
svr_r2 = r2_score(y_test, y_svr_pred)
print("SVM RMSE:", svr_rmse)
print("SVM R²:", svr_r2)


In [None]:
# =======================
# 7. Feature Importance from Random Forest
# =======================
# Importance scores exposed by the fitted forest, paired with the predictor
# column names in the same order as the columns of X.
importance = rf.feature_importances_
features = X.columns

# One bar per predictor.
fig, ax = plt.subplots(figsize=(7,5))
ax.bar(features, importance)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Random Forest Feature Importance')
plt.show()


In [None]:
# =======================
# 8. Visualize Observed vs Predicted Ozone
# =======================
# Compare the test-set predictions of all three fitted models (Random Forest,
# Linear Regression, Support Vector Regression) against the observed values.
plt.figure(figsize=(7,5))
plt.scatter(y_test, y_rf_pred, color='green', label='Random Forest')
plt.scatter(y_test, y_lr_pred, color='blue', label='Linear Regression', alpha=0.6)
# The SVR predictions are drawn as well so that every model trained in this
# notebook appears in the comparison; translucent markers keep overlapping
# points visible.
plt.scatter(y_test, y_svr_pred, color='orange', label='Support Vector Regression', alpha=0.6)
# 1:1 reference line spanning the observed range: points on this line are
# predicted exactly.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
plt.xlabel('Observed Ozone')
plt.ylabel('Predicted Ozone')
plt.title('Observed vs Predicted Ozone')
plt.legend()
plt.show()

In [None]:
# =======================
# 9. Summary
# =======================
# Recap of the workflow covered in this notebook. The text deliberately keeps
# its leading and trailing newline so the printed output is padded by blank
# lines.
summary_text = """
Summary:
- Prepared synthetic atmospheric dataset with Temperature, Humidity, NO2 as predictors and Ozone as target.
- Applied Linear Regression, Random Forest, and Support Vector Regression.
- Evaluated models using RMSE and R²; cross-validation for Random Forest.
- Visualized feature importance to identify key drivers of ozone prediction.
- Observed vs predicted plots allow visual assessment of model performance.
- Students can adapt this notebook to real atmospheric datasets for predictive modeling.
"""
print(summary_text)