In [5]:
!pip install folium
!pip install xgboost
!pip install geopandas

Collecting folium
  Downloading folium-0.19.4-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.8.1-py3-none-any.whl.metadata (1.5 kB)
Downloading folium-0.19.4-py2.py3-none-any.whl (110 kB)
Downloading branca-0.8.1-py3-none-any.whl (26 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.8.1 folium-0.19.4
Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 8.5 MB/s eta 0:00:15
    --------------------------------------- 1.8/124.9 MB 6.3 MB/s eta 0:00:20
    --------------------------------------- 2.9/124.9 MB 5.6 MB/s eta 0:00:22
   - -------------------------------------- 4.2/124.9 MB 5.7 MB/s eta 0:00:22
   - -------------------------------------- 5.8/124.9 MB 6.1 MB/s eta 0:00:20
   

In [6]:
!pip install --upgrade geopandas



In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import geopandas as gpd
from shapely.geometry import Point
import warnings
import os

warnings.filterwarnings('ignore')
df = pd.read_csv('GlobalWeatherRepository.csv', parse_dates=['last_updated'])
print("Initial Data Shape:", df.shape)

imputer = SimpleImputer(strategy='median')
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = imputer.fit_transform(df[num_cols])

Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print("Cleaned Data Shape:", df.shape)

plt.figure(figsize=(12,6))
sns.lineplot(x='last_updated', y='temperature_celsius', data=df, hue='country')
plt.title('Temperature Trends Across Countries')
plt.savefig('temperature_trends.png')
plt.close()

plt.figure(figsize=(10,6))
sns.heatmap(df[['temperature_celsius', 'precip_mm', 'humidity', 'wind_kph']].corr(), annot=True)
plt.title('Feature Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

iso_forest = IsolationForest(contamination=0.05)
anomalies = iso_forest.fit_predict(df[num_cols])
df['anomaly'] = anomalies
anomaly_data = df[df['anomaly'] == -1]
print(f"Detected {len(anomaly_data)} anomalies")

ts_data = df.set_index('last_updated')[['temperature_celsius']].resample('D').mean().ffill()

arima_model = ARIMA(ts_data, order=(5,1,0))
arima_results = arima_model.fit()
arima_pred = arima_results.get_forecast(steps=30).predicted_mean

sarimax_model = SARIMAX(ts_data,
                        order=(5,1,0),
                        seasonal_order=(1,1,1,7),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
sarimax_results = sarimax_model.fit(disp=False)
sarimax_pred = sarimax_results.get_forecast(steps=30).predicted_mean

tscv = TimeSeriesSplit(n_splits=5)
arima_scores = []
sarimax_scores = []

for train_index, test_index in tscv.split(ts_data):
    train, test = ts_data.iloc[train_index], ts_data.iloc[test_index]
    arima_model = ARIMA(train, order=(5,1,0)).fit()
    arima_pred = arima_model.forecast(steps=len(test))
    arima_scores.append(mean_absolute_error(test, arima_pred))
    sarimax_model = SARIMAX(train,order=(5,1,0),seasonal_order=(1,1,1,7)).fit(disp=False)
    sarimax_pred = sarimax_model.get_forecast(steps=len(test)).predicted_mean
    sarimax_scores.append(mean_absolute_error(test, sarimax_pred))

print(f"ARIMA MAE: {np.mean(arima_scores):.2f}")
print(f"SARIMAX MAE: {np.mean(sarimax_scores):.2f}")

df['year'] = df['last_updated'].dt.year
climate_trends = df.groupby(['country', 'year'])[['temperature_celsius']].mean().unstack()
climate_trends.T.plot(figsize=(12,6))
plt.title('Long-term Climate Trends')
plt.savefig('climate_trends.png')
plt.close()

air_quality_corr = df[['air_quality_Carbon_Monoxide', 'air_quality_Nitrogen_dioxide', 'temperature_celsius', 'humidity']].select_dtypes(include=np.number).corr()
sns.heatmap(air_quality_corr, annot=True)
plt.title('Air Quality & Weather Correlations')
plt.savefig('air_quality_corr.png')
plt.close()

model = xgb.XGBRegressor()
X = df.drop(['temperature_celsius', 'last_updated', 'anomaly'], axis=1).select_dtypes(include=np.number)
y = df['temperature_celsius']
model.fit(X, y)

feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_}).sort_values('importance', ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Important Features for Temperature Prediction')
plt.savefig('feature_importance.png')
plt.close()

geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
geo_df = gpd.GeoDataFrame(df, geometry=geometry)
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
world = gpd.read_file(r'C:/Users/bhave/ne_110m_admin_0_countries.shp')
base = world.plot(color='white', edgecolor='black', figsize=(15, 10))
geo_df.plot(ax=base, marker='o', column='temperature_celsius', legend=True, markersize=10, cmap='coolwarm')
plt.title('Global Temperature Distribution')
plt.savefig('global_temperature_map.png')
plt.close()

print("Analysis Complete! Check saved visualizations.")

Initial Data Shape: (51060, 41)
Cleaned Data Shape: (21605, 41)
Detected 1081 anomalies
ARIMA MAE: 0.23
SARIMAX MAE: 0.31
Analysis Complete! Check saved visualizations.
