In [66]:
import pandas as pd
import numpy as np

In [67]:
df = pd.read_csv('weather_data.csv')

In [68]:
df.head()

Unnamed: 0,time,temperature_2m,precipitation,relativehumidity_2m,windspeed_10m,winddirection_10m,surface_pressure,cloudcover,city
0,2026-01-17 00:00:00,7.4,0.1,91,9.7,129,1008.9,100,Edinburgh
1,2026-01-17 01:00:00,7.7,0.1,91,8.9,133,1009.0,100,Edinburgh
2,2026-01-17 02:00:00,7.9,0.1,91,7.9,133,1009.3,100,Edinburgh
3,2026-01-17 03:00:00,7.8,0.0,91,8.4,133,1008.7,100,Edinburgh
4,2026-01-17 04:00:00,7.8,0.0,91,7.4,133,1009.6,100,Edinburgh


In [69]:
df = df.rename(columns={'temperature_2m':'temperature','relativehumidity_2m': 'relative_humidity','windspeed_10m':'wind_speed','winddirection_10m':'wind_direction'})

In [70]:
df.shape

(174720, 9)

In [71]:
df.isna().sum()

time                 0
temperature          0
precipitation        0
relative_humidity    0
wind_speed           0
wind_direction       0
surface_pressure     0
cloudcover           0
city                 0
dtype: int64

In [72]:
df.duplicated().sum()

np.int64(0)

In [73]:
df.describe()

Unnamed: 0,temperature,precipitation,relative_humidity,wind_speed,wind_direction,surface_pressure,cloudcover
count,174720.0,174720.0,174720.0,174720.0,174720.0,174720.0,174720.0
mean,8.273214,0.100595,87.142857,10.002976,130.404762,1003.048214,93.714286
std,1.188363,0.247968,5.791313,4.015094,31.769206,11.431886,10.231367
min,4.6,0.0,70.0,2.7,59.0,986.5,51.0
25%,7.5,0.0,84.0,6.8,113.75,989.675,92.75
50%,8.3,0.0,88.0,9.05,133.5,1008.8,98.0
75%,8.9,0.1,91.0,13.25,151.25,1012.825,100.0
max,10.9,1.2,96.0,19.4,193.0,1017.8,100.0


In [74]:
#df['temp_1h'] = df['temperature'].shift(1)  
#df['temp_2h'] = df['temperature'].shift(2)  

In [75]:
df.head()

Unnamed: 0,time,temperature,precipitation,relative_humidity,wind_speed,wind_direction,surface_pressure,cloudcover,city
0,2026-01-17 00:00:00,7.4,0.1,91,9.7,129,1008.9,100,Edinburgh
1,2026-01-17 01:00:00,7.7,0.1,91,8.9,133,1009.0,100,Edinburgh
2,2026-01-17 02:00:00,7.9,0.1,91,7.9,133,1009.3,100,Edinburgh
3,2026-01-17 03:00:00,7.8,0.0,91,8.4,133,1008.7,100,Edinburgh
4,2026-01-17 04:00:00,7.8,0.0,91,7.4,133,1009.6,100,Edinburgh


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174720 entries, 0 to 174719
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   time               174720 non-null  object 
 1   temperature        174720 non-null  float64
 2   precipitation      174720 non-null  float64
 3   relative_humidity  174720 non-null  int64  
 4   wind_speed         174720 non-null  float64
 5   wind_direction     174720 non-null  int64  
 6   surface_pressure   174720 non-null  float64
 7   cloudcover         174720 non-null  int64  
 8   city               174720 non-null  object 
dtypes: float64(4), int64(3), object(2)
memory usage: 12.0+ MB


In [77]:
df['time']=pd.to_datetime(df['time'])

In [78]:
df['hour'] = df['time'].dt.hour
df['day'] = df['time'].dt.day
df['month'] = df['time'].dt.month

In [79]:
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day'] / 24)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 24)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 24)

In [80]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer


In [81]:
x = df.drop('precipitation', axis=1)
y = df['precipitation']

In [82]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,)

In [83]:
num_cols = x.select_dtypes(include=['float64','int64']).columns.tolist()
cat_cols = x.select_dtypes(include=['object']).columns.tolist()

In [84]:
preprocessor = ColumnTransformer([('num',StandardScaler(),num_cols),('cat',OneHotEncoder(),cat_cols)])
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)

In [85]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import recall_score, precision_score,accuracy_score, mean_squared_error, r2_score

In [86]:
x_train_transformed = x_train_transformed.toarray() if hasattr(x_train_transformed, "toarray") else x_train_transformed
x_test_transformed = x_test_transformed.toarray() if hasattr(x_test_transformed, "toarray") else x_test_transformed


In [87]:
if x_train_transformed.ndim == 1:
    x_train_transformed = x_train_transformed.reshape(-1, 1)
    x_test_transformed = x_test_transformed.reshape(-1, 1)

In [91]:
reg_model = LinearRegression()
reg_model.fit(x_train_transformed,y_train)
y_pred = reg_model.predict(x_test_transformed)
r2 = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
print(r2, mse)


0.2901193240851828 0.04407007593754834


In [96]:
rid_model = Ridge(alpha=10,tol=0.001,solver='auto')
rid_model.fit(x_train_transformed,y_train)
y_pred = rid_model.predict(x_test_transformed)
r2 = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
print(r2, mse)

0.29011941926819385 0.044070070028495355
