In [1]:
import os
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the weather observations from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='weather_data.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354
1,San Diego,2024-05-17 15:22:10,8.73414,58.319107,9.111623,27.715161
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.18372,26.367303
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622


In [3]:
# Split the 'Date_Time' column into Year, Month, Day, Hour, Minute and Second columns.

# Parse the column to datetimes first so the .dt accessor can be used on it.
df['Date_Time'] = pd.to_datetime(df['Date_Time'])

# Each new column is named after the capitalised .dt attribute it is read from
# ('year' -> 'Year', ..., 'second' -> 'Second') and is added in this order.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part.capitalize()] = getattr(df['Date_Time'].dt, part)

df.head()

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,Year,Month,Day,Hour,Minute,Second
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354,2024,1,14,21,12,46
1,San Diego,2024-05-17 15:22:10,8.73414,58.319107,9.111623,27.715161,2024,5,17,15,22,10
2,San Diego,2024-05-11 09:30:59,11.632436,38.820175,4.607511,28.732951,2024,5,11,9,30,59
3,Philadelphia,2024-02-26 17:32:39,-8.628976,54.074474,3.18372,26.367303,2024,2,26,17,32,39
4,San Antonio,2024-04-29 13:23:51,39.808213,72.899908,9.598282,29.898622,2024,4,29,13,23,51


In [4]:
# The timestamp has already been split into the Year ... Second columns, so the
# original column is removed. `df` is rebound to the result instead of using
# inplace=True, which matches how the Hour/Minute/Second columns are dropped
# in the cyclical-encoding cell.
df = df.drop(columns=['Date_Time'])

df.head()

Unnamed: 0,Location,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,Year,Month,Day,Hour,Minute,Second
0,San Diego,10.683001,41.195754,4.020119,8.23354,2024,1,14,21,12,46
1,San Diego,8.73414,58.319107,9.111623,27.715161,2024,5,17,15,22,10
2,San Diego,11.632436,38.820175,4.607511,28.732951,2024,5,11,9,30,59
3,Philadelphia,-8.628976,54.074474,3.18372,26.367303,2024,2,26,17,32,39
4,San Antonio,39.808213,72.899908,9.598282,29.898622,2024,4,29,13,23,51


In [5]:
# Number of missing (NaN/None) entries in each column.
df.isnull().sum()

Location            0
Temperature_C       0
Humidity_pct        0
Precipitation_mm    0
Wind_Speed_kmh      0
Year                0
Month               0
Day                 0
Hour                0
Minute              0
Second              0
dtype: int64

In [6]:
# Number of entries equal to 0 in each column. In the recorded output only the
# Hour, Minute and Second columns contain zeros.
df.eq(0).sum()

Location                0
Temperature_C           0
Humidity_pct            0
Precipitation_mm        0
Wind_Speed_kmh          0
Year                    0
Month                   0
Day                     0
Hour                41769
Minute              16616
Second              16519
dtype: int64

In [7]:
# Cyclical Encoding for Hour, Minute, Second

# Each clock field is converted to an angle over its period (24 hours,
# 60 minutes, 60 seconds) and stored as a sine/cosine pair, so the last value
# of a cycle ends up next to the first one.
for unit, period in (('Hour', 24), ('Minute', 60), ('Second', 60)):
    angle = 2 * np.pi * df[unit] / period
    df[f'{unit}_sin'] = np.sin(angle)
    df[f'{unit}_cos'] = np.cos(angle)

# The raw fields are replaced by their sine/cosine pairs.
df = df.drop(columns=['Hour', 'Minute', 'Second'])

In [8]:
# Column names as they are before the reordering below.
cols = df.columns.tolist()

# Target layout: location, calendar fields, cyclical clock features, weather
# measurements, and Temperature_C in the last position.
new_order = [
    'Location',
    'Year', 'Month', 'Day',
    'Hour_sin', 'Hour_cos',
    'Minute_sin', 'Minute_cos',
    'Second_sin', 'Second_cos',
    'Humidity_pct', 'Precipitation_mm', 'Wind_Speed_kmh',
    'Temperature_C',
]
df = df.loc[:, new_order]

df.head()

Unnamed: 0,Location,Year,Month,Day,Hour_sin,Hour_cos,Minute_sin,Minute_cos,Second_sin,Second_cos,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,Temperature_C
0,San Diego,2024,1,14,-0.707107,0.707107,0.9510565,0.309017,-0.994522,0.104528,41.195754,4.020119,8.23354,10.683001
1,San Diego,2024,5,17,-0.707107,-0.707107,0.7431448,-0.669131,0.866025,0.5,58.319107,9.111623,27.715161,8.73414
2,San Diego,2024,5,11,0.707107,-0.707107,5.665539e-16,-1.0,-0.104528,0.994522,38.820175,4.607511,28.732951,11.632436
3,Philadelphia,2024,2,26,-0.965926,-0.258819,-0.2079117,-0.978148,-0.809017,-0.587785,54.074474,3.18372,26.367303,-8.628976
4,San Antonio,2024,4,29,-0.258819,-0.965926,0.6691306,-0.743145,-0.809017,0.587785,72.899908,9.598282,29.898622,39.808213


In [9]:
# Feature matrix: every column except the last one.
# Target vector: the last column (Temperature_C after the reordering above).
# Both are taken out of the DataFrame as NumPy arrays.
X, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

In [10]:
# Show the feature array; the first entry of each row is still the Location
# string at this point. NumPy abbreviates the printout with "...".
print(X)

[['San Diego' 2024 1 ... 41.195753566944475 4.02011871570867
  8.233540246873023]
 ['San Diego' 2024 5 ... 58.31910739552024 9.111623448229375
  27.71516125689249]
 ['San Diego' 2024 5 ... 38.82017526915946 4.607511377146035
  28.732951288236187]
 ...
 ['New York' 2024 4 ... 62.20188442965286 3.9875580386419296
  0.4039090307166959]
 ['Chicago' 2024 5 ... 63.7032451997448 4.294324709830695
  6.326035560339499]
 ['New York' 2024 4 ... 43.80458378329061 1.8832923531385992
  15.363828427249876]]


In [11]:
# Show the target array (values of the last DataFrame column).
print(y)

[10.68300109  8.73413978 11.63243631 ... 15.66446487 18.99999414
 10.72535108]


In [12]:
# One Hot Encoding for Location

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 0 of X holds 'Location' (the first name in the reordered column list).
# It is replaced by one indicator column per distinct value; all other columns
# are passed through unchanged and follow the indicator columns.
#
# sparse_threshold=0 makes the transformer always return a dense array. With
# the default threshold (0.3) a SciPy sparse matrix is returned when the
# stacked result is sparse enough, and np.array() does not densify a sparse
# matrix (it wraps it in a 0-d object array), which would break later cells.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [13]:
# Show the feature array after encoding: each row now starts with the 0/1
# indicator columns produced for Location.
print(X)

[[0.0 0.0 0.0 ... 41.195753566944475 4.02011871570867 8.233540246873023]
 [0.0 0.0 0.0 ... 58.31910739552024 9.111623448229375 27.71516125689249]
 [0.0 0.0 0.0 ... 38.82017526915946 4.607511377146035 28.732951288236187]
 ...
 [0.0 0.0 0.0 ... 62.20188442965286 3.9875580386419296 0.4039090307166959]
 [1.0 0.0 0.0 ... 63.7032451997448 4.294324709830695 6.326035560339499]
 [0.0 0.0 0.0 ... 43.80458378329061 1.8832923531385992 15.363828427249876]]


In [14]:
# Train Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [15]:
# Show the training features produced by the split.
print(X_train)

[[0.0 0.0 0.0 ... 50.73378486226551 3.47026945749082 1.3844616307194113]
 [0.0 0.0 0.0 ... 49.29038641910036 3.4477912920022713 20.13108542674613]
 [0.0 0.0 1.0 ... 42.560561801863315 8.588925954173007 23.570678856422788]
 ...
 [0.0 0.0 0.0 ... 63.65586833075509 0.8339297344692165 15.036071317479935]
 [0.0 0.0 1.0 ... 75.21774047155175 7.124453936823965 18.885591119006953]
 [0.0 0.0 0.0 ... 37.02064172097492 5.11268003335717 22.69804336447279]]


In [16]:
# Show the held-out test features produced by the split.
print(X_test)

[[1.0 0.0 0.0 ... 37.93230549581252 6.110468383563461 1.251755636729217]
 [0.0 0.0 0.0 ... 39.45551140514186 0.5111550671369725 29.184609387815208]
 [0.0 0.0 0.0 ... 44.65824742408648 1.829484479933028 25.06543722696259]
 ...
 [0.0 0.0 0.0 ... 85.81451582140424 4.225593787286509 3.972653028828072]
 [0.0 0.0 0.0 ... 57.69959580294685 2.031724211317626 8.980153539286022]
 [0.0 0.0 0.0 ... 60.043250643535345 2.162369283126715 20.94372271454007]]


In [17]:
# Show the training targets produced by the split.
print(y_train)

[33.94163963  6.18000445 34.97853983 ... 10.47014347 25.57357099
 24.22000906]


In [18]:
# Show the held-out test targets produced by the split.
print(y_test)

[33.94970111 33.4490377   4.63809316 ... 37.23948546 20.88551904
 -2.41527116]


In [19]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both the training and the test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)