In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler

In [29]:
#data loading
data = pd.read_csv("/content/chlimate_history.csv")
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,Partly cloudy throughout the day.


In [3]:
data.isnull().sum()

Unnamed: 0,0
Formatted Date,0
Summary,0
Precip Type,517
Temperature (C),0
Apparent Temperature (C),0
Humidity,0
Wind Speed (km/h),0
Wind Bearing (degrees),0
Visibility (km),0
Loud Cover,0


In [4]:
#duplication
duplicate_rows=data[data.duplicated()]
print(duplicate_rows.head())
data_cleaned=data.drop_duplicates()
print(data_cleaned.head())

                      Formatted Date Summary Precip Type  Temperature (C)  \
36072  2010-08-02 00:00:00.000 +0200   Clear        rain        18.800000   
36073  2010-08-02 01:00:00.000 +0200   Clear        rain        18.222222   
36074  2010-08-02 02:00:00.000 +0200   Clear        rain        18.072222   
36075  2010-08-02 03:00:00.000 +0200   Clear        rain        16.622222   
36076  2010-08-02 04:00:00.000 +0200   Clear        rain        16.094444   

       Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
36072                 18.800000      0.93             6.2790   
36073                 18.222222      0.97             6.2790   
36074                 18.072222      0.98            11.2700   
36075                 16.622222      0.99             6.4400   
36076                 16.094444      0.99             3.0751   

       Wind Bearing (degrees)  Visibility (km)  Loud Cover  \
36072                     270          14.9086           0   
36073                     29

In [5]:
7# Label encode categorical columns
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_cleaned.loc[:, 'Precip Type']   = le.fit_transform(data_cleaned['Precip Type'].astype(str))
data_cleaned.loc[:, 'Daily Summary'] = le.fit_transform(data_cleaned['Daily Summary'].astype(str))
data_cleaned.loc[:, 'Summary']       = le.fit_transform(data_cleaned['Summary'].astype(str))

data_cleaned.loc[:, 'Formatted Date'] = pd.to_datetime(data_cleaned['Formatted Date'], utc=True)
data_cleaned

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-03-31 22:00:00+00:00,19,1,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,197
1,2006-03-31 23:00:00+00:00,19,1,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,197
2,2006-04-01 00:00:00+00:00,17,1,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,197
3,2006-04-01 01:00:00+00:00,19,1,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,197
4,2006-04-01 02:00:00+00:00,17,1,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,197
...,...,...,...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 17:00:00+00:00,19,1,26.016667,26.016667,0.43,10.9963,31,16.1000,0,1014.36,170
96449,2016-09-09 18:00:00+00:00,19,1,24.583333,24.583333,0.48,10.0947,20,15.5526,0,1015.16,170
96450,2016-09-09 19:00:00+00:00,19,1,22.038889,22.038889,0.56,8.9838,30,16.1000,0,1015.66,170
96451,2016-09-09 20:00:00+00:00,19,1,21.522222,21.522222,0.60,10.5294,20,16.1000,0,1015.95,170


In [17]:
data["Precip Type"] = data["Precip Type"].fillna(data["Precip Type"].mode()[0])


In [18]:
#remove outlayers(z_score)
from scipy.stats import zscore

data_cleaned.reset_index(drop=True, inplace=True)  # Reset index before calculating z-score

numeric_cols=data_cleaned.select_dtypes(include=[np.number]).columns
z_score=zscore(data_cleaned[numeric_cols])
z_score_df=pd.DataFrame(z_score,columns=numeric_cols)
outliers=(np.abs(z_score_df)>3).any(axis=1)
data_cleaned=data_cleaned[~outliers]
data_cleaned.reset_index(drop=True,inplace=True)

print(data_cleaned.head())
print("Original shape:",data.shape)
print("Cleaned shape:",data_cleaned.shape)

              Formatted Date Summary Precip Type  Temperature (C)  \
0  2006-03-31 22:00:00+00:00      19           1         9.472222   
1  2006-03-31 23:00:00+00:00      19           1         9.355556   
2  2006-04-01 00:00:00+00:00      17           1         9.377778   
3  2006-04-01 01:00:00+00:00      19           1         8.288889   
4  2006-04-01 02:00:00+00:00      17           1         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                     251          15.8263           0               1015.13   
1                     259          15.8263           0          

In [19]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.impute import SimpleImputer

# Features and target
X = data_cleaned.drop(columns=['Daily Summary', 'Formatted Date'])
y = data_cleaned['Daily Summary']   # already encoded

# Impute missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Select top k features (set k as needed)
k = 8
selector = SelectKBest(score_func=mutual_info_regression, k=k)
X_kbest = selector.fit_transform(X_imputed, y)

# Replace X with only selected features
X_new = X.iloc[:, selector.get_support()]
print(X_new.head())

  Summary  Temperature (C)  Apparent Temperature (C)  Humidity  \
0      19         9.472222                  7.388889      0.89   
1      19         9.355556                  7.227778      0.86   
2      17         9.377778                  9.377778      0.89   
3      19         8.288889                  5.944444      0.83   
4      17         8.755556                  6.977778      0.83   

   Wind Speed (km/h)  Wind Bearing (degrees)  Visibility (km)  \
0            14.1197                     251          15.8263   
1            14.2646                     259          15.8263   
2             3.9284                     204          14.9569   
3            14.1036                     269          15.8263   
4            11.0446                     259          15.8263   

   Pressure (millibars)  
0               1015.13  
1               1015.63  
2               1015.94  
3               1016.41  
4               1016.51  


In [20]:
from sklearn.preprocessing import StandardScaler

# Separate features and target
X_n = X_new
y_n = y  # already encoded

# Create two scalers
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Scale features and target separately
X_scaled = scaler_X.fit_transform(X_new)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)) # Reshape y here

In [21]:
def create_sequences(features, target, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(features) - seq_length):
        X_seq.append(features[i:i+seq_length])
        y_seq.append(target[i+seq_length])
    return np.array(X_seq), np.array(y_seq)

# Generate sequences
seq_length = 5
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)

# Print separately
print("X_seq (features sequences):")
print(X_seq[:2])   # show first 2 sequences
print("Shape:", X_seq.shape)

print("\nY_seq (target sequences):")
print(y_seq[:10])  # show first 10 target values
print("Shape:", y_seq.shape)

X_seq (features sequences):
[[[ 0.68011843 -0.27333169 -0.34440125  0.78848752  0.60968198
    0.5991347   1.29601145 -0.24378157]
  [ 0.68011843 -0.28557915 -0.35955426  0.63497709  0.63320448
    0.67364644  1.29601145 -0.17636871]
  [ 0.18504476 -0.2832463  -0.15733989  0.78848752 -1.0447342
    0.16137823  1.0888194  -0.13457275]
  [ 0.68011843 -0.39755596 -0.48025586  0.48146666  0.60706836
    0.76678611  1.29601145 -0.07120466]
  [ 0.18504476 -0.34856611 -0.38306756  0.48146666  0.11048215
    0.67364644  1.29601145 -0.05772209]]

 [[ 0.68011843 -0.28557915 -0.35955426  0.63497709  0.63320448
    0.67364644  1.29601145 -0.17636871]
  [ 0.18504476 -0.2832463  -0.15733989  0.78848752 -1.0447342
    0.16137823  1.0888194  -0.13457275]
  [ 0.68011843 -0.39755596 -0.48025586  0.48146666  0.60706836
    0.76678611  1.29601145 -0.07120466]
  [ 0.18504476 -0.34856611 -0.38306756  0.48146666  0.11048215
    0.67364644  1.29601145 -0.05772209]
  [ 0.68011843 -0.29957625 -0.37052714  0.583

In [22]:
# Convert to tensors
X = torch.tensor(X_seq, dtype=torch.float32)  # (samples, seq_length, num_features)
y = torch.tensor(y_seq, dtype=torch.float32)  # (samples,)

# Reshape y if needed (for regression loss)
y = y.view(-1, 1)  # (samples, 1)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: torch.Size([92740, 5, 8])
y shape: torch.Size([92740, 1])


In [23]:
# model
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=50, num_layers=1, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # h0 and c0 are initialized automatically if not provided
        out, (hn, cn) = self.lstm(x)   # out: (batch, seq_len, hidden_size)
        out = self.fc(out[:, -1, :])   # use last timestep
        return out


In [24]:
# create a object for model
model = LSTMModel(input_size=8, hidden_size=50, output_size=1)

In [25]:
# loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
epochs = 200
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [20/200], Loss: 0.7881
Epoch [40/200], Loss: 0.6917
Epoch [60/200], Loss: 0.6610
Epoch [80/200], Loss: 0.6486
Epoch [100/200], Loss: 0.6405
Epoch [120/200], Loss: 0.6339
Epoch [140/200], Loss: 0.6269
Epoch [160/200], Loss: 0.6175
Epoch [180/200], Loss: 0.6040
Epoch [200/200], Loss: 0.5895


In [32]:
with torch.no_grad():
    # take the last sequence
    test_input = torch.tensor(
        X_scaled[-seq_length:], dtype=torch.float32
    ).view(1, seq_length, -1)

    # model prediction (scaled output)
    prediction = model(test_input).item()

    # step 1: inverse scale with scaler_y
    prediction = scaler_y.inverse_transform([[prediction]])[0][0]

    # step 2: round to nearest integer index
    pred_idx = int(round(prediction))

    # step 3: clamp to valid range of classes
    pred_idx = min(max(pred_idx, 0), len(le.classes_) - 1)

    # step 4: decode to original label
    predicted_label = le.inverse_transform([pred_idx])

    print("\nPredicted Daily Summary:", predicted_label[0])



Predicted Daily Summary: Windy and Partly Cloudy
