In [116]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler

In [117]:
#data loading
url = "/content/chlimate_history.csv"
data = pd.read_csv(url)
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,Partly cloudy throughout the day.


In [118]:
data.isnull().sum()

Unnamed: 0,0
Formatted Date,0
Summary,0
Precip Type,517
Temperature (C),0
Apparent Temperature (C),0
Humidity,0
Wind Speed (km/h),0
Wind Bearing (degrees),0
Visibility (km),0
Loud Cover,0


In [119]:
#duplication
duplicate_rows=data[data.duplicated()]
print(duplicate_rows.head())
data_cleaned=data.drop_duplicates()
print(data_cleaned.head())

                      Formatted Date Summary Precip Type  Temperature (C)  \
36072  2010-08-02 00:00:00.000 +0200   Clear        rain        18.800000   
36073  2010-08-02 01:00:00.000 +0200   Clear        rain        18.222222   
36074  2010-08-02 02:00:00.000 +0200   Clear        rain        18.072222   
36075  2010-08-02 03:00:00.000 +0200   Clear        rain        16.622222   
36076  2010-08-02 04:00:00.000 +0200   Clear        rain        16.094444   

       Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
36072                 18.800000      0.93             6.2790   
36073                 18.222222      0.97             6.2790   
36074                 18.072222      0.98            11.2700   
36075                 16.622222      0.99             6.4400   
36076                 16.094444      0.99             3.0751   

       Wind Bearing (degrees)  Visibility (km)  Loud Cover  \
36072                     270          14.9086           0   
36073                     29

In [120]:
# Label encode categorical columns
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data_cleaned.loc[:, 'Precip Type']   = le.fit_transform(data_cleaned['Precip Type'].astype(str))
data_cleaned.loc[:, 'Daily Summary'] = le.fit_transform(data_cleaned['Daily Summary'].astype(str))
data_cleaned.loc[:, 'Summary']       = le.fit_transform(data_cleaned['Summary'].astype(str))

data_cleaned.loc[:, 'Formatted Date'] = pd.to_datetime(data_cleaned['Formatted Date'], utc=True)


In [121]:

data_cleaned["Precip Type"]=data_cleaned["Precip Type"].fillna(data_cleaned["Precip Type"].mean())
data_cleaned.dtypes

  data_cleaned["Precip Type"]=data_cleaned["Precip Type"].fillna(data_cleaned["Precip Type"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned["Precip Type"]=data_cleaned["Precip Type"].fillna(data_cleaned["Precip Type"].mean())


Unnamed: 0,0
Formatted Date,object
Summary,object
Precip Type,int64
Temperature (C),float64
Apparent Temperature (C),float64
Humidity,float64
Wind Speed (km/h),float64
Wind Bearing (degrees),int64
Visibility (km),float64
Loud Cover,int64


In [122]:
#remove outlayers(z_score)
from scipy.stats import zscore

data_cleaned.reset_index(drop=True, inplace=True)  # Reset index before calculating z-score

numeric_cols=data_cleaned.select_dtypes(include=[np.number]).columns
z_score=zscore(data_cleaned[numeric_cols])
z_score_df=pd.DataFrame(z_score,columns=numeric_cols)
outliers=(np.abs(z_score_df)>3).any(axis=1)
data_cleaned=data_cleaned[~outliers]
data_cleaned.reset_index(drop=True,inplace=True)

print(data_cleaned.head())
print("Original shape:",data.shape)
print("Cleaned shape:",data_cleaned.shape)

              Formatted Date Summary  Precip Type  Temperature (C)  \
0  2006-03-31 22:00:00+00:00      19            1         9.472222   
1  2006-03-31 23:00:00+00:00      19            1         9.355556   
2  2006-04-01 00:00:00+00:00      17            1         9.377778   
3  2006-04-01 01:00:00+00:00      19            1         8.288889   
4  2006-04-01 02:00:00+00:00      17            1         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                     251          15.8263           0               1015.13   
1                     259          15.8263           0    

In [123]:
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

X = data_cleaned.drop(columns=['Daily Summary', 'Formatted Date'])
y = data_cleaned['Daily Summary']

mi_scores = mutual_info_regression(X, y)
selected_data = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})
selected_data = selected_data.sort_values('MI Score', ascending=False)

print(selected_data.head(10))

                    Feature  MI Score
0                   Summary  0.478124
7           Visibility (km)  0.358178
2           Temperature (C)  0.307883
3  Apparent Temperature (C)  0.272411
9      Pressure (millibars)  0.215796
4                  Humidity  0.213040
5         Wind Speed (km/h)  0.090260
6    Wind Bearing (degrees)  0.089479
1               Precip Type  0.068048
8                Loud Cover  0.000000


In [124]:
data_cleaned['Summary'] = le.fit_transform(data_cleaned['Summary'])

In [125]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Label encode categorical features
le = LabelEncoder()
for col in ['Precip Type', 'Daily Summary', 'Summary']:
    if col in data.columns:
        data[col] = le.fit_transform(data[col].astype(str))

# Select only numeric columns
selected_data = data.select_dtypes(include=['int64', 'float64'])

# Scale
scaler = StandardScaler()
new_data_array = scaler.fit_transform(selected_data)
new_data = pd.DataFrame(new_data_array, columns=selected_data.columns)

print(new_data.head())


    Summary  Precip Type  Temperature (C)  Apparent Temperature (C)  Humidity  \
0  0.686460    -0.325812        -0.257599                 -0.324035  0.793470   
1  0.686460    -0.325812        -0.269814                 -0.339097  0.639996   
2  0.227899    -0.325812        -0.267487                 -0.138102  0.793470   
3  0.686460    -0.325812        -0.381489                 -0.459071  0.486521   
4  0.227899    -0.325812        -0.332631                 -0.362469  0.486521   

   Wind Speed (km/h)  Wind Bearing (degrees)  Visibility (km)  Loud Cover  \
0           0.478635                0.591256         1.306976         0.0   
1           0.499594                0.665756         1.306976         0.0   
2          -0.995473                0.153570         1.099586         0.0   
3           0.476306                0.758881         1.306976         0.0   
4           0.033841                0.665756         1.306976         0.0   

   Pressure (millibars)  Daily Summary  
0        

In [126]:
import numpy as np

def create_sequences(data, target_col, seq_length):

    X, y = [], []

    target = new_data['Daily Summary'].values
    features = new_data.drop(columns=['Daily Summary']).values

    for i in range(len(new_data) - seq_length):
        X.append(features[i:i+seq_length])
        y.append(target[i+seq_length])

    return np.array(X), np.array(y)
seq_length = 5

In [127]:
X_np, y_np = create_sequences(new_data, target_col='Daily Summary', seq_length=seq_length)

# Convert to tensors
X = torch.tensor(X_np, dtype=torch.float32)  # (samples, seq_length, num_features)
y = torch.tensor(y_np, dtype=torch.float32)  # (samples,)

# Reshape y if needed (for regression loss)
y = y.view(-1, 1)  # (samples, 1)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: torch.Size([96448, 5, 10])
y shape: torch.Size([96448, 1])


In [128]:
# model
class RNNModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=10, output_size=1):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)  # RNN processes sequence
        out = out[:, -1, :]   # take output at last timestep
        out = self.fc(out)    # map hidden → prediction
        return out

In [132]:
# create a object for model
model = RNNModel(input_size=10, hidden_size=10, output_size=1)

In [133]:
# loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [134]:
epochs = 200
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 20 == 0:  # fixed: epoch+1
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [20/200], Loss: 0.7155
Epoch [40/200], Loss: 0.6810
Epoch [60/200], Loss: 0.6644
Epoch [80/200], Loss: 0.6515
Epoch [100/200], Loss: 0.6392
Epoch [120/200], Loss: 0.6227
Epoch [140/200], Loss: 0.5980
Epoch [160/200], Loss: 0.5902
Epoch [180/200], Loss: 0.5863
Epoch [200/200], Loss: 0.5840


In [137]:
with torch.no_grad():
    # take the last row (last day’s features) from your dataset
    test_input = torch.tensor(new_data.drop(columns=["Daily Summary"]).values[-1],
                              dtype=torch.float32).view(1, 1, -1)  # (1,1,9)

    prediction = model(test_input)   # model output
    print("\nPredicted Daily Summary:", prediction.item())



Predicted Daily Summary: 0.7436408996582031
