In [19]:
import seaborn as sns
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`;
# the conventional plotting alias is `import matplotlib.pyplot as plt` — confirm
# which one is intended before any plotting code relies on `plt`.
import matplotlib as plt
import pandas as pd
import numpy as np
import sklearn

In [20]:
# Read the raw weather observations from the CSV file; `dataset` keeps the
# parsed table and `df` is the working frame used by the following cells.
dataset = pd.read_csv("weatherAUS.csv")
df = pd.DataFrame(data=dataset)
print(df)

              Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
0       2008-12-01   Albury     13.4     22.9       0.6          NaN   
1       2008-12-02   Albury      7.4     25.1       0.0          NaN   
2       2008-12-03   Albury     12.9     25.7       0.0          NaN   
3       2008-12-04   Albury      9.2     28.0       0.0          NaN   
4       2008-12-05   Albury     17.5     32.3       1.0          NaN   
...            ...      ...      ...      ...       ...          ...   
145455  2017-06-21    Uluru      2.8     23.4       0.0          NaN   
145456  2017-06-22    Uluru      3.6     25.3       0.0          NaN   
145457  2017-06-23    Uluru      5.4     26.9       0.0          NaN   
145458  2017-06-24    Uluru      7.8     27.0       0.0          NaN   
145459  2017-06-25    Uluru     14.9      NaN       0.0          NaN   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  \
0            NaN           W           44.0          W  ... 

In [21]:
# Number of missing entries in each column.
df.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [22]:
# Share of missing values per column, as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)


Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

In [23]:
# 'Evaporation', 'Sunshine', 'Cloud9am' and 'Cloud3pm' are each missing in more
# than 35% of the rows, so they are dropped instead of being imputed.
df = df.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'])

# Remaining numeric columns: every gap is filled with the median of its own column.
# NOTE(review): the medians are taken over all rows, i.e. before the train/test
# split performed further down — confirm this is acceptable for the evaluation.
num_cols = [
    'MinTemp', 'MaxTemp', 'Rainfall',
    'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm',
    'WindGustSpeed',
]

# fillna with a Series fills each column with the value stored under its label.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [24]:
# Categorical columns: every gap is filled with the most frequent value of its column.
cat_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# DataFrame.mode() has one row per tied mode; row 0 holds the first mode of each column.
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


In [25]:
# Rows without a RainTomorrow value are removed rather than imputed.
df = df.dropna(axis=0, how='any', subset=['RainTomorrow'])

In [26]:
# Re-check the percentage of missing values per column after cleaning.
df.isna().sum().div(len(df)).mul(100)


Date             0.0
Location         0.0
MinTemp          0.0
MaxTemp          0.0
Rainfall         0.0
WindGustDir      0.0
WindGustSpeed    0.0
WindDir9am       0.0
WindDir3pm       0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Temp9am          0.0
Temp3pm          0.0
RainToday        0.0
RainTomorrow     0.0
dtype: float64

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# Integer-encode the categorical columns.
# A separate LabelEncoder is fitted for every column and stored in
# `label_encoders`, so each column's code -> label mapping stays available
# afterwards (e.g. label_encoders['Location'].inverse_transform(codes)).
# Re-fitting one shared encoder would keep only the mapping of the last column.
le = LabelEncoder()
encode_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
label_encoders = {}

for col in encode_cols:
    # Columns that are not present in the frame are skipped.
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last encoded column.

In [29]:
# Report the state of the frame after cleaning and encoding, then preview it.
print("Data preprocessing complete!")
print("Remaining missing values per column:\n", df.isna().sum())
print("\nDataset shape:", df.shape)
df.head(n=5)

Data preprocessing complete!
Remaining missing values per column:
 Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

Dataset shape: (142193, 19)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,2,13.4,22.9,0.6,13,44.0,13,14,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0
1,2008-12-02,2,7.4,25.1,0.0,14,44.0,6,15,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0
2,2008-12-03,2,12.9,25.7,0.0,15,46.0,13,15,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0
3,2008-12-04,2,9.2,28.0,0.0,4,24.0,9,0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0
4,2008-12-05,2,17.5,32.3,1.0,13,41.0,1,7,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0


In [31]:
# Parse the Date strings into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))

In [33]:
# Calendar features derived from Date. DayOfWeek follows the pandas convention
# (Monday=0 ... Sunday=6), so values 5 and 6 mark the weekend.
df = df.assign(
    Year=df['Date'].dt.year,
    Month=df['Date'].dt.month,
    Day=df['Date'].dt.day,
    DayOfWeek=df['Date'].dt.dayofweek,
    IsWeekend=df['Date'].dt.dayofweek.isin([5, 6]).astype(int),
)

def get_season(month):
    """Return the season name for a month number.

    December-February is labelled 'Summer', March-May 'Autumn' and
    June-August 'Winter' (Southern Hemisphere seasons). Every other value,
    including anything outside 1-12, is labelled 'Spring'.
    """
    season_months = (
        ((12, 1, 2), 'Summer'),
        ((3, 4, 5), 'Autumn'),
        ((6, 7, 8), 'Winter'),
    )
    for months, label in season_months:
        if month in months:
            return label
    # Fallback branch: September-November and any unrecognised value.
    return 'Spring'

# Label each row with its season via get_season, then replace the label with
# its integer code.
df['Season'] = (
    df['Month']
    .map(get_season)
    .map({'Summer': 0, 'Autumn': 1, 'Winter': 2, 'Spring': 3})
)


In [34]:
# Temperature features: daily spread, midpoint of the daily extremes, and the
# change between the 9am and 3pm readings.
df = df.assign(
    TempRange=df['MaxTemp'].sub(df['MinTemp']),
    MeanTemp=df['MaxTemp'].add(df['MinTemp']).div(2),
    TempDiff_9am_3pm=df['Temp3pm'].sub(df['Temp9am']),
)

In [35]:
# Change in humidity and pressure between the 9am and 3pm readings.
df = df.assign(
    HumidityDiff=df['Humidity3pm'].sub(df['Humidity9am']),
    PressureDiff=df['Pressure3pm'].sub(df['Pressure9am']),
)

In [39]:
# Order rows chronologically within each Location so that the rolling windows
# and lags below operate on consecutive observations of the same location.
df = df.sort_values(['Location', 'Date'])

#ROLLING AVERAGES
# Trailing mean over the current and the two previous rows of the same
# Location. min_periods=1 keeps the first rows of every location defined.
# The window counts rows, not calendar days, so gaps in Date are not accounted for.
for source_col, feature_col in [
    ('Rainfall', 'RollingRainfall_3d'),
    ('MaxTemp', 'RollingMaxTemp_3d'),
    ('Humidity3pm', 'RollingHumidity3pm_3d'),
]:
    df[feature_col] = (
        df.groupby('Location')[source_col]
        .rolling(window=3, min_periods=1)
        .mean()
        # Drop the Location level added by groupby so the result aligns with df's index.
        .reset_index(level=0, drop=True)
    )

#LAG FEATURES
# Previous observation of the same Location. The first row of every location
# has no predecessor; it is filled with the row's own value. A frame-wide
# backward fill is deliberately avoided: it would pull values across Location
# boundaries and silently overwrite unrelated NaNs in every other column.
for source_col, feature_col in [
    ('Rainfall', 'Lag1_Rainfall'),
    ('Humidity3pm', 'Lag1_Humidity3pm'),
]:
    lagged = df.groupby('Location')[source_col].shift(1)
    df[feature_col] = lagged.fillna(df[source_col])


# SPLITTING INTO TRAINING AND TEST SETS

In [40]:
# Feature matrix: every column except the target. Target vector: RainTomorrow.
# NOTE(review): only RainTomorrow is removed, so X still carries the datetime
# 'Date' column — confirm the downstream model can consume it.
X = df.drop(columns=['RainTomorrow'])
y = df.loc[:, 'RainTomorrow']


In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the RainTomorrow class
# proportions the same in both parts; the fixed random_state makes the split
# repeatable.
# NOTE(review): rows are shuffled at random although X contains lag/rolling
# features built from neighbouring rows — confirm a chronological split is not needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)


In [42]:
# Show the (rows, columns) shape of the training and the test features, one per line.
print(X_train.shape, X_test.shape, sep="\n")


(113754, 34)
(28439, 34)
