# Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
import tensorflow_data_validation as tfdv

# Data <br>
Source: https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package <br>
Feature definitions: http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml

In [2]:
# Read the raw weatherAUS CSV directly from GitHub
csv_url = 'https://raw.githubusercontent.com/antbartash/australian_rain/main/data/weatherAUS.csv'
data = pd.read_csv(csv_url)

# (rows, columns) followed by a preview of the first records
print(data.shape)
data.head()

(145460, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
# Compute TFDV statistics for the raw frame and render them in the notebook
data_stats = tfdv.generate_statistics_from_dataframe(dataframe=data)
tfdv.visualize_statistics(lhs_statistics=data_stats)

# Data preprocessing

In [4]:
# Keep only the observations that have a target (RainTomorrow) value;
# the shape is printed before and after to show how many rows were removed
print(data.shape)
has_target = data['RainTomorrow'].notna()
data = data[has_target].reset_index(drop=True)

print(data.shape)

(145460, 23)
(142193, 23)


In [5]:
# Encode the Yes/No rain flags as 1/0.
# Anything that is not 'Yes' becomes 0, so a missing RainToday value is encoded
# as 0, exactly like 'No'. (Rows with a missing RainTomorrow were dropped earlier.)
# Matching on both 'Yes' and 1 keeps the cell idempotent: re-running it on already
# encoded columns leaves them unchanged instead of resetting every value to 0.
for flag_column in ['RainToday', 'RainTomorrow']:
    data[flag_column] = data[flag_column].isin(['Yes', 1]).astype(int)

In [6]:
# Add the Month column (calendar month of the observation) and drop Date.
# The guard makes the cell safe to re-run: once Date has been removed there is
# nothing left to do, whereas an unconditional drop would raise a KeyError.
if 'Date' in data.columns:
    # vectorized .dt accessor instead of a Python-level loop over every row;
    # cast to int64 so the dtype does not depend on the pandas version
    data['Month'] = pd.to_datetime(data['Date']).dt.month.astype('int64')
    # reassign instead of inplace=True so the frame is never half-modified
    data = data.drop(columns=['Date'])

data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0,0,12
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,,,17.2,24.3,0,0,12
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0,0,12
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,16.0,1017.6,1012.8,,,18.1,26.5,0,0,12
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0,12


# Final statistics

In [7]:
# Recompute and render the TFDV statistics for the preprocessed frame
final_stats = tfdv.generate_statistics_from_dataframe(dataframe=data)
tfdv.visualize_statistics(lhs_statistics=final_stats)

# Encoding categorical features for LightGBM

In [8]:
# List the column labels in their current order (positions matter for index-based column selection)
data.columns

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow', 'Month'],
      dtype='object')

In [9]:
# String-valued categorical columns to be ordinal-encoded.
categorical_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
# Positional ids are derived from the names, so they stay correct even if the
# column layout of `data` changes.
categorical_features_ids = [data.columns.get_loc(name) for name in categorical_features]

transformer = ColumnTransformer([
    ('encoder', OrdinalEncoder(), categorical_features)
], remainder='passthrough')

# ColumnTransformer emits the encoded columns first, followed by the remainder
# (passthrough) columns in their original order. Build the labels the same way
# instead of relying on a dtype filter, which would mislabel the output if any
# other non-numeric column were present.
passthrough_features = [name for name in data.columns if name not in categorical_features]

data_transformed = pd.DataFrame(
    transformer.fit_transform(data),
    columns=categorical_features + passthrough_features
)

# Encoding only re-codes values; it must not add or remove rows or columns.
assert data_transformed.shape == data.shape

print(data_transformed.shape)
data_transformed.head()

(142193, 23)


Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,2.0,13.0,13.0,14.0,13.4,22.9,0.6,,,44.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0,12.0
1,2.0,14.0,6.0,15.0,7.4,25.1,0.0,,,44.0,...,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0,12.0
2,2.0,15.0,13.0,15.0,12.9,25.7,0.0,,,46.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0,12.0
3,2.0,4.0,9.0,0.0,9.2,28.0,0.0,,,24.0,...,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0,12.0
4,2.0,13.0,1.0,7.0,17.5,32.3,1.0,,,41.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0,12.0


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the encoded frame
data_transformed.describe()

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
count,142193.0,132863.0,132180.0,138415.0,141556.0,141871.0,140787.0,81350.0,74377.0,132923.0,...,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0,142193.0,142193.0,142193.0
mean,23.740655,7.712072,7.303351,7.755417,12.1864,23.226784,2.349974,5.469824,7.624853,39.984292,...,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,0.221213,0.224181,6.402544
std,14.237503,4.663565,4.541175,4.603482,6.403283,7.117618,8.465173,4.188537,3.781525,13.588801,...,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594,0.415065,0.417043,3.426506
min,0.0,0.0,0.0,0.0,-8.5,-4.8,0.0,0.0,0.0,6.0,...,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0,0.0,1.0
25%,11.0,3.0,3.0,4.0,7.6,17.9,0.0,2.6,4.9,31.0,...,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,0.0,0.0,3.0
50%,24.0,8.0,7.0,8.0,12.0,22.6,0.0,4.8,8.5,39.0,...,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,0.0,0.0,6.0
75%,36.0,12.0,11.0,12.0,16.8,28.2,0.8,7.4,10.6,48.0,...,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,0.0,0.0,9.0
max,48.0,15.0,15.0,15.0,33.9,48.1,371.0,145.0,14.5,135.0,...,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,1.0,1.0,12.0


# Saving data

In [11]:
# Persist the encoded dataset as CSV in the working directory.
# index=True is the to_csv default, spelled out here: the row index is written
# as the first (unnamed) column of the file.
data_transformed.to_csv('data_transformed.csv', index=True)