In [20]:
# Libraries
import pandas as pd
import numpy as np
import os # Library for interacting with the operating system


# Visualizations
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## 1. Loading Data

In [21]:
# Read the weather measurements CSV into a DataFrame
raw_csv_path = './dataset/weather_aveiro_final.csv'
weather_data = pd.read_csv(raw_csv_path)

## 2. Data Preprocessing and Cleaning

In [22]:
# Show general information about the dataset (columns, non-null counts, dtypes, memory).
# DataFrame.info() prints its report and returns None, so the result is not assigned
# to a variable (it would only ever hold None).
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46608 entries, 0 to 46607
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Time                 46608 non-null  object 
 1   Avg_Temp             46602 non-null  float64
 2   Avg_Rel_Humidity     46602 non-null  float64
 3   Avg_Wind_Direction   46608 non-null  float64
 4   Avg_Wind_Speed       46608 non-null  float64
 5   Max_Inst_Wind_Speed  46608 non-null  float64
 6   Inst_Temp            46603 non-null  float64
 7   Quantity_Precip      46608 non-null  float64
 8   Max_Inst_Precip      46608 non-null  float64
 9   Total_Global_Rad     46603 non-null  float64
dtypes: float64(9), object(1)
memory usage: 3.6+ MB


In [23]:
# Keep only the first 8000 rows and renumber the index from zero
first_rows = weather_data.iloc[:8000]
weather_data = first_rows.reset_index(drop=True)

In [24]:
# Report the dataset dimensions, then preview the first rows
n_rows, n_cols = weather_data.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns.")
weather_data.head()

Dataset contains 8000 rows and 10 columns.


Unnamed: 0,Time,Avg_Temp,Avg_Rel_Humidity,Avg_Wind_Direction,Avg_Wind_Speed,Max_Inst_Wind_Speed,Inst_Temp,Quantity_Precip,Max_Inst_Precip,Total_Global_Rad
0,2022-05-05 12:00:00,22.7,62.0,305.0,3.8,5.4,24.4,0.0,0.0,526.0
1,2022-05-05 12:15:00,22.55,63.5,311.5,4.05,5.7,24.55,0.0,0.0,523.35
2,2022-05-05 12:30:00,22.1,64.0,311.0,4.1,6.0,24.8,0.0,0.0,520.0
3,2022-05-05 12:45:00,21.95,62.5,300.0,4.45,6.05,24.9,0.0,0.0,514.4
4,2022-05-05 13:00:00,22.2,62.0,314.0,4.0,5.2,24.9,0.0,0.0,507.0


In [25]:
# For each object/category column of the weather data, print how many distinct
# values it holds together with the first five of them
categorical_columns = weather_data.select_dtypes(include=["object", "category"]).columns
for col in categorical_columns:
    distinct_count = weather_data[col].nunique()
    preview_values = weather_data[col].unique()[:5]
    print(f"{col} has {distinct_count} unique values: {preview_values}", '\n')

Time has 8000 unique values: ['2022-05-05 12:00:00' '2022-05-05 12:15:00' '2022-05-05 12:30:00'
 '2022-05-05 12:45:00' '2022-05-05 13:00:00'] 



---
### 2.1. Handling Missing Data

In [26]:
# Count the missing values in each column
missing_per_column = weather_data.isnull().sum()
print(missing_per_column)

Time                   0
Avg_Temp               0
Avg_Rel_Humidity       0
Avg_Wind_Direction     0
Avg_Wind_Speed         0
Max_Inst_Wind_Speed    0
Inst_Temp              0
Quantity_Precip        0
Max_Inst_Precip        0
Total_Global_Rad       0
dtype: int64


### 2.2. Transformation

Transform the date columns to the datetime type.

In [27]:
# Columns that hold timestamps stored as text and must become datetime64
date_cols = ['Time']

for col in date_cols:
    # Remember how many values were already missing so that only the entries
    # invalidated by the conversion itself are reported below.
    missing_before = weather_data[col].isnull().sum()

    # errors='coerce' replaces unparseable entries with NaT instead of raising
    weather_data[col] = pd.to_datetime(weather_data[col], errors='coerce')

    # Surface silently coerced values: this step runs after the missing-data
    # check, so new NaT entries would otherwise go unnoticed.
    coerced_count = weather_data[col].isnull().sum() - missing_before
    if coerced_count > 0:
        print(f"Warning: {coerced_count} value(s) in '{col}' could not be parsed and were set to NaT.")

    # Show the converted column (printed per column, not just for the last one)
    print(weather_data[col])

0      2022-05-05 12:00:00
1      2022-05-05 12:15:00
2      2022-05-05 12:30:00
3      2022-05-05 12:45:00
4      2022-05-05 13:00:00
               ...        
7995   2022-07-27 18:45:00
7996   2022-07-27 19:00:00
7997   2022-07-27 19:15:00
7998   2022-07-27 19:30:00
7999   2022-07-27 19:45:00
Name: Time, Length: 8000, dtype: datetime64[ns]


In [28]:
# Save the cleaned dataset
directory_name = 'dataset'
# Keep the file name free of leading whitespace so the file can be loaded back
# as 'dataset/weather_cleaned.csv'.
output_filename = 'weather_cleaned.csv'

# Build the full path
output_path = os.path.join(directory_name, output_filename)

# Make sure the target directory exists before writing
os.makedirs(directory_name, exist_ok=True)

# Write the DataFrame to a CSV file without the positional index column
weather_data.to_csv(output_path, index=False)
