# Assignment 3

In [2]:
import pandas as pd # Data manipulation and analysis.
import numpy as np # Numerical operations and array handling.

# Lift pandas' display truncation so every column and every row is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read the weather dataset from its CSV file into a DataFrame.
weather_csv_path = 'data/clean_weather.csv'
df = pd.read_csv(weather_csv_path)

# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_time              8784 non-null   object 
 1   temperature            8784 non-null   float64
 2   dew_point_temperature  8784 non-null   float64
 3   humidity               8784 non-null   int64  
 4   wind_speed             8784 non-null   int64  
 5   visibility             8784 non-null   float64
 6   pressure               8784 non-null   float64
 7   is_rain                8784 non-null   int64  
 8   dewpoint_spread        8784 non-null   float64
 9   year                   8784 non-null   int64  
 10  month                  8784 non-null   int64  
 11  season                 8784 non-null   int64  
 12  week                   8784 non-null   int64  
 13  day                    8784 non-null   int64  
 14  hour                   8784 non-null   int64  
dtypes: float64(5), int64(9), object(1)

In [3]:
# Preview the top five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,date_time,temperature,dew_point_temperature,humidity,wind_speed,visibility,pressure,is_rain,dewpoint_spread,year,month,season,week,day,hour
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,0,2.1,2012,1,4,52,1,0
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,0,1.9,2012,1,4,52,1,1
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,0,1.6,2012,1,4,52,1,2
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,0,1.7,2012,1,4,52,1,3
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,0,1.8,2012,1,4,52,1,4


## 1. Feature Engineering

### 1.1 Numeric Feature Engineering

In [4]:
# Spread between air temperature and dew point, kept to one decimal place.
df["dewpoint_spread"] = df["temperature"].sub(df["dew_point_temperature"]).round(1)

# Humidity divided by temperature shifted by 273.15 (i.e. Kelvin, assuming the
# column is in degrees Celsius) so the denominator cannot reach zero.
kelvin_temperature = df["temperature"] + 273.15
df["humidity_temp_ratio"] = df["humidity"] / kelvin_temperature

# Squared wind speed, stored as float.
df["wind_speed_sq"] = df["wind_speed"].pow(2).astype(float)

# Reciprocal of visibility; values below 0.1 are raised to 0.1 first so the
# division stays finite.
df["visibility_inv"] = 1.0 / df["visibility"].clip(lower=0.1)

# Hour of day mapped onto a 24-step circle and stored as a sine/cosine pair.
hour_angle = 2 * np.pi * df["hour"] / 24
df["hour_sin"] = np.sin(hour_angle)
df["hour_cos"] = np.cos(hour_angle)

# Month mapped onto a 12-step circle and stored as a sine/cosine pair.
month_angle = 2 * np.pi * df["month"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

# Night flag: 1 when the hour is before 6 or is 18 or later, otherwise 0.
df["is_night"] = (df["hour"].lt(6) | df["hour"].ge(18)).astype(int)

# Show the first rows of the engineered columns only (matched by name pattern).
engineered_pattern = "dewpoint_spread|humidity_temp_ratio|wind_speed_sq|visibility_inv|hour_|month_|is_night"
df.filter(regex=engineered_pattern).head()


Unnamed: 0,dewpoint_spread,humidity_temp_ratio,wind_speed_sq,visibility_inv,hour_sin,hour_cos,month_sin,month_cos,is_night
0,2.1,0.316934,16.0,0.125,0.0,1.0,0.5,0.866025,1
1,1.9,0.320619,16.0,0.125,0.258819,0.965926,0.5,0.866025,1
2,1.6,0.32799,49.0,0.25,0.5,0.866025,0.5,0.866025,1
3,1.7,0.323946,36.0,0.25,0.707107,0.707107,0.5,0.866025,1
4,1.8,0.323946,49.0,0.208333,0.866025,0.5,0.5,0.866025,1


* dewpoint_spread: A smaller spread usually means higher chance of rain.
* humidity_temp_ratio: Humidity divided by temperature in Kelvin. Higher humidity relative to temperature often indicates moist air, and therefore possible rain.
* wind_speed_sq: Wind-related effects such as energy or turbulence can depend on the square of wind speed rather than on the speed itself.
* visibility_inv: Lower visibility (fog, rain) means higher inverse value.
* hour_sin/hour_cos: Transform hour (0–23) into sine/cosine form to represent daily periodicity.
* month_sin/month_cos: Same idea as for hour: months repeat with a period of 12, so the sine/cosine pair encodes seasonality smoothly (December sits next to January).
* is_night: Simple helper flag: 1 if the hour is before 06:00 or at/after 18:00, else 0.

### 1.2 One-Hot-Encoding