# Temperature Prediction Using Machine Learning

In [None]:
# =========================
# CELL 1: IMPORT LIBRARIES
# =========================
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [None]:
# =========================
# CELL 2: LOAD DATASET
# =========================
# Read the CSV at the hard-coded path into the working DataFrame.
csv_path = "/content/weatherHistory.csv"
df = pd.read_csv(csv_path)

# Sanity check: dimensions, column names, and the first three rows.
print("Rows,Cols:", df.shape)
print(list(df.columns))
print(df.head(3))


Rows,Cols: (96453, 12)
['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover', 'Pressure (millibars)', 'Daily Summary']
                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   251.0          15.8263         0.0               1015.13   
1                   259.0     

In [None]:
# Count missing values per column (isna performs the same check as isnull).
print(df.isna().sum())

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64


In [None]:
# =========================
# CELL 3: CLEAN & PREPROCESS
# =========================
# Turns the loaded frame `df` into a numeric feature matrix X and target y.
# NOTE: this cell rebinds `df` and drops 'Formatted Date', so it must be run on
# a freshly loaded frame; re-running it on its own output raises KeyError.

# Fill missing precipitation type with the most frequent value. The result is
# assigned back rather than using fillna(inplace=True) on df['Precip Type']:
# that chained in-place call operates on a temporary object under pandas
# Copy-on-Write and leaves df untouched, and the FutureWarning announcing it
# is hidden by the notebook-wide warnings filter.
df['Precip Type'] = df['Precip Type'].fillna(df['Precip Type'].mode()[0])

# Parse datetime. Only the first 19 characters are kept, so anything after
# them (fractional seconds, UTC offset) is discarded; unparseable values
# become NaT because of errors='coerce'.
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'].str.slice(0, 19), errors='coerce')

# Fill missing numeric values with the column median
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing object columns with 'unknown'
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    df[col] = df[col].fillna('unknown')

# Feature engineering from datetime
df['hour']    = df['Formatted Date'].dt.hour
df['day']     = df['Formatted Date'].dt.day
df['month']   = df['Formatted Date'].dt.month
df['year']    = df['Formatted Date'].dt.year
df['weekday'] = df['Formatted Date'].dt.weekday

# Drop text columns and the raw timestamp; errors='ignore' skips any name
# that is not present, matching the previous per-column membership check.
drop_cols = ['Daily Summary', 'Summary', 'Formatted Date']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical; `le` is kept so the integer codes can be mapped back
# to labels through le.classes_.
if 'Precip Type' in df.columns:
    le = LabelEncoder()
    df['Precip Type'] = le.fit_transform(df['Precip Type'].astype(str))

# Encode any remaining object columns (each with its own throwaway encoder)
for col in df.select_dtypes(include=['object']).columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Separate X, y
# NOTE(review): 'Apparent Temperature (C)' stays in X. It is presumably
# derived from the target, which would make the prediction task near-trivial
# (target leakage) -- confirm whether it should be excluded.
TARGET = 'Temperature (C)'
X = df.drop(columns=[TARGET]).values.astype(float)
y = df[TARGET].values




In [None]:
# =========================
# CELL 4: TRAIN-TEST SPLIT & SCALING
# =========================
# Hold out 25% of the rows as a shuffled test set; the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# =========================
# CELL 5: UTILITY FUNCTION TO MEASURE PREDICTION TIME
# =========================
def time_predict(model, X, runs=5):
    """Return the average wall-clock seconds of one ``model.predict(X)`` call.

    One untimed warm-up call is made on at most the first five rows of ``X``;
    then ``predict`` is called ``runs`` times on the full ``X`` and the total
    elapsed time is divided by ``runs``.

    Args:
        model: Object exposing ``predict(X)``.
        X: Sliceable input handed to ``model.predict``.
        runs: Number of timed repetitions (default 5, the previous fixed value).

    Returns:
        float: Mean seconds per full ``predict`` call.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    model.predict(X[:5])  # warm-up; slicing already clamps to len(X)
    start = time.perf_counter()
    for _ in range(runs):
        model.predict(X)
    return (time.perf_counter() - start) / runs


In [None]:
# =========================
# CELL 6: BASELINE MODEL - KNN (BRUTE FORCE)
# =========================
print("\n--- Baseline: KNN (brute-force) ---")

# Reference model: 5-neighbour regression with brute-force search on one job.
knn_brute = KNeighborsRegressor(algorithm='brute', n_neighbors=5, n_jobs=1)

# Time the fit on the scaled training data.
fit_start = time.perf_counter()
knn_brute.fit(X_train_scaled, y_train)
train_time_knn_brute = time.perf_counter() - fit_start

# Average prediction latency first, then the predictions that get scored.
predict_time_knn_brute = time_predict(knn_brute, X_test_scaled)
pred_brute = knn_brute.predict(X_test_scaled)

# Test-set error metrics.
mae_brute = mean_absolute_error(y_test, pred_brute)
rmse_brute = np.sqrt(mean_squared_error(y_test, pred_brute))

print(
    f"Train time: {train_time_knn_brute:.4f}s",
    f"Avg predict time: {predict_time_knn_brute:.4f}s",
    f"MAE: {mae_brute:.4f}, RMSE: {rmse_brute:.4f}",
    sep="\n",
)


--- Baseline: KNN (brute-force) ---
Train time: 0.0022s
Avg predict time: 9.7187s
MAE: 1.0402, RMSE: 1.3924


In [None]:
# =========================
# CELL 8: PCA + KD-TREE (DIMENSION REDUCTION)
# =========================
print("\n--- Optimization: PCA + KD-Tree ---")

# Project the scaled features onto the principal components selected by
# n_components=0.95; the projection is fitted on the training rows only.
pca = PCA(svd_solver='full', n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print("Original dims:", X_train_scaled.shape[1], " -> PCA dims:", X_train_pca.shape[1])

# NOTE(review): this model runs with n_jobs=4 while the brute-force baseline
# cell uses n_jobs=1, so the two timings are not like-for-like -- confirm intent.
knn_pca_kd = KNeighborsRegressor(
    n_neighbors=5,
    algorithm='kd_tree',
    leaf_size=20,
    n_jobs=4,
)

# Time the fit (this is where the KD-tree gets built).
fit_start = time.perf_counter()
knn_pca_kd.fit(X_train_pca, y_train)
train_time_knn_pca = time.perf_counter() - fit_start

# Average prediction latency first, then the predictions that get scored.
predict_time_knn_pca = time_predict(knn_pca_kd, X_test_pca)
pred_pca = knn_pca_kd.predict(X_test_pca)

# Test-set error metrics.
mae_pca = mean_absolute_error(y_test, pred_pca)
rmse_pca = np.sqrt(mean_squared_error(y_test, pred_pca))

print(
    f"Train time: {train_time_knn_pca:.4f}s",
    f"Avg predict time: {predict_time_knn_pca:.4f}s",
    f"MAE: {mae_pca:.4f}, RMSE: {rmse_pca:.4f}",
    sep="\n",
)


--- Optimization: PCA + KD-Tree ---
Original dims: 13  -> PCA dims: 11
Train time: 0.2848s
Avg predict time: 5.2948s
MAE: 1.3600, RMSE: 1.8591


In [None]:
# =========================
# CELL 9: RANDOM FOREST (CONTROL MODEL)
# =========================
print("\n--- RandomForest (100 estimators) ---")

# Control model: 100 trees capped at depth 12, seeded, single job.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=12,
    random_state=42,
    n_jobs=1,
)

# Time the fit on the scaled training data.
fit_start = time.perf_counter()
rf.fit(X_train_scaled, y_train)
train_time_rf = time.perf_counter() - fit_start

# Average prediction latency first, then the predictions that get scored.
predict_time_rf = time_predict(rf, X_test_scaled)
pred_rf = rf.predict(X_test_scaled)

# Test-set error metrics.
mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))

print(
    f"Train time: {train_time_rf:.4f}s",
    f"Avg predict time: {predict_time_rf:.4f}s",
    f"MAE: {mae_rf:.4f}, RMSE: {rmse_rf:.4f}",
    sep="\n",
)


--- RandomForest (100 estimators) ---
Train time: 46.0160s
Avg predict time: 0.4709s
MAE: 0.0233, RMSE: 0.0589


In [None]:
# =========================
# CELL 10: RANDOM FOREST FAST VARIANT (REDUCED ESTIMATORS)
# =========================
print("\n--- RandomForest (20 estimators) ---")

# Lighter forest: 20 trees, and the depth cap is lowered to 10 as well.
rf_fast = RandomForestRegressor(
    n_estimators=20,
    max_depth=10,
    random_state=42,
    n_jobs=1,
)

# Time the fit on the scaled training data.
fit_start = time.perf_counter()
rf_fast.fit(X_train_scaled, y_train)
train_time_rf_fast = time.perf_counter() - fit_start

# Average prediction latency first, then the predictions that get scored.
predict_time_rf_fast = time_predict(rf_fast, X_test_scaled)
pred_rf_fast = rf_fast.predict(X_test_scaled)

# Test-set error metrics.
mae_rf_fast = mean_absolute_error(y_test, pred_rf_fast)
rmse_rf_fast = np.sqrt(mean_squared_error(y_test, pred_rf_fast))

print(
    f"Train time: {train_time_rf_fast:.4f}s",
    f"Avg predict time: {predict_time_rf_fast:.4f}s",
    f"MAE: {mae_rf_fast:.4f}, RMSE: {rmse_rf_fast:.4f}",
    sep="\n",
)


--- RandomForest (20 estimators) ---
Train time: 7.1802s
Avg predict time: 0.0617s
MAE: 0.0460, RMSE: 0.0970
