In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import glob, os
from pprint import pprint

# change names here if your files use different filenames
search_names = ['weather_classification_data.csv', 'pakwheels_used_cars.csv']

found = {}
for name in search_names:
    matches = glob.glob(f'/content/drive/**/{name}', recursive=True)
    found[name] = matches

pprint(found)
# set variables if found
weather_paths = found.get('weather_classification_data.csv', [])
reg_paths = found.get('pakwheels_used_cars.csv', [])

if weather_paths:
    weather_path = weather_paths[0]
    print("Weather path:", weather_path)
if reg_paths:
    reg_path = reg_paths[0]
    print("Regression path:", reg_path)

# load if found
import pandas as pd
if weather_paths:
    df_weather = pd.read_csv(weather_path)
    print("Loaded weather:", df_weather.shape)
if reg_paths:
    df_reg = pd.read_csv(reg_path)
    print("Loaded pakwheels:", df_reg.shape)


{'pakwheels_used_cars.csv': ['/content/drive/MyDrive/pakwheels_used_cars.csv'],
 'weather_classification_data.csv': ['/content/drive/MyDrive/weather_classification_data.csv']}
Weather path: /content/drive/MyDrive/weather_classification_data.csv
Regression path: /content/drive/MyDrive/pakwheels_used_cars.csv
Loaded weather: (13200, 11)
Loaded pakwheels: (77237, 14)


In [4]:
# Example path
weather_path = '/content/drive/MyDrive/dataset week 4 buildables/weather_classification_data.csv'
reg_path = '/content/drive/MyDrive/dataset week 4 buildables/pakwheels_used_cars.csv'

# Weather Classification — Week 4
**Objective:** Clean, analyze, visualize, and build classification models (Logistic Regression, Decision Tree, Random Forest).


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

RANDOM_STATE = 42

# paths
BASE = Path(r"Buildable-ML-DL-Fellowship\week_4")
CLASS_DIR = BASE / "classification"
FIG_DIR = CLASS_DIR / "document_figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
weather_path = CLASS_DIR / "weather_classification_data.csv"
df = pd.read_csv('/content/drive/MyDrive/dataset/assignment_dataset/weather_classification_data.csv')
# quick checks
display(df.head())
display(df.info())
display(df.describe(include='all').T)
print("Missing values:\n", df.isnull().sum())


Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


None

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Temperature,13200.0,,,,19.127576,17.386327,-25.0,4.0,21.0,31.0,109.0
Humidity,13200.0,,,,68.710833,20.194248,20.0,57.0,70.0,84.0,109.0
Wind Speed,13200.0,,,,9.832197,6.908704,0.0,5.0,9.0,13.5,48.5
Precipitation (%),13200.0,,,,53.644394,31.946541,0.0,19.0,58.0,82.0,109.0
Cloud Cover,13200.0,4.0,overcast,6090.0,,,,,,,
Atmospheric Pressure,13200.0,,,,1005.827896,37.199589,800.12,994.8,1007.65,1016.7725,1199.21
UV Index,13200.0,,,,4.005758,3.8566,0.0,1.0,3.0,7.0,14.0
Season,13200.0,4.0,Winter,5610.0,,,,,,,
Visibility (km),13200.0,,,,5.462917,3.371499,0.0,3.0,5.0,7.5,20.0
Location,13200.0,3.0,inland,4816.0,,,,,,,


Missing values:
 Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64


In [9]:
df.head()
df.info()
df.describe(include='all').T

df.isnull().sum().sort_values(ascending=False)
df.duplicated().sum()

for c in df.select_dtypes(include='object').columns:
    print(c, df[c].nunique())
    print(df[c].value_counts(dropna=False).head(10))
    print('---')

for c in ['Temperature','Humidity','Wind Speed','Precipitation (%)','Atmospheric Pressure','UV Index','Visibility (km)']:
    if c in df.columns:
        print(c, df[c].min(), df[c].max(), df[c].median(), df[c].isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB
Cloud Cover 4
Cloud Cover
overcast         6090
partly cloudy    4560
clear            2139
cloudy            411
Name: count, dtype: int64
---
Season 4
Season
Winter

In [None]:
# #Initial cleaning notes & plan
# Dataset overview

# Rows: 13,200 — Columns: 11.
# Target column: Weather Type with 4 classes (Rainy, Sunny, Cloudy, Snowy) — classes appear balanced (~3,300 each).
# Observed anomalies / issues

# Missing values

# From df.info() all columns show 13,200 non-null values, so there are no missing values detected in the dataset. (If any missing values appear later after coercion from strings to numeric, we will handle those.)
# Data types / formatting

# Categorical columns (object type): Cloud Cover, Season, Location, Weather Type.
# Numeric columns (float/int): Temperature, Humidity, Wind Speed, Precipitation (%), Atmospheric Pressure, UV Index, Visibility (km).
# Plan: ensure numeric columns are numeric (coerce if needed) and categorical columns remain as categories/strings ready for encoding.
# Duplicate rows

# Will check for exact duplicates and remove if any are found.
# Outliers / domain checks (observed ranges)

# Temperature: roughly -25 → 39 °C (plausible).
# Humidity: values appear within roughly 20 → 100% (100 is boundary; cap at 100% if any >100).
# Wind Speed: values up to ~50 km/h (some high values — treat with IQR capping).
# Precipitation (%): up to ~58% (acceptable but check for >100 or negative values).
# Atmospheric Pressure: broad range (~808 → 1199 hPa) — check for extreme values and use IQR clipping if needed.
# Visibility (km): values between ~0.2 → 8.0.
# UV Index: range 0 → ~14 (verify units).
# Plan: apply IQR-based capping for numeric columns (or clip to sensible domain bounds) to reduce the impact of extreme synthetic/noise values.
# Categorical issues

# Cloud Cover, Season, Location have a few categories (low cardinality). We will standardize values (strip whitespace / lower-case) and One-Hot Encode these features.
# Weather Type is the target: will Label-encode (and keep mapping recorded).
# Class balance

# Target classes appear balanced (≈ equal counts). Use stratified train/test split.
# Planned immediate steps

# Run duplicate check and remove exact duplicates (if any).
# Ensure numeric columns are numeric (pd.to_numeric(..., errors='coerce')); if coercion introduces NaNs, impute with median.
# Clip obvious domain errors (humidity >100 → set to 100; precipitation <0 or >100 → clip).
# Use IQR-based capping for wind speed, atmospheric pressure and other numeric columns as needed.
# Standardize categorical strings and One-Hot Encode them; Label-encode the target.
# Use train_test_split(..., stratify=y, test_size=0.2, random_state=42).
# For model pipelines: impute (median) → scale numeric → encode categorical → train models (LogReg, DecisionTree, RandomForest). For tree-based models you can skip scaling.
# Notes

# Because there are no missing values reported now, imputation may not be necessary — but coercing strings to numeric may introduce NaNs, so we will handle them if they appear.
# I will save cleaned CSV and figures to classification/document_figures/ and document exact changes.


In [None]:
display(df.shape)
display(df.info())    # shows non-null counts and dtypes
print("Missing values:\n", df.isnull().sum())
print("Duplicate rows:", df.duplicated().sum())
print("Weather Type distribution:\n", df['Weather Type'].value_counts())


(13200, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


None

Missing values:
 Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64
Duplicate rows: 0
Weather Type distribution:
 Weather Type
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64


In [None]:
num_cols = ['Temperature','Humidity','Wind Speed','Precipitation (%)','Atmospheric Pressure','UV Index','Visibility (km)']
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
for c in num_cols:
    if c in df.columns:
        print(c, "min:", df[c].min(), "max:", df[c].max(), "median:", df[c].median(), "NaN:", df[c].isna().sum())


Temperature min: -25.0 max: 109.0 median: 21.0 NaN: 0
Humidity min: 20 max: 109 median: 70.0 NaN: 0
Wind Speed min: 0.0 max: 48.5 median: 9.0 NaN: 0
Precipitation (%) min: 0.0 max: 109.0 median: 58.0 NaN: 0
Atmospheric Pressure min: 800.12 max: 1199.21 median: 1007.65 NaN: 0
UV Index min: 0 max: 14 median: 3.0 NaN: 0
Visibility (km) min: 0.0 max: 20.0 median: 5.0 NaN: 0


In [11]:
dup_count = df.duplicated().sum()
if dup_count > 0:
    print(f"Removing {dup_count} duplicate rows.")
    df = df.drop_duplicates().reset_index(drop=True)
else:
    print("No exact duplicates found.")


No exact duplicates found.


In [None]:
def cap_iqr(series, factor=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return series.clip(lower, upper)

if 'Humidity' in df.columns:
    df['Humidity'] = df['Humidity'].clip(lower=0, upper=100)   # humidity % cannot exceed 100 or be negative
if 'Precipitation (%)' in df.columns:
    df['Precipitation (%)'] = df['Precipitation (%)'].clip(lower=0, upper=100)

for c in ['Temperature','Wind Speed','Atmospheric Pressure','Visibility (km)','UV Index']:
    if c in df.columns:
        df[c] = cap_iqr(df[c])

for c in num_cols:
    if c in df.columns:
        print(c, "min:", df[c].min(), "max:", df[c].max(), "median:", df[c].median())


Temperature min: -25.0 max: 71.5 median: 21.0
Humidity min: 20 max: 100 median: 70.0
Wind Speed min: 0.0 max: 26.25 median: 9.0
Precipitation (%) min: 0.0 max: 100.0 median: 58.0
Atmospheric Pressure min: 961.8412499999998 max: 1049.7312500000003 median: 1007.65
UV Index min: 0 max: 14 median: 3.0
Visibility (km) min: 0.0 max: 14.25 median: 5.0


In [None]:
cat_cols = ['Cloud Cover','Season','Location']
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip().str.title()  # trim + unify capitalization

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Weather_Label'] = le.fit_transform(df['Weather Type'])
mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", mapping)

for c in cat_cols:
    if c in df.columns:
        print(c, "unique:", df[c].nunique(), df[c].value_counts().to_dict())


Label mapping: {'Cloudy': np.int64(0), 'Rainy': np.int64(1), 'Snowy': np.int64(2), 'Sunny': np.int64(3)}
Cloud Cover unique: 4 {'Overcast': 6090, 'Partly Cloudy': 4560, 'Clear': 2139, 'Cloudy': 411}
Season unique: 4 {'Winter': 5610, 'Spring': 2598, 'Autumn': 2500, 'Summer': 2492}
Location unique: 3 {'Inland': 4816, 'Mountain': 4813, 'Coastal': 3571}


In [None]:
clean_path = CLASS_DIR / "weather_classification_cleaned.csv"
df.to_csv(clean_path, index=False)
print("Saved cleaned data to:", clean_path)

X = df.drop(columns=['Weather Type','Weather_Label'])
y = df['Weather_Label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train label dist:\n", y_train.value_counts(normalize=True))
print("Test label dist:\n", y_test.value_counts(normalize=True))


Saved cleaned data to: Buildable-ML-DL-Fellowship\week_4/classification/weather_classification_cleaned.csv
Train shape: (10560, 10) Test shape: (2640, 10)
Train label dist:
 Weather_Label
2    0.25
3    0.25
1    0.25
0    0.25
Name: proportion, dtype: float64
Test label dist:
 Weather_Label
1    0.25
0    0.25
3    0.25
2    0.25
Name: proportion, dtype: float64


## Exploratory Data Analysis — Visualizations
We'll create histograms, boxplots, scatter plots, target distribution, and a correlation heatmap for numeric features. Save all figures to classification/figures/.


In [15]:
# Histograms for numeric features
numeric_cols = ['Temperature','Humidity','Wind Speed','Precipitation (%)','Atmospheric Pressure','UV Index','Visibility (km)']
for col in numeric_cols:
    if col in df.columns:
        plt.figure(figsize=(6,4))
        plt.hist(df[col].dropna(), bins=30)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col); plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"hist_{col.replace(' ','_').replace('%','pct')}.png")
        plt.close()

# Target class distribution
plt.figure(figsize=(6,4))
df['Weather Type'].value_counts().plot(kind='bar')
plt.title('Weather Type Distribution')
plt.xlabel('Weather Type'); plt.ylabel('Count')
plt.tight_layout()
plt.savefig(FIG_DIR / "target_distribution.png")
plt.close()
print("Saved histograms and target distribution to", FIG_DIR)


Saved histograms and target distribution to Buildable-ML-DL-Fellowship\week_4/classification/document_figures


In [None]:
for col in numeric_cols:
    if col in df.columns:
        plt.figure(figsize=(6,3))
        plt.boxplot(df[col].dropna(), vert=False)
        plt.title(f'Boxplot of {col}')
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"box_{col.replace(' ','_')}.png")
        plt.close()

pairs = [('Temperature','Humidity'), ('Visibility (km)','Precipitation (%)')]
for a,b in pairs:
    if a in df.columns and b in df.columns:
        plt.figure(figsize=(6,4))
        plt.scatter(df[a], df[b], alpha=0.4, s=10)
        plt.xlabel(a); plt.ylabel(b); plt.title(f'{a} vs {b}')
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"scatter_{a.replace(' ','_')}_vs_{b.replace(' ','_')}.png")
        plt.close()

print("Saved the boxplots and scatterplots.")


Saved boxplots and scatterplots.


In [None]:
num_present = [c for c in numeric_cols if c in df.columns]
if len(num_present) > 1:
    plt.figure(figsize=(8,6))
    corr = df[num_present].corr()
    import seaborn as sns
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title('Correlation heatmap (numeric features)')
    plt.tight_layout()
    plt.savefig(FIG_DIR / "correlation_heatmap.png")
    plt.close()
    print("Saved correlation heatmap.")
else:
    print("Not enough numeric cols for correlation heatmap.")


Saved correlation heatmap.


## Prepare preprocessing pipeline
We will create a ColumnTransformer pipeline: numeric → median imputer + StandardScaler; categorical → most_frequent imputer + OneHotEncoder(handle_unknown='ignore').
Then we'll run train/test split (stratified) and produce processed train/test arrays for model training.


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Weather Type','Weather_Label'])
y = df['Weather_Label']

#..........
numeric_features = [c for c in X.select_dtypes(include=['number']).columns]
categorical_features = [c for c in X.select_dtypes(include=['object','category']).columns]

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # 👈 fixed here
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, numeric_features),
    ('cat', cat_pipe, categorical_features)
])

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print("the Processed shapes are :", X_train.shape, X_test.shape)


Processed shapes: (10560, 18) (2640, 18)


## Train and evaluate three classification models:
1. Logistic Regression (baseline; linear model)
2. Decision Tree (non-linear, interpretable)
3. Random Forest (ensemble, robust)
We'll compute accuracy, precision, recall, f1 (macro), and show confusion matrices.


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd

def evaluate_model(clf, Xtr, Xte, ytr, yte, name):
    clf.fit(Xtr, ytr)
    ypred = clf.predict(Xte)
    res = {
        'model': name,
        'accuracy': accuracy_score(yte, ypred),
        'precision_macro': precision_score(yte, ypred, average='macro', zero_division=0),
        'recall_macro': recall_score(yte, ypred, average='macro', zero_division=0),
        'f1_macro': f1_score(yte, ypred, average='macro', zero_division=0),
        'confusion_matrix': confusion_matrix(yte, ypred)
    }
    print(f"--- {name} ---")
    print(classification_report(yte, ypred, target_names=le.classes_))
    return res

results = []
lr = LogisticRegression(max_iter=1000, random_state=42)
results.append(evaluate_model(lr, X_train, X_test, y_train, y_test, "Logistic Regression"))

dt = DecisionTreeClassifier(random_state=42)
results.append(evaluate_model(dt, X_train, X_test, y_train, y_test, "Decision Tree"))

rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
results.append(evaluate_model(rf, X_train, X_test, y_train, y_test, "Random Forest"))

summary_df = pd.DataFrame([{k:v for k,v in r.items() if k!='confusion_matrix'} for r in results]).set_index('model')
display(summary_df)
summary_df.to_csv(CLASS_DIR / "classification_model_summary.csv")
print("Saved model summary to", CLASS_DIR / "classification_model_summary.csv")


--- Logistic Regression ---
              precision    recall  f1-score   support

      Cloudy       0.82      0.85      0.83       660
       Rainy       0.86      0.86      0.86       660
       Snowy       0.89      0.92      0.90       660
       Sunny       0.93      0.86      0.89       660

    accuracy                           0.87      2640
   macro avg       0.87      0.87      0.87      2640
weighted avg       0.87      0.87      0.87      2640

--- Decision Tree ---
              precision    recall  f1-score   support

      Cloudy       0.87      0.90      0.88       660
       Rainy       0.89      0.90      0.89       660
       Snowy       0.94      0.91      0.93       660
       Sunny       0.93      0.91      0.92       660

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640

--- Random Forest ---
              precision    recall  f1-score   support

   

Unnamed: 0_level_0,accuracy,precision_macro,recall_macro,f1_macro
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.87197,0.873465,0.87197,0.872241
Decision Tree,0.906061,0.906974,0.906061,0.906316
Random Forest,0.911364,0.912307,0.911364,0.911566


Saved model summary to Buildable-ML-DL-Fellowship\week_4/classification/classification_model_summary.csv


In [20]:
import seaborn as sns
for r in results:
    cm = r['confusion_matrix']
    model_name = r['model'].replace(' ','_')
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_, cmap='Blues')
    plt.xlabel('Predicted'); plt.ylabel('True'); plt.title(f'Confusion matrix - {r["model"]}')
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"cm_{model_name}.png")
    plt.close()

plt.figure(figsize=(7,4))
summary_df[['accuracy','f1_macro']].plot(kind='bar')
plt.title('Model comparison (accuracy & f1_macro)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(FIG_DIR / "model_comparison_scores.png")
plt.close()
print("Saved confusion matrices and comparison plot to", FIG_DIR)


Saved confusion matrices and comparison plot to Buildable-ML-DL-Fellowship\week_4/classification/document_figures


<Figure size 700x400 with 0 Axes>

In [21]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators':[100,200], 'max_depth':[None,10,20]}
gs = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
gs.fit(X_train, y_train)
print("Best params:", gs.best_params_, "Best score:", gs.best_score_)
best_rf = gs.best_estimator_
# Evaluate best
res_best = evaluate_model(best_rf, X_train, X_test, y_train, y_test, "RandomForest_Tuned")


Best params: {'max_depth': 10, 'n_estimators': 200} Best score: 0.9162361816784271
--- RandomForest_Tuned ---
              precision    recall  f1-score   support

      Cloudy       0.87      0.91      0.89       660
       Rainy       0.88      0.92      0.90       660
       Snowy       0.96      0.91      0.93       660
       Sunny       0.94      0.91      0.92       660

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640



In [22]:
import joblib
final_clf = Pipeline([('preproc', preprocessor), ('clf', best_rf if 'best_rf' in globals() else rf)])
joblib.dump(final_clf, CLASS_DIR / "final_weather_pipeline.joblib")
print("Saved final pipeline to", CLASS_DIR / "final_weather_pipeline.joblib")


Saved final pipeline to Buildable-ML-DL-Fellowship\week_4/classification/final_weather_pipeline.joblib


In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

rf_model = None
if 'best_rf' in globals():
    rf_model = best_rf
elif 'rf' in globals():
    rf_model = rf
elif 'gs' in globals() and hasattr(gs, 'best_estimator_'):
    rf_model = gs.best_estimator_

if rf_model is None:
    print("No RandomForest model found in globals (rf/best_rf/gs). Skipping feature importance.")
else:
    try:
        feature_names = None
        if 'preprocessor' in globals():
            try:
                feature_names = preprocessor.get_feature_names_out()
            except Exception:
                num_features = (preprocessor.transformers_[0][2]
                                if len(preprocessor.transformers_)>0 else [])
                num_feats = [c for c in X_train_raw.select_dtypes(include=['number']).columns] if 'X_train_raw' in globals() else []
                cat_feats = [c for c in X_train_raw.select_dtypes(include=['object','category']).columns] if 'X_train_raw' in globals() else []
                feature_names = num_feats + ['ohe_'+c for c in cat_feats]
        else:
            feature_names = X_train_raw.columns.tolist() if 'X_train_raw' in globals() else None

        importances = rf_model.feature_importances_
        if feature_names is None or len(feature_names) != len(importances):
            feature_names = [f'feat_{i}' for i in range(len(importances))]
        fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
        fi_df = fi_df.sort_values('importance', ascending=False).head(30)

        plt.figure(figsize=(8,6))
        plt.barh(fi_df['feature'][::-1], fi_df['importance'][::-1])
        plt.xlabel('Importance'); plt.title('Top 30 Feature Importances (Random Forest)')
        plt.tight_layout()
        plt.savefig(FIG_DIR / "rf_feature_importances_top30.png")
        plt.close()
        display(fi_df)
        print("Saved feature importances to", FIG_DIR / "rf_feature_importances_top30.png")
    except Exception as e:
        print("Could not compute feature importances:", e)


Unnamed: 0,feature,importance
0,num__Temperature,0.198224
6,num__Visibility (km),0.143729
3,num__Precipitation (%),0.131273
5,num__UV Index,0.128883
4,num__Atmospheric Pressure,0.112516
7,cat__Cloud Cover_Clear,0.081013
1,num__Humidity,0.054725
14,cat__Season_Winter,0.046587
2,num__Wind Speed,0.028555
9,cat__Cloud Cover_Overcast,0.025502


Saved feature importances to Buildable-ML-DL-Fellowship\week_4/classification/document_figures/rf_feature_importances_top30.png


In [24]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import numpy as np

if 'rf_model' not in globals() and 'best_rf' not in globals() and 'rf' not in globals():
    print("No RF/logreg model available for probability predictions. Skipping ROC AUC plots.")
else:
    clf_for_proba = rf_model if 'rf_model' in globals() else (best_rf if 'best_rf' in globals() else rf)
    if not hasattr(clf_for_proba, "predict_proba"):
        print("Model has no predict_proba method. Skipping ROC AUC.")
    else:
        y_true = y_test
        n_classes = len(np.unique(y_true))
        y_test_bin = label_binarize(y_true, classes=np.unique(y_true))
        y_score = clf_for_proba.predict_proba(X_test)

        aucs = {}
        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
            aucs[i] = auc(fpr, tpr)
        print("Per-class AUCs:", aucs)
        # macro
        macro_auc = np.mean(list(aucs.values()))
        print("Macro AUC:", macro_auc)


Per-class AUCs: {0: np.float64(0.9916322314049587), 1: np.float64(0.9922252831343741), 2: np.float64(0.9952448729721457), 3: np.float64(0.9939455157636975)}
Macro AUC: 0.993261975818794


In [25]:
import joblib, os
clean_path = CLASS_DIR / "weather_classification_cleaned.csv"
if clean_path.exists():
    print("Cleaned CSV exists:", clean_path)
else:
    df.to_csv(clean_path, index=False)
    print("Saved cleaned CSV:", clean_path)


try:
    final_model = None
    if 'best_rf' in globals():
        final_model = best_rf
    elif 'rf' in globals():
        final_model = rf
    elif 'gs' in globals() and hasattr(gs, 'best_estimator_'):
        final_model = gs.best_estimator_

    if final_model is not None and 'preprocessor' in globals():
        final_pipe = Pipeline([('preproc', preprocessor), ('clf', final_model)])
        joblib.dump(final_pipe, CLASS_DIR / "final_weather_pipeline.joblib")
        print("Saved final pipeline to", CLASS_DIR / "final_weather_pipeline.joblib")
    else:
        print("No final pipeline saved (missing model or preprocessor).")
except Exception as e:
    print("Error saving pipeline:", e)


Cleaned CSV exists: Buildable-ML-DL-Fellowship\week_4/classification/weather_classification_cleaned.csv
Saved final pipeline to Buildable-ML-DL-Fellowship\week_4/classification/final_weather_pipeline.joblib
