In [1]:
# Install the Weights & Biases client used for experiment logging below.
# NOTE(review): no version is pinned here; consider `%pip install wandb==<version>`
# so the package is installed into the running kernel's environment reproducibly.
!pip install wandb



In [1]:
import wandb
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.datasets import fetch_covtype

In [2]:
# Authenticate with Weights & Biases before any runs are created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbrinashong[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Execute the shared helper notebook in this kernel's namespace.
# NOTE(review): `evaluate` and `wandb_log`, used further down, are not defined in
# this notebook — presumably they come from here; confirm.
%run 'common.ipynb'

In [4]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, log its training time to W&B, and score it on the test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_test: Held-out labels compared against the predictions.

    Returns:
        Whatever ``evaluate`` returns for the test-set predictions. The
        training loop in this notebook unpacks it as
        ``(conf_matrix, class_report, acc_score)``.

    Note:
        ``evaluate`` is not defined in this notebook (presumably provided by
        ``common.ipynb``). ``wandb.log`` is called here, so a W&B run is
        expected to be active already; the training loop calls ``wandb.init``
        before invoking this function.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long fit (time.time() can be).
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_started

    print("Train operation time: = ", training_time, "seconds")
    wandb.log({"Training time (seconds)": training_time})

    # Predict on the held-out set and delegate metric computation/printing.
    y_pred = model.predict(X_test)
    return evaluate(y_test, y_pred, f"\nModel: {model.__class__.__name__}")

In [5]:
# Baseline classifiers keyed by display name. The key is also used as the W&B
# run name and to decide which models are trained on the scaled features.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded like the other tree-based models: without random_state the tree can
    # differ between runs, which makes the baseline non-reproducible.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine (SVM)": SVC()
}

In [6]:
# Execute the dataset notebook in this kernel's namespace.
# NOTE(review): the cells below rely on `all_df`, `TARGET_COLUMN` and
# `NUMERICAL_COLUMNS`, none of which are defined in this notebook — presumably
# they are created here (or in common.ipynb); confirm.
%run covertype.ipynb

Normal class:  0    2
dtype: int32
Feature names:  ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_0', 'Wilderness_Area_1', 'Wilderness_Area_2', 'Wilderness_Area_3', 'Soil_Type_0', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7', 'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12', 'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20', 'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27', 'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39']


In [7]:
# Preview the first rows. The features are expected to be one-hot encoded and
# the target label-encoded already by the dataset notebook.
all_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [8]:
# Separate the label column from the feature columns of all_df.
y_df = all_df[TARGET_COLUMN]
X_df = all_df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each of the four pieces on a single line.
print(*(part.shape for part in (X_train_df, X_test_df, y_train_df, y_test_df)))

(464809, 54) (116203, 54) (464809,) (116203,)


In [9]:
# Standardise the columns listed in NUMERICAL_COLUMNS (needed by some models,
# SVM in particular). The scaler is fitted on the training rows only and then
# applied to both splits; every other column is copied through unchanged.
scaler = StandardScaler().fit(X_train_df[NUMERICAL_COLUMNS])

X_train_scaled_df, X_test_scaled_df = X_train_df.copy(), X_test_df.copy()
for scaled_part in (X_train_scaled_df, X_test_scaled_df):
    scaled_part[NUMERICAL_COLUMNS] = scaler.transform(scaled_part[NUMERICAL_COLUMNS])

In [10]:
# Models named here are trained and evaluated on the standardised features;
# every other model uses the raw features.
# NOTE(review): K-Nearest Neighbors is also distance-based — consider whether it
# should be in this set too (left out here to keep the existing baselines).
SCALED_MODEL_NAMES = {"Logistic Regression", "Support Vector Machine (SVM)"}

# Train and evaluate each model inside its own W&B run.
for model_name, model in models.items():
    # Using the run as a context manager guarantees it is finished even when
    # training or logging raises or is interrupted, so no run is left open.
    with wandb.init(project="ML-model-baselines-covertype-dataset", name=model_name):
        if model_name in SCALED_MODEL_NAMES:
            X_fit, X_eval = X_train_scaled_df, X_test_scaled_df
        else:
            X_fit, X_eval = X_train_df, X_test_df

        conf_matrix, class_report, acc_score = evaluate_model(model, X_fit, y_train_df, X_eval, y_test_df)
        wandb_log(conf_matrix, class_report, acc_score)

Train operation time: =  0.021079540252685547 seconds

Model: KNeighborsClassifier
Confusion Matrix:
[[41203  1251     1     0    28     3    71]
 [ 1138 55110    71     0   119    47    15]
 [    3    84  6911    18     7    98     0]
 [    0     1    74   416     0    35     0]
 [   25   154    17     0  1789    10     0]
 [    4    83   124    16     8  3254     0]
 [  108    24     0     0     0     0  3883]]

Classification Report:
{'1': {'precision': 0.9699159624302629, 'recall': 0.9681838475456447, 'f1-score': 0.9690491309767398, 'support': 42557}, '2': {'precision': 0.9718376919956971, 'recall': 0.9753982300884956, 'f1-score': 0.9736147058044115, 'support': 56500}, '3': {'precision': 0.9601278132814671, 'recall': 0.9705097598651875, 'f1-score': 0.9652908722676166, 'support': 7121}, '4': {'precision': 0.9244444444444444, 'recall': 0.7908745247148289, 'f1-score': 0.8524590163934426, 'support': 526}, '5': {'precision': 0.9169656586365966, 'recall': 0.8967418546365915, 'f1-score': 

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.9581977878985036, max=1.0…

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.9687
Training time (seconds),0.02108
f1-score_avg,0.96864
precision_avg,0.96865
recall_avg,0.9687


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114269066658583, max=1.0…

Train operation time: =  6591.288116931915 seconds

Model: SVC
Confusion Matrix:
[[33729  8427     8     0     4    12   377]
 [ 5307 50502   280     2    77   295    37]
 [    1   451  6242    36     1   390     0]
 [    0     0   134   365     0    27     0]
 [  136  1204    44     0   598    13     0]
 [    1   450  1035    26     1  1976     0]
 [  653    21     0     0     0     0  3341]]

Classification Report:
{'1': {'precision': 0.846887789690411, 'recall': 0.7925605658293583, 'f1-score': 0.8188240435035928, 'support': 42557}, '2': {'precision': 0.8271558430922938, 'recall': 0.8938407079646018, 'f1-score': 0.8592063289524052, 'support': 56500}, '3': {'precision': 0.8061474880537259, 'recall': 0.8765622805785704, 'f1-score': 0.8398815931108717, 'support': 7121}, '4': {'precision': 0.8508158508158508, 'recall': 0.6939163498098859, 'f1-score': 0.7643979057591623, 'support': 526}, '5': {'precision': 0.8781204111600588, 'recall': 0.29974937343358393, 'f1-score': 0.44693572496263073,

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.956994482310938, max=1.0)…

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.83262
Training time (seconds),6591.28812
f1-score_avg,0.82909
precision_avg,0.83327
recall_avg,0.83262


In [None]:
# Finish the currently active W&B run, if there is one. Presumably kept as a
# manual safety net for a run left open by an interrupted training loop.
wandb.finish()