In [1]:
# run if need to install
!pip install ucimlrepo



# Dataset & Preprocessing:

Load in the UCI Covertype Dataset

In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
covertype = fetch_ucirepo(id=31)

# data (as pandas dataframes)
X = covertype.data.features
y = covertype.data.targets

# print basic info
print("Feature DataFrame Shape:", X.shape)
print("Target DataFrame Shape:", y.shape)
print("Head of features:")
print(X.head())

print("\nHead of target:")
print(y.head())

Feature DataFrame Shape: (581012, 54)
Target DataFrame Shape: (581012, 1)
Head of features:
   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0       2596      51      3                               258   
1       2590      56      2                               212   
2       2804     139      9                               268   
3       2785     155     18                               242   
4       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0            221             232            1

We don't need to handle missing values because the dataset specifices that there are no missing values within the information, but we can check just to ensure.

In [3]:
missing_values = X.isnull().sum().sum()
print("Total missing values in feature:", missing_values)

missing_values_target = y.isnull().sum().sum()
print("Total missing values in target:", missing_values_target)

Total missing values in feature: 0
Total missing values in target: 0


We can see we have no missing values. Let's now normalize numerical features and shuffle our data (just in case) since cover_type, our target variable, is a categorical variable and we want representation of each cover_type in the training set. Let's also split the data into a training and validation set.

In [4]:
import numpy as np

# create a shuffled index array
indices = np.random.permutation(len(X))

# len of X is 581012
lenX = len(X)
train_size = int(lenX*0.7)

X_train, y_train  = X.iloc[indices[:train_size]], y.iloc[indices[:train_size]]
X_val, y_val = X.iloc[indices[train_size:]], y.iloc[indices[train_size:]]

# check sample sizes
print("Number of samples in train: ", len(X_train)) # should return 406708
print("Number of samples in val: ", len(X_val)) # should return 174304

Number of samples in train:  406708
Number of samples in val:  174304


The numerical features are in Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshade_9am, Hillshade_Noon, Hillshade_3pm, and Horizontal_Distance_To_Fire_Points. The rest of the variables in X, such as Wilderness_Area and Soil_Type, are binary so we don't need to normalize them.

In [5]:
from sklearn.preprocessing import StandardScaler

numerical_features = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
                      "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
                      "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
                      "Horizontal_Distance_To_Fire_Points"]

# standardize function
scaler = StandardScaler()

# fit only on training data on only numerical features
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

# transform validation data using the same scaler
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# check standardization
print(X_train.head())
print(X_val.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[numerical_features] = scaler.transform(X_val[numerical_features])


        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
155860  -0.240702 -0.308953  0.520116                         -0.759581   
323222  -0.725952 -0.791421 -0.948235                         -0.505524   
252813   0.041172 -1.113066  0.386630                         -0.467886   
360232  -0.287086 -1.354300  0.253143                          0.614208   
486821  -0.122957  0.387946  1.588008                         -0.820743   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
155860                       -0.470283                         0.634815   
323222                       -0.607554                        -1.235363   
252813                       -0.401647                        -1.299498   
360232                       -0.075627                        -0.216899   
486821                       -0.144262                        -0.433675   

        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
155860       1.338694       -0.015374     

The last step of preprocessing the data is encoding categorical variables. Our categorical variables (Soil_Type and Wilderness_Area) are already binary. But we need to encode our target variable Cover_Type:

In [6]:
from tensorflow.keras.utils import to_categorical

# convert Cover_Type range from 1-7 to 0-6
y_train = y_train - 1
y_val = y_val - 1

# one-hot encode the cover_type
y_train_encoded = to_categorical(y_train, num_classes=7)  # 7 classes for Cover_Type
y_val_encoded = to_categorical(y_val, num_classes=7)


# Model Architecture:

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, models

# x features should be number of input dims
input_dim = X_train.shape[1]
inputs = layers.Input(shape=(input_dim,))

# first dense layer to process input
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dense(64, activation='relu')(x)

# custom residual block with at least two Dense layers
residual_input = x
x = layers.Dense(32, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)

# linear projection bc dims differ
lin_proj = layers.Dense(32)(residual_input)

# Add residual connection
x = layers.Add()([x, lin_proj])

# skip connection
skip_input = layers.Dense(32, activation='relu')(inputs)
x = layers.Add()([x, skip_input])

# last layers
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)

# output Layer (7 classes for target)
output = layers.Dense(7, activation='softmax')(x)

# create/compile model
model = models.Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()
model.save("my_model.h5")



# Training and Evaluation

In [12]:
# select a single batch of 128
X_batch = X_train[:128]
y_batch = y_train_encoded[:128]

# overfit on just this batch
history = model.fit(
    X_batch, y_batch,
    batch_size=128,
    epochs=40,  # or until overfit
    verbose=1
)

Epoch 1/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 1.0000 - loss: 0.0096 - val_accuracy: 0.6268 - val_loss: 2.1406
Epoch 2/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step - accuracy: 1.0000 - loss: 0.0092 - val_accuracy: 0.6265 - val_loss: 2.1540
Epoch 3/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 1.0000 - loss: 0.0088 - val_accuracy: 0.6263 - val_loss: 2.1666
Epoch 4/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 1.0000 - loss: 0.0084 - val_accuracy: 0.6263 - val_loss: 2.1788
Epoch 5/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step - accuracy: 1.0000 - loss: 0.0080 - val_accuracy: 0.6263 - val_loss: 2.1904
Epoch 6/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step - accuracy: 1.0000 - loss: 0.0077 - val_accuracy: 0.6263 - val_loss: 2.2020
Epoch 7/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [14]:
# evaluate on validation set
val_eval = model.evaluate(X_val, y_val_encoded)

# final conclusions
print(val_eval)

[1m5447/5447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.6257 - loss: 2.4897
[2.4870052337646484, 0.6243287324905396]


we can see this model performs poorly with an accuracy of 0.6257 and a loss of 2.4897 showing a lack of generalization.

In [15]:
# eval on training batch (should be close to 0)
train_loss, train_acc = model.evaluate(X_batch, y_batch, verbose=0)

# eval on full validation set (should be high)
val_loss, val_acc = model.evaluate(X_val, y_val_encoded, verbose=0)

print("====== Overfitting Experiment Results ======")
print(f"Number of parameters: {model.count_params()}")
print(f"Final training loss: {train_loss:.4f}") # close enough to 0
print(f"Final validation loss: {val_loss:.4f}")

Number of parameters: 19079
Final training loss: 0.0024
Final validation loss: 2.4870
