In [1]:
from os.path import join
from copy import deepcopy
from tqdm import tqdm as TQ

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%precision 3
%matplotlib inline
sns.set_style('whitegrid');
plt.style.use('default-style');
np.set_printoptions(precision = 3, threshold = 15)

In [3]:
# ignore specific warnings
import warnings
warnings.simplefilter("ignore", FutureWarning)

In [4]:
from sklearn.preprocessing import (
    MinMaxScaler,
    LabelEncoder
)

In [5]:
import tensorflow as tf
print('Tensorflow Version: {}'.format(tf.__version__))

# check physical devices
tf.config.list_physical_devices()

Tensorflow Version: 2.3.1


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [6]:
# https://www.analyticsvidhya.com/blog/2021/04/how-to-reduce-memory-usage-in-python-pandas/
# https://towardsdatascience.com/reducing-memory-usage-in-pandas-with-smaller-datatypes-b527635830af

calculateMemory = lambda frame : frame.memory_usage(deep = True).sum() / 1024 ** 2 # return usage in MB

def limitNumeric(frame : pd.DataFrame, verbose : bool = True) -> pd.DataFrame:
    """Given a DataFrame (frame) - the function considers each numeric columns (integer and/or float) and sets the data type to any of `np.dtypes` to Reduce Memory Usage"""
    
    if verbose:
        actual = calculateMemory(frame)
    
    frame = deepcopy(frame)
    
    # foreach column calculate the min and max value
    # and map the data to its relevant unit category - int8, int16, int32 or int64
    # by default - pandas treats each numeric column to its highest number base - int64/float64
    for col in TQ(frame.columns, desc = "converting dtypes"):
        c_min = frame[col].min()
        c_max = frame[col].max()
        
        if c_min > np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
            frame[col] = frame[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
            frame[col] = frame[col].astype(np.int16)
        if c_min > np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
            frame[col] = frame[col].astype(np.int32)
        else:
            frame[col] = frame[col].astype(np.int64)
            
    if verbose:
        final = calculateMemory(frame)
        print(f"Actual Size : {actual:.2f} MB | Final Size : {final:.2f} MB || Reduction Ration = {((actual - final) / actual) * 100:.2f}%")
        
    return frame

#### Loading Data

In [7]:
TRAIN_DATA = join(".", "output", "trainSynthesized.csv")
EVALUATION_DATA = join(".", "output", "testAddedFeatures.csv")

In [34]:
dataTrain = pd.read_csv(TRAIN_DATA)
dataTrain = limitNumeric(dataTrain)

dataTrain.head()

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [00:23<00:00,  2.44it/s]


Actual Size : 2516.85 MB | Final Size : 1258.42 MB || Reduction Ration = 50.00%


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,distance,soil_type_count,wilderness_area_count,Cover_Type
0,2017,345,26,136,155,1341,108,178,161,652,...,0,0,0,0,0,0,206,1,1,3
1,2327,335,27,162,59,133,147,144,173,383,...,0,0,0,0,0,0,172,1,1,3
2,2089,309,20,226,97,911,164,216,212,621,...,0,0,0,0,0,0,245,1,1,3
3,2171,339,34,141,112,1385,161,198,180,1085,...,0,0,0,0,0,0,180,1,1,3
4,2211,274,24,263,65,1158,172,236,220,977,...,0,0,0,0,0,0,270,1,1,3


In [35]:
dataTest = pd.read_csv(EVALUATION_DATA)
dataTest = limitNumeric(dataTest)

dataTest.head()

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57/57 [00:03<00:00, 15.43it/s]


Actual Size : 434.88 MB | Final Size : 217.44 MB || Reduction Ration = 50.00%


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,distance,soil_type_count,wilderness_area_count
0,2763,78,20,377,88,3104,218,213,195,1931,...,0,0,0,0,0,0,0,387,0,0
1,2826,153,11,264,39,295,219,238,148,2557,...,0,0,0,0,0,0,0,266,1,0
2,2948,57,19,56,44,852,202,217,163,1803,...,0,0,0,0,0,0,0,71,1,0
3,2926,119,6,158,134,2136,234,240,142,857,...,0,0,0,0,0,0,0,207,0,2
4,2690,10,4,38,108,3589,213,221,229,431,...,0,0,0,0,0,0,0,114,1,1


#### Feature Engineering

In [36]:
X = dataTrain.drop(columns = "Cover_Type")
y = dataTrain.Cover_Type

In [37]:
scaler = MinMaxScaler()
encoder = LabelEncoder()

In [38]:
XScaled = scaler.fit_transform(X)
yScaled = encoder.fit_transform(y)

In [39]:
testScaled = scaler.transform(dataTest)

#### Creating Model

In [40]:
INPUT_SHAPE = XScaled.shape[1]
OUTPUT_SHAPE = encoder.classes_.shape[0]

INPUT_SHAPE, OUTPUT_SHAPE

(57, 7)

In [51]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape = (INPUT_SHAPE, ), activation = "relu", kernel_initializer = "lecun_normal", name = "iLayer"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation = "relu", kernel_initializer = "lecun_normal"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation = "relu", kernel_initializer = "lecun_normal"),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(32, activation = "relu", kernel_initializer = "lecun_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(OUTPUT_SHAPE, activation = "softmax", name = "oLayer"),
], name = "DFC-1.1.0")

model.summary(line_length = 127)

Model: "DFC-1.1.0"
_______________________________________________________________________________________________________________________________
Layer (type)                                             Output Shape                                      Param #             
iLayer (Dense)                                           (None, 64)                                        3712                
_______________________________________________________________________________________________________________________________
dropout_61 (Dropout)                                     (None, 64)                                        0                   
_______________________________________________________________________________________________________________________________
dense_61 (Dense)                                         (None, 128)                                       8320                
_____________________________________________________________________________________

In [52]:
model.compile(
    optimizer = "adam",
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
)

In [53]:
model.fit(XScaled, yScaled, epochs = 10, batch_size = 512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21bd4bc7a30>

In [None]:
yPredicted = model.predict(testScaled)

# considering argmax
yPredictedMax = [np.argmax(i) + 1 for i in yPredicted]