In [50]:
from time import ctime
from uuid import uuid4
from os.path import join
from copy import deepcopy
from tqdm import tqdm as TQ

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%precision 3
%matplotlib inline
sns.set_style('whitegrid');
plt.style.use('default-style');
np.set_printoptions(precision = 3, threshold = 15)

In [3]:
from sklearn.preprocessing import (
    MinMaxScaler,
    LabelEncoder
)

In [4]:
import tensorflow as tf
print('Tensorflow Version: {}'.format(tf.__version__))

# check physical devices
tf.config.list_physical_devices()

Tensorflow Version: 2.3.1


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [5]:
# ignore specific warnings
import warnings
warnings.simplefilter("ignore", FutureWarning)

In [6]:
# https://www.analyticsvidhya.com/blog/2021/04/how-to-reduce-memory-usage-in-python-pandas/
# https://towardsdatascience.com/reducing-memory-usage-in-pandas-with-smaller-datatypes-b527635830af

calculateMemory = lambda frame : frame.memory_usage(deep = True).sum() / 1024 ** 2 # return usage in MB

def limitNumeric(frame : pd.DataFrame, verbose : bool = True, **kwargs) -> pd.DataFrame:
    """Given a DataFrame (frame) - the function considers each numeric columns (integer and/or float) and sets the data type to any of `np.dtypes` to Reduce Memory Usage"""
    
    if verbose:
        actual = calculateMemory(frame)
    
    frame = deepcopy(frame)
    
    # foreach column calculate the min and max value
    # and map the data to its relevant unit category - int8, int16, int32 or int64
    # by default - pandas treats each numeric column to its highest number base - int64/float64
    for col in TQ(frame.columns, desc = "converting dtypes"):
        c_min = frame[col].min()
        c_max = frame[col].max()
        
        if c_min > np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
            frame[col] = frame[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
            frame[col] = frame[col].astype(np.int16)
        if c_min > np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
            frame[col] = frame[col].astype(np.int32)
        else:
            frame[col] = frame[col].astype(np.int64)
            
    if verbose:
        final = calculateMemory(frame)
        print(f"Actual Size : {actual:.2f} MB | Final Size : {final:.2f} MB || Reduction Ration = {((actual - final) / actual) * 100:.2f}%")
        
    return frame

def loadData(path : str, reduce_memory : bool = True, **kwargs) -> pd.DataFrame:
    """Load a CSV File into Memory, with Optional Arguments"""
    
    frame = pd.read_csv(path, **kwargs)
    
    if reduce_memory: # reduces the dtypes
        frame = limitNumeric(frame, **kwargs)
        
    return frame

In [7]:
TRAIN_DATA = join(".", "output", "train-100.csv")
EVALUATION_DATA = join(".", "output", "test-100.csv")

In [8]:
dataTrain = loadData(TRAIN_DATA)
dataTrain.sample(5)

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:25<00:00,  2.54it/s]


Actual Size : 2777.21 MB | Final Size : 1388.60 MB || Reduction Ration = 50.00%


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,distanceHydro,distanceRoads,distanceFires,soil_type_count_r,soil_type_count_vs,soil_type_count_es,soil_type_count_s,soil_type_count_total,wilderness_area_count,Cover_Type
1116045,2199,60,29,18,16,1206,236,214,136,1224,...,24,2510,2520,0,0,0,0,1,1,4
5411671,2953,114,39,97,111,981,181,209,184,7633,...,147,3111,8184,1,0,0,0,1,1,5
1286810,2298,153,37,64,35,1112,225,217,165,809,...,74,2554,2437,0,0,0,0,1,1,6
557347,2460,3,21,40,20,277,199,218,149,1029,...,45,2476,2667,0,0,0,0,1,1,6
640682,2531,53,14,53,0,2624,178,223,183,1102,...,53,3646,2761,1,0,0,0,1,1,6


In [9]:
dataTest = loadData(EVALUATION_DATA)
dataTest.sample(5)

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:04<00:00, 15.16it/s]


Actual Size : 480.65 MB | Final Size : 240.33 MB || Reduction Ration = 50.00%


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type40,distanceHydro,distanceRoads,distanceFires,soil_type_count_r,soil_type_count_vs,soil_type_count_es,soil_type_count_s,soil_type_count_total,wilderness_area_count
578810,2378,310,15,98,34,674,185,233,102,1087,...,1,103,2471,2614,0,0,1,1,1,1
58958,3388,243,26,514,-9,3218,203,251,182,3811,...,0,514,4672,5099,0,0,0,0,0,0
489560,2960,277,25,177,26,1721,223,238,179,1926,...,0,178,3423,3531,1,0,0,0,1,0
663634,3568,14,0,147,4,1265,203,216,128,734,...,0,147,3785,3642,0,0,0,0,0,1
149057,2665,-6,29,416,98,941,191,227,131,596,...,0,427,2826,2730,0,0,1,1,2,1


In [10]:
scaler = MinMaxScaler()
encoder = LabelEncoder()

In [31]:
scaler.fit(dataTrain.drop(columns = "Cover_Type"));
# encoder.fit(dataTrain.Cover_Type);

In [14]:
INPUT_SHAPE = XScaled.shape[1]
OUTPUT_SHAPE = encoder.classes_.shape[0]

INPUT_SHAPE, OUTPUT_SHAPE

(63, 7)

In [33]:
testScaled = scaler.transform(dataTest)

In [16]:
def ANN(modelName : str):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, input_shape = (INPUT_SHAPE, ), activation = "relu", name = "iLayer"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, activation = "relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(256, activation = "relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation = "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(OUTPUT_SHAPE, activation = "softmax", name = "oLayer"),
    ], name = modelName)
    
    model.compile(
        optimizer = "adam",
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    )
    
    return model

In [17]:
models = [ANN("DCNV2-" + str(uuid4())[:7].upper()) for _ in TQ(range(5))] # 2262087 / 391500 ~ 5.78

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.04s/it]


In [34]:
dataSubsets = [
    pd.concat([
        dataTrain[dataTrain.Cover_Type > 2],
        dataTrain[dataTrain.Cover_Type == 1].sample(391500),
        dataTrain[dataTrain.Cover_Type == 2].sample(391500),
    ])
for _ in TQ(range(5))]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.56s/it]


In [44]:
def getXY(frame : pd.DataFrame):
    frame = deepcopy(frame)
    
    X = frame.drop(columns = "Cover_Type")
    y = frame.Cover_Type
    
    X = scaler.transform(X)
    y = encoder.fit_transform(y)
    
    return X, y

In [53]:
histories = []
predictions = []
predictionsMax = []
for idx, (data, model) in enumerate(zip(dataSubsets, models)):
    print(ctime(), f"Model-#{idx + 1}")
    
    X, y = getXY(data)
    histories.append(model.fit(X, y, epochs = 10, batch_size = 512))
    
    yPredicted = model.predict(testScaled)
    predictions.append(yPredicted)
    predictionsMax.append([np.argmax(i) + 1 for i in yPredicted])

Fri Dec 17 00:58:20 2021 Model-#1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fri Dec 17 01:00:31 2021 Model-#2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fri Dec 17 01:02:44 2021 Model-#3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fri Dec 17 01:04:57 2021 Model-#4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fri Dec 17 01:07:12 2021 Model-#5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [86]:
from collections import Counter

In [90]:
majorityVote = []
for i in TQ(range(1000000)):
    majorityVote.append(Counter([
        predictionsMax[0][i],
        predictionsMax[1][i],
        predictionsMax[2][i],
        predictionsMax[3][i],
        predictionsMax[4][i]
    ]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 358139.94it/s]


In [96]:
majorityVote[802]

Counter({2: 5})