In [6]:
from os.path import join
from copy import deepcopy
from tqdm import tqdm as TQ

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%precision 3
%matplotlib inline
sns.set_style('whitegrid');
plt.style.use('default-style');
np.set_printoptions(precision = 3, threshold = 15)

In [8]:
# ignore specific warnings
import warnings
warnings.simplefilter("ignore", FutureWarning)

In [36]:
from imblearn.over_sampling import (
    SMOTE, # https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis#SMOTE
    ADASYN # https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis#ADASYN
)

from sklearn.preprocessing import (
    MinMaxScaler,
    LabelEncoder
)

In [5]:
import tensorflow as tf
print('Tensorflow Version: {}'.format(tf.__version__))

# check physical devices
tf.config.list_physical_devices()

Tensorflow Version: 2.3.1


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [14]:
# https://www.analyticsvidhya.com/blog/2021/04/how-to-reduce-memory-usage-in-python-pandas/
# https://towardsdatascience.com/reducing-memory-usage-in-pandas-with-smaller-datatypes-b527635830af

calculateMemory = lambda frame : frame.memory_usage(deep = True).sum() / 1024 ** 2 # return usage in MB

def limitNumeric(frame : pd.DataFrame, verbose : bool = True, **kwargs) -> pd.DataFrame:
    """Given a DataFrame (frame) - the function considers each numeric columns (integer and/or float) and sets the data type to any of `np.dtypes` to Reduce Memory Usage"""
    
    if verbose:
        actual = calculateMemory(frame)
    
    frame = deepcopy(frame)
    
    # foreach column calculate the min and max value
    # and map the data to its relevant unit category - int8, int16, int32 or int64
    # by default - pandas treats each numeric column to its highest number base - int64/float64
    for col in TQ(frame.columns, desc = "converting dtypes"):
        c_min = frame[col].min()
        c_max = frame[col].max()
        
        if c_min > np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
            frame[col] = frame[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
            frame[col] = frame[col].astype(np.int16)
        if c_min > np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
            frame[col] = frame[col].astype(np.int32)
        else:
            frame[col] = frame[col].astype(np.int64)
            
    if verbose:
        final = calculateMemory(frame)
        print(f"Actual Size : {actual:.2f} MB | Final Size : {final:.2f} MB || Reduction Ration = {((actual - final) / actual) * 100:.2f}%")
        
    return frame

#### Loading Data

In [18]:
TRAIN_DATA = join(".", "train.csv")
EVALUATION_DATA = join(".", "test.csv")

In [15]:
def loadData(path : str, reduce_memory : bool = True, **kwargs) -> pd.DataFrame:
    """Load a CSV File into Memory, with Optional Arguments"""
    
    frame = pd.read_csv(path, **kwargs)
    
    if reduce_memory: # reduces the dtypes
        frame = limitNumeric(frame, **kwargs)
        
    return frame

In [20]:
dataTrain = loadData(TRAIN_DATA, index_col = 0)
dataTrain.head()

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55/55 [00:13<00:00,  4.21it/s]


Actual Size : 1708.98 MB | Final Size : 869.75 MB || Reduction Ration = 49.11%


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3189,40,8,30,13,3270,206,234,193,4873,...,0,0,0,0,0,0,0,0,0,1
1,3026,182,5,280,29,3270,233,240,106,5423,...,0,0,0,0,0,0,0,0,0,2
2,3106,13,7,351,37,2914,208,234,137,5269,...,0,0,0,0,0,0,0,0,0,1
3,3022,276,13,192,16,3034,207,238,156,2866,...,0,0,0,0,0,0,0,0,0,2
4,2906,186,13,266,22,2916,231,231,154,2642,...,0,0,0,0,0,0,0,0,0,2


In [21]:
dataTest = loadData(EVALUATION_DATA, index_col = 0)
dataTest.head()

converting dtypes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:03<00:00, 17.19it/s]


Actual Size : 419.62 MB | Final Size : 213.62 MB || Reduction Ration = 49.09%


Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4000000,2763,78,20,377,88,3104,218,213,195,1931,...,0,0,0,0,0,0,0,0,0,0
4000001,2826,153,11,264,39,295,219,238,148,2557,...,0,0,0,0,0,0,0,0,0,0
4000002,2948,57,19,56,44,852,202,217,163,1803,...,0,0,1,0,0,0,0,0,0,0
4000003,2926,119,6,158,134,2136,234,240,142,857,...,0,0,0,0,0,0,0,0,0,0
4000004,2690,10,4,38,108,3589,213,221,229,431,...,0,0,0,0,0,0,0,0,0,0


#### Feature Engineering

In [30]:
soilTypes = {
    1   : "Cathedral family - Rock outcrop complex, extremely stony.",
    2   : "Vanet - Ratake families complex, very stony.",
    3   : "Haploborolis - Rock outcrop complex, rubbly.",
    4   : "Ratake family - Rock outcrop complex, rubbly.",
    5   : "Vanet family - Rock outcrop complex complex, rubbly.",
    6   : "Vanet - Wetmore families - Rock outcrop complex, stony.",
    7   : "Gothic family.",
    8   : "Supervisor - Limber families complex.",
    9   : "Troutville family, very stony.",
    10  : "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
    11  : "Bullwark - Catamount families - Rock land complex, rubbly.",
    12  : "Legault family - Rock land complex, stony.",
    13  : "Catamount family - Rock land - Bullwark family complex, rubbly.",
    14  : "Pachic Argiborolis - Aquolis complex.",
    15  : "unspecified in the USFS Soil and ELU Survey.",
    16  : "Cryaquolis - Cryoborolis complex.",
    17  : "Gateview family - Cryaquolis complex.",
    18  : "Rogert family, very stony.",
    19  : "Typic Cryaquolis - Borohemists complex.",
    20  : "Typic Cryaquepts - Typic Cryaquolls complex.",
    21  : "Typic Cryaquolls - Leighcan family, till substratum complex.",
    22  : "Leighcan family, till substratum, extremely bouldery.",
    23  : "Leighcan family, till substratum - Typic Cryaquolls complex.",
    24  : "Leighcan family, extremely stony.",
    25  : "Leighcan family, warm, extremely stony.",
    26  : "Granile - Catamount families complex, very stony.",
    27  : "Leighcan family, warm - Rock outcrop complex, extremely stony.",
    28  : "Leighcan family - Rock outcrop complex, extremely stony.",
    29  : "Como - Legault families complex, extremely stony.",
    30  : "Como family - Rock land - Legault family complex, extremely stony.",
    31  : "Leighcan - Catamount families complex, extremely stony.",
    32  : "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
    33  : "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
    34  : "Cryorthents - Rock land complex, extremely stony.",
    35  : "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
    36  : "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
    37  : "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
    38  : "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
    39  : "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
    40  : "Moran family - Cryorthents - Rock land complex, extremely stony."
}

In [31]:
rubblyStonyCats = [f"Soil_Type{k}" for k, v in soilTypes.items() if "rubbly" in v.lower()]

veryStonyCats = [f"Soil_Type{k}" for k, v in soilTypes.items() if "very stony" in v.lower()]
extremelyStonyCats = [f"Soil_Type{k}" for k, v in soilTypes.items() if "extremely stony" in v.lower()]

In [29]:
# calculate euclidean distance
distance = lambda xVals, yVals : np.array((xVals ** 2 + yVals ** 2) ** 0.5, dtype = np.float64)

In [33]:
def featureEngineering(frame : pd.DataFrame) -> pd.DataFrame:
    """Apply Feature Engineering on the Given DataFrame"""
    
    frame = deepcopy(frame) # keeps the original data intact - useful in quick analysis
    
    # calculate resource distances
    frame["distanceHydro"] = distance(frame["Horizontal_Distance_To_Hydrology"].values, frame["Vertical_Distance_To_Hydrology"].values)
    frame["distanceRoads"] = distance(frame["Elevation"].values, frame["Horizontal_Distance_To_Roadways"].values)
    frame["distanceFires"] = distance(frame["Elevation"].values, frame["Horizontal_Distance_To_Fire_Points"].values)
    
    # calculate different soil type and wilderness type count
    frame["soil_type_count_r"] = frame[rubblyStonyCats].sum(axis = 1)
    frame["soil_type_count_vs"] = frame[veryStonyCats].sum(axis = 1)
    frame["soil_type_count_es"] = frame[extremelyStonyCats].sum(axis = 1)
    frame["soil_type_count_s"] = frame[["soil_type_count_vs", "soil_type_count_es"]].sum(axis = 1)
    frame["soil_type_count_total"] = frame[[x for x in frame.columns if x.startswith("Soil_Type")]].sum(axis = 1)
    frame["wilderness_area_count"] = frame[[x for x in frame.columns if x.startswith("Wilderness_Area")]].sum(axis = 1)
    
    return frame

In [34]:
dataTrain = featureEngineering(dataTrain)
dataTest = featureEngineering(dataTest)

In [None]:
smote = SMOTE(k_neighbors = 7)
adasyn = ADASYN(n_neighbors = 14)

In [41]:
# bring the count around 195712 (class-3) using ADASYN for 4,6,7
Xy = dataTrain[(dataTrain.Cover_Type >= 3) & (dataTrain.Cover_Type != 5)]

X = Xy.drop(columns = "Cover_Type")
y = Xy.Cover_Type

del Xy # house-keeping

In [42]:
XSynth, ySynth = adasyn.fit_resample(X, y)
ySynth.value_counts()

4    195713
3    195712
7    195695
6    195667
Name: Cover_Type, dtype: int64

In [66]:
Xy = XSynth
Xy["Cover_Type"] = ySynth.values

Xy = pd.concat([
    Xy,
    dataTrain[dataTrain.Cover_Type == 1].sample(391500),
    dataTrain[dataTrain.Cover_Type == 2].sample(391500)
], ignore_index = True)

Xy.sample(5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,distanceHydro,distanceRoads,distanceFires,soil_type_count_r,soil_type_count_vs,soil_type_count_es,soil_type_count_s,soil_type_count_total,wilderness_area_count,Cover_Type
695677,3150,81,9,1189,117,494,212,208,104,702,...,1194.948886,3189.625919,3228.717264,0,0,1,1,1,1,7
1106466,3190,83,7,319,-4,629,206,226,143,3663,...,319.025077,3251.421381,4857.331469,1,0,1,1,2,0,1
1550279,3059,25,22,283,-15,5343,204,258,157,731,...,283.397248,6156.714221,3145.129886,0,0,1,1,1,1,2
1109822,3212,5,25,199,9,178,182,219,113,6202,...,199.203414,3216.928349,6984.393173,0,0,0,0,0,1,1
1340420,2894,245,38,291,146,5989,195,245,224,295,...,325.571805,6651.568011,2908.996562,1,0,0,0,1,1,2


In [68]:
X = Xy.drop(columns = "Cover_Type")
y = Xy.Cover_Type

XSynth, ySynth = adasyn.fit_resample(X, y)
ySynth.value_counts()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-68-110e96995997>", line 4, in <module>
    XSynth, ySynth = adasyn.fit_resample(X, y)
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\imblearn\base.py", line 83, in fit_resample
    output = self._fit_resample(X, y)
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\imblearn\over_sampling\_adasyn.py", line 128, in _fit_resample
    nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\sklearn\neighbors\_base.py", line 749, in kneighbors
    chunked_results = list(
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\sklearn\metrics\pairwise.py", line 1721, in pairwise_distances_chunked
    D_chunk = reduce_func(D_chunk, sl.start)
  File "D:\Anaconda3\envs\TensorFlow\lib\site-packages\skl

TypeError: object of type 'NoneType' has no len()

In [None]:
Xy = XSynth
Xy["Cover_Type"] = ySynth.values

parts = pd.concat([
    Xy[Xy.Cover_Type > 2],
    dataTrain[dataTrain.Cover_Type == 1].sample(500000),
    dataTrain[dataTrain.Cover_Type == 2].sample(500000)
])

X = parts.drop(columns = "Cover_Type")
y = parts.Cover_Type

XSynth, ySynth = smote.fit_resample(X, y)
ySynth.value_counts()