In [1]:
import re
from sklearn.datasets import fetch_covtype # dataset
from sklearn.model_selection import train_test_split # split dataset into training/test sets
from imblearn.under_sampling import RandomUnderSampler 
from collections import defaultdict, Counter
import pandas as pd
import numpy as np

In [2]:
# Load the covertype dataset through scikit-learn's fetch_covtype();
# the returned object exposes `.data` and `.target`, which the next cell reads.
# The underlying data file is published at:
# "http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
cover_type = fetch_covtype() 

In [3]:
# Feature matrix and class labels of the complete dataset
X, y = cover_type.data, cover_type.target

# Report how many samples each class has before any resampling
print(
    'Original dataset shape:\n {}'.format(Counter(y))
)

Original dataset shape:
 Counter({2: 283301, 1: 211840, 3: 35754, 7: 20510, 6: 17367, 5: 9493, 4: 2747})


### As seen above, the dataset is heavily imbalanced. Using a random undersampler, I undersample the larger classes (sampling with replacement) so that every class ends up with the same number of samples as the rarest one.


In [4]:
# Randomly under-sample (with replacement) so that every class ends up the same size.
# NOTE: the name `ros` is kept for compatibility although this is an *under*-sampler.
#
# `return_indices=True` and `fit_sample` were deprecated and later removed from
# imbalanced-learn; `fit_resample` plus the fitted `sample_indices_` attribute
# is the supported way to get the same three results.
ros = RandomUnderSampler(random_state=42, replacement=True)
X_res, y_res = ros.fit_resample(X, y)
idx_resampled = ros.sample_indices_  # positions of the selected rows within X / y
print('Resampled dataset shape: \n {}'.format(Counter(y_res)))

Resampled dataset shape: 
 Counter({4: 2747, 5: 2747, 2: 2747, 1: 2747, 7: 2747, 3: 2747, 6: 2747})


In [5]:
# Share of the original rows that survive the under-sampling, as a percentage
"Reduce to {:.2f}% of the orignal dataset.".format(
    len(X_res) / len(X) * 100
)

'Reduce to 3.31% of the orignal dataset.'

# To make things even harder we will be using only ≈3% of the complete data. Who doesn't like a challenge?

In [6]:
# Map each 4-digit soil-type (ELU) code to its textual description.
# The insertion order matters: later cells turn the values of this mapping into a
# list and use it as the column names for X_res[:, 14:], so entry i must describe
# soil column i.
# NOTE(review): the descriptions spell the same-looking term both as "Cryaquolis"
# and as "Cryaquolls" (likewise "complex complex" for 2706); these end up as distinct
# tokens downstream -- confirm against the dataset's own documentation before "fixing".
soil_types = { 
    "2702": "Cathedral family - Rock outcrop complex, extremely stony.",
    "2703": "Vanet - Ratake families complex, very stony.", 
    "2704": "Haploborolis - Rock outcrop complex, rubbly.",
    "2705": "Ratake family - Rock outcrop complex, rubbly.",
    "2706": "Vanet family - Rock outcrop complex complex, rubbly.",
    "2717": "Vanet - Wetmore families - Rock outcrop complex, stony.",
    "3501": "Gothic family.",
    "3502": "Supervisor - Limber families complex.",
    "4201": "Troutville family, very stony.",
    "4703": "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
    "4704": "Bullwark - Catamount families - Rock land complex, rubbly.",
    "4744": "Legault family - Rock land complex, stony.",
    "4758": "Catamount family - Rock land - Bullwark family complex, rubbly.",
    "5101": "Pachic Argiborolis - Aquolis complex.",
    "5151": "not_in_survey", # placeholder for "unspecified in the USFS Soil and ELU Survey."
    "6101": "Cryaquolis - Cryoborolis complex.",
    "6102": "Gateview family - Cryaquolis complex.",
    "6731": "Rogert family, very stony.",
    "7101": "Typic Cryaquolis - Borohemists complex.",
    "7102": "Typic Cryaquepts - Typic Cryaquolls complex.",
    "7103": "Typic Cryaquolls - Leighcan family, till substratum complex.",
    "7201": "Leighcan family, till substratum, extremely bouldery.",
    "7202": "Leighcan family, till substratum - Typic Cryaquolls complex.",
    "7700": "Leighcan family, extremely stony.",
    "7701": "Leighcan family, warm, extremely stony.",
    "7702": "Granile - Catamount families complex, very stony.",
    "7709": "Leighcan family, warm - Rock outcrop complex, extremely stony.",
    "7710": "Leighcan family - Rock outcrop complex, extremely stony.",
    "7745": "Como - Legault families complex, extremely stony.",
    "7746": "Como family - Rock land - Legault family complex, extremely stony.",
    "7755": "Leighcan - Catamount families complex, extremely stony.",
    "7756": "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
    "7757": "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
    "7790": "Cryorthents - Rock land complex, extremely stony.",
    "8703": "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
    "8707": "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
    "8708": "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
    "8771": "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
    "8772": "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
    "8776": "Moran family - Cryorthents - Rock land complex, extremely stony."}

### Since the majority of the descriptions above are made up of collocations (e.g. "Cathedral family" or "Rock land complex"), I simplify the data by joining the words of each collocation with underscores so that every collocation becomes a single feature.

In [7]:
def _to_collocations(description):
    """Turn a raw soil description into space-separated collocation tokens.

    The description is lower-cased, periods are dropped, it is split into
    phrases on " - " and ", ", and the words of each phrase are joined with
    underscores, e.g. "Rock land - Gothic family." -> "rock_land gothic_family".
    """
    phrases = re.split(' - |, ', description.lower().replace(".", ""))
    return " ".join(phrase.replace(" ", "_") for phrase in phrases)


# Normalise every description in place (the mapping keeps its keys and order).
for code, description in soil_types.items():
    # Re-running this cell must not change the result. A value that was already
    # normalised contains no " - " / ", " separators any more, so transforming it
    # again would fuse all of its tokens into one. Raw descriptions in this
    # notebook contain capital letters while normalised ones are entirely
    # lower-case, so an all-lower-case value is treated as already done.
    if description.islower():
        continue
    soil_types[code] = _to_collocations(description)

In [8]:
# Display the mapping after the collocation transform to check the result
soil_types

{'2702': 'cathedral_family rock_outcrop_complex extremely_stony',
 '2703': 'vanet ratake_families_complex very_stony',
 '2704': 'haploborolis rock_outcrop_complex rubbly',
 '2705': 'ratake_family rock_outcrop_complex rubbly',
 '2706': 'vanet_family rock_outcrop_complex_complex rubbly',
 '2717': 'vanet wetmore_families rock_outcrop_complex stony',
 '3501': 'gothic_family',
 '3502': 'supervisor limber_families_complex',
 '4201': 'troutville_family very_stony',
 '4703': 'bullwark catamount_families rock_outcrop_complex rubbly',
 '4704': 'bullwark catamount_families rock_land_complex rubbly',
 '4744': 'legault_family rock_land_complex stony',
 '4758': 'catamount_family rock_land bullwark_family_complex rubbly',
 '5101': 'pachic_argiborolis aquolis_complex',
 '5151': 'not_in_survey',
 '6101': 'cryaquolis cryoborolis_complex',
 '6102': 'gateview_family cryaquolis_complex',
 '6731': 'rogert_family very_stony',
 '7101': 'typic_cryaquolis borohemists_complex',
 '7102': 'typic_cryaquepts typic_c

# Next, I add new features by extracting additional information from map unit keys. 
### With a little digging around, I was able to make sense of the soil type's ELU codes (4 digit code associated with each soil type). No pun intended.

### First digit:  climatic zone   
### Second digit:  geologic zones

In [9]:
# Lookup tables for the first two digits of a 4-digit ELU code.

# Digit 1 -> climatic zone
first_digit = {
    "1": "lower montane dry",
    "2": "lower montane",
    "3": "montane dry",
    "4": "montane",
    "5": "montane dry and montane",
    "6": "montane and subalpine",
    "7": "subalpine",
    "8": "alpine",
}

# Digit 2 -> geologic zone
second_digit = {
    "1": "alluvium",
    "2": "glacial",
    "3": "shale",
    "4": "sandstone",
    "5": "mixed sedimentary",
    "6": "unspecified in the USFS ELU Survey",  # not_in_survey
    "7": "igneous and metamorphic",
    "8": "volcanic",
}

# Digits 3 and 4 only identify the individual mapping unit; they carry
# no climatic or geologic meaning, so no lookup table exists for them.

# Transform these new features into collocations

In [10]:
# Join the words of every climatic-zone label with underscores, in place
for zone_code in list(first_digit):
    first_digit[zone_code] = first_digit[zone_code].replace(" ", "_")

first_digit  # multi-word zone names are now single collocation tokens

{'1': 'lower_montane_dry',
 '2': 'lower_montane',
 '3': 'montane_dry',
 '4': 'montane',
 '5': 'montane_dry_and_montane',
 '6': 'montane_and_subalpine',
 '7': 'subalpine',
 '8': 'alpine'}

In [11]:
# Join the words of every geologic-zone label with underscores, in place.
# NOTE(review): unlike the soil and wilderness strings, these labels are not
# lower-cased, so the label for key "6" keeps its capital letters.
for zone_code in list(second_digit):
    second_digit[zone_code] = second_digit[zone_code].replace(" ", "_")

second_digit

{'1': 'alluvium',
 '2': 'glacial',
 '3': 'shale',
 '4': 'sandstone',
 '5': 'mixed_sedimentary',
 '6': 'unspecified_in_the_USFS_ELU_Survey',
 '7': 'igneous_and_metamorphic',
 '8': 'volcanic'}

In [12]:
# Extend every soil description with two extra tokens derived from its ELU code:
# the climatic zone (first digit) and the geologic zone (second digit).
# The copy is rebuilt from `soil_types` on every run, so the cell is re-runnable.
soil_types_extend = defaultdict(str, soil_types)

for elu_code in soil_types_extend:
    # Index the lookup tables directly instead of using .get(): an unmapped digit
    # then raises a KeyError naming that digit, rather than .get() returning None
    # and the string concatenation failing with an unrelated-looking TypeError.
    climatic = "climatic_zone_" + first_digit[elu_code[0]]
    geologic = "geologic_zone_" + second_digit[elu_code[1]]
    soil_types_extend[elu_code] += " " + climatic + " " + geologic

# Repeat for Wilderness Areas

In [13]:
# Human-readable name for each wilderness-area indicator column
wilderness_areas = {
    'Wilderness_Area1': "Rawah Wilderness Area",
    'Wilderness_Area2': "Neota Wilderness Area",
    'Wilderness_Area3': "Comanche Peak Wilderness Area",
    'Wilderness_Area4': "Cache la Poudre Wilderness Area",
}

In [14]:
# Lower-case each wilderness-area name and join its words with underscores, in place
for area_key in list(wilderness_areas):
    area_name = wilderness_areas[area_key]
    wilderness_areas[area_key] = area_name.lower().replace(" ", "_")

wilderness_areas

{'Wilderness_Area1': 'rawah_wilderness_area',
 'Wilderness_Area2': 'neota_wilderness_area',
 'Wilderness_Area3': 'comanche_peak_wilderness_area',
 'Wilderness_Area4': 'cache_la_poudre_wilderness_area'}

In [15]:
# The text tokens double as DataFrame column names: taking the dictionaries'
# values as lists is what turns the categorical dataset into a text dataset.

soil_cols = [*soil_types_extend.values()]
wilderness_cols = [*wilderness_areas.values()]

In [16]:
# For reference only: the columns that are left out of the text features
most_important_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# Who needs the most important features anyway? 

In [17]:
# Columns 0-9 hold the most important features and are skipped.
# Begin with the wilderness-area indicators (columns 10-13) and keep
# only the last ten rows as a small preview.
wild_test = pd.DataFrame(data=X_res[:, 10:14], columns=wilderness_cols).iloc[-10:]
wild_test

Unnamed: 0,rawah_wilderness_area,neota_wilderness_area,comanche_peak_wilderness_area,cache_la_poudre_wilderness_area
19219,0.0,0.0,1.0,0.0
19220,0.0,0.0,1.0,0.0
19221,0.0,0.0,0.0,1.0
19222,0.0,0.0,1.0,0.0
19223,0.0,0.0,1.0,0.0
19224,0.0,0.0,0.0,1.0
19225,0.0,0.0,1.0,0.0
19226,0.0,0.0,0.0,1.0
19227,0.0,0.0,0.0,1.0
19228,0.0,0.0,0.0,1.0


### Example of how the transformation works
- Replace only the "1" values with the associated collocation

In [18]:
# For each row, the label of the column holding the row maximum (the single 1.0)
print(wild_test.idxmax(axis="columns"))

19219      comanche_peak_wilderness_area
19220      comanche_peak_wilderness_area
19221    cache_la_poudre_wilderness_area
19222      comanche_peak_wilderness_area
19223      comanche_peak_wilderness_area
19224    cache_la_poudre_wilderness_area
19225      comanche_peak_wilderness_area
19226    cache_la_poudre_wilderness_area
19227    cache_la_poudre_wilderness_area
19228    cache_la_poudre_wilderness_area
dtype: object


### Rinse and repeat with the 40 soil types

In [19]:
# Same preview for the soil-type indicators (column 14 onwards), last ten rows
soil_test = pd.DataFrame(data=X_res[:, 14:], columns=soil_cols).iloc[-10:]
soil_test

Unnamed: 0,cathedral_family rock_outcrop_complex extremely_stony climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,vanet ratake_families_complex very_stony climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,haploborolis rock_outcrop_complex rubbly climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,ratake_family rock_outcrop_complex rubbly climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,vanet_family rock_outcrop_complex_complex rubbly climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,vanet wetmore_families rock_outcrop_complex stony climatic_zone_lower_montane geologic_zone_igneous_and_metamorphic,gothic_family climatic_zone_montane_dry geologic_zone_mixed_sedimentary,supervisor limber_families_complex climatic_zone_montane_dry geologic_zone_mixed_sedimentary,troutville_family very_stony climatic_zone_montane geologic_zone_glacial,bullwark catamount_families rock_outcrop_complex rubbly climatic_zone_montane geologic_zone_igneous_and_metamorphic,...,leighcan catamount_families_complex extremely_stony climatic_zone_subalpine geologic_zone_igneous_and_metamorphic,catamount_family rock_outcrop leighcan_family_complex extremely_stony climatic_zone_subalpine geologic_zone_igneous_and_metamorphic,leighcan catamount_families rock_outcrop_complex extremely_stony climatic_zone_subalpine geologic_zone_igneous_and_metamorphic,cryorthents rock_land_complex extremely_stony climatic_zone_subalpine geologic_zone_igneous_and_metamorphic,cryumbrepts rock_outcrop cryaquepts_complex climatic_zone_alpine geologic_zone_igneous_and_metamorphic,bross_family rock_land cryumbrepts_complex extremely_stony climatic_zone_alpine geologic_zone_igneous_and_metamorphic,rock_outcrop cryumbrepts cryorthents_complex extremely_stony climatic_zone_alpine geologic_zone_igneous_and_metamorphic,leighcan moran_families cryaquolls_complex extremely_stony climatic_zone_alpine geologic_zone_igneous_and_metamorphic,moran_family 
cryorthents leighcan_family_complex extremely_stony climatic_zone_alpine geologic_zone_igneous_and_metamorphic,moran_family cryorthents rock_land_complex extremely_stony climatic_zone_alpine geologic_zone_igneous_and_metamorphic
19219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19222,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19224,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# One text "document" per row: wilderness token, a space, then the soil tokens
(
    wild_test.idxmax(axis="columns")
    + " "
    + soil_test.idxmax(axis="columns")
)

19219    comanche_peak_wilderness_area bullwark catamou...
19220    comanche_peak_wilderness_area gateview_family ...
19221    cache_la_poudre_wilderness_area bullwark catam...
19222    comanche_peak_wilderness_area ratake_family ro...
19223    comanche_peak_wilderness_area catamount_family...
19224    cache_la_poudre_wilderness_area vanet wetmore_...
19225    comanche_peak_wilderness_area bullwark catamou...
19226    cache_la_poudre_wilderness_area bullwark catam...
19227    cache_la_poudre_wilderness_area cryaquolis cry...
19228    cache_la_poudre_wilderness_area bullwark catam...
dtype: object

# Putting it all together

In [21]:
# Indicator columns for the whole resampled set, labelled with their text tokens
wild_df = pd.DataFrame(data=X_res[:, 10:14], columns=wilderness_cols)
soil_df = pd.DataFrame(data=X_res[:, 14:], columns=soil_cols)

# Replace each row by the labels of its active wilderness and soil columns
X_wild_soil = (
    wild_df.idxmax(axis="columns")
    + " "
    + soil_df.idxmax(axis="columns")
)

### Example of what one datapoint looks like
- The beautiful 54 features have been reduced to 5. 
- Note that without the additional features I created, there would only be 3 features. 

In [22]:
# Text representation of the row labelled 0
X_wild_soil.loc[0]

'cache_la_poudre_wilderness_area gateview_family cryaquolis_complex climatic_zone_montane_and_subalpine geologic_zone_alluvium'

### From 54 features ---> 5
### We can call this dimensionality reduction, right???

In [23]:
# Labels produced by the random under-sampler, as a Series
targets_res = pd.Series(y_res)

# Pair every text datapoint with its label.
# NOTE(review): the "soil_type" column holds the resampled targets (y_res), not a
# soil type; the name is left untouched because it ends up in the saved CSV.
X_wild_soil_w_targets = pd.DataFrame(
    {
        "wild_soil": X_wild_soil,
        "soil_type": targets_res,
    }
)
X_wild_soil_w_targets.head(5)

Unnamed: 0,soil_type,wild_soil
0,4,cache_la_poudre_wilderness_area gateview_famil...
1,4,cache_la_poudre_wilderness_area gateview_famil...
2,4,cache_la_poudre_wilderness_area gateview_famil...
3,4,cache_la_poudre_wilderness_area gateview_famil...
4,4,cache_la_poudre_wilderness_area gateview_famil...


# Save the new feature matrix for future exploration

In [24]:
# Write the text features and their targets to a CSV file in the working directory
X_wild_soil_w_targets.to_csv(path_or_buf="collocation_df.csv")