In [None]:
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# Local CSV files written by this notebook (paths relative to the notebook).
FULL_DATASET = '../covertype.csv'
SMALL_DATASET = '../covertype_small.csv'
TRAINING_DATASET = '../covertype_training.csv'
TRAINING_DATASET_WITH_MISSING = '../covertype_training_missing.csv'
EVALUATION_DATASET = '../covertype_evaluation.csv'
EVALUATION_DATASET_WITH_ANOMALIES = '../covertype_evaluation_anomalies.csv'
SERVING_DATASET = '../covertype_serving.csv'

# Raw input file that everything else is derived from.
ORIGINAL_DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'

In [None]:
# The raw file has no header row, so pandas labels the columns 0, 1, 2, ...
df = pd.read_csv(filepath_or_buffer=ORIGINAL_DATASET_PATH, header=None)
print(df.shape)
df.head(n=5)

In [None]:
# Soil type lookup stored as a flat list of triplets:
#   position 3*i     -> soil type number, as a string ("1" .. "40")
#   position 3*i + 1 -> soil code ("C2702", ...)
#   position 3*i + 2 -> free-text description
# The notebook reads the codes with the slice soil_type[1::3].
soil_type = [
"1", "C2702", "Cathedral family - Rock outcrop complex, extremely stony.",
"2", "C2703", "Vanet - Ratake families complex, very stony.",
"3", "C2704", "Haploborolis - Rock outcrop complex, rubbly.",
"4", "C2705", "Ratake family - Rock outcrop complex, rubbly.",
"5", "C2706", "Vanet family - Rock outcrop complex complex, rubbly.",
"6", "C2717", "Vanet - Wetmore families - Rock outcrop complex, stony.",
"7", "C3501", "Gothic family.",
"8", "C3502", "Supervisor - Limber families complex.",
"9", "C4201", "Troutville family, very stony.",
"10", "C4703", "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
"11", "C4704", "Bullwark - Catamount families - Rock land complex, rubbly.",
"12", "C4744", "Legault family - Rock land complex, stony.",
"13", "C4758", "Catamount family - Rock land - Bullwark family complex, rubbly.",
"14", "C5101", "Pachic Argiborolis - Aquolis complex.",
"15", "C5151", "unspecified in the USFS Soil and ELU Survey.",
"16", "C6101", "Cryaquolis - Cryoborolis complex.",
"17", "C6102", "Gateview family - Cryaquolis complex.",
"18", "C6731", "Rogert family, very stony.",
"19", "C7101", "Typic Cryaquolis - Borohemists complex.",
"20", "C7102", "Typic Cryaquepts - Typic Cryaquolls complex.",
"21", "C7103", "Typic Cryaquolls - Leighcan family, till substratum complex.",
"22", "C7201", "Leighcan family, till substratum, extremely bouldery.",
"23", "C7202", "Leighcan family, till substratum - Typic Cryaquolls complex.",
"24", "C7700", "Leighcan family, extremely stony.",
"25", "C7701", "Leighcan family, warm, extremely stony.",
"26", "C7702", "Granile - Catamount families complex, very stony.",
"27", "C7709", "Leighcan family, warm - Rock outcrop complex, extremely stony.",
"28", "C7710", "Leighcan family - Rock outcrop complex, extremely stony.",
"29", "C7745", "Como - Legault families complex, extremely stony.",
"30", "C7746", "Como family - Rock land - Legault family complex, extremely stony.",
"31", "C7755", "Leighcan - Catamount families complex, extremely stony.",
"32", "C7756", "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
"33", "C7757", "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
"34", "C7790", "Cryorthents - Rock land complex, extremely stony.",
"35", "C8703", "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
"36", "C8707", "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
"37", "C8708", "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
"38", "C8771", "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
"39", "C8772", "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
"40", "C8776", "Moran family - Cryorthents - Rock land complex, extremely stony.",
]

# Wilderness area lookup stored as a flat list of pairs:
#   position 2*i     -> short label
#   position 2*i + 1 -> full name
# The notebook reads the short labels with the slice wilderness_area[0::2].
# NOTE(review): the short label is spelled "Commanche" while the full name
# says "Comanche". The short label is what ends up in the Wilderness_Area
# column, so correcting the spelling would change the generated datasets.
wilderness_area = [
"Rawah", "Rawah Wilderness Area",
"Neota", "Neota Wilderness Area",
"Commanche", "Comanche Peak Wilderness Area",
"Cache", "Cache la Poudre Wilderness Area"
]

In [None]:
# Collapse the soil indicator columns (raw columns 14..53) into one column
# holding the soil code of the first non-zero indicator in each row.
# soil_type is a flat list of (number, code, description) triplets, so the
# codes sit at every third position starting from index 1.
soil_codes = np.array(soil_type[1::3], dtype=object)
soil_flags = df.loc[:, 14:53].to_numpy() != 0

# A row without any indicator set has no soil code. Fail loudly here, because
# argmax below would otherwise silently report the first column for such a row.
if not soil_flags.any(axis=1).all():
    raise ValueError('Found rows with no soil type indicator set')

# argmax over booleans yields the position of the first True in each row; this
# replaces a Python-level lambda that was called once per row.
soil = pd.Series(soil_codes[soil_flags.argmax(axis=1)], index=df.index)
soil

In [None]:
# Collapse the wilderness indicator columns (raw columns 10..13) into one
# column holding the short label of the first non-zero indicator in each row.
# wilderness_area is a flat list of (short label, full name) pairs, so the
# short labels sit at the even positions.
wilderness_labels = np.array(wilderness_area[0::2], dtype=object)
wilderness_flags = df.loc[:, 10:13].to_numpy() != 0

# A row without any indicator set has no label. Fail loudly here, because
# argmax below would otherwise silently report the first column for such a row.
if not wilderness_flags.any(axis=1).all():
    raise ValueError('Found rows with no wilderness area indicator set')

# argmax over booleans yields the position of the first True in each row; this
# replaces a Python-level lambda that was called once per row.
wilderness = pd.Series(
    wilderness_labels[wilderness_flags.argmax(axis=1)], index=df.index
)
wilderness

In [None]:
# Column layout of the reshaped dataset: raw columns 0..9, then the two
# collapsed categorical columns, then raw column 54.
COLUMN_NAMES = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# ignore_index=True discards the incoming column labels; the names are
# assigned right after, in the same order as the pieces below.
df_full = pd.concat(
    [
        df.loc[:, 0:9],
        wilderness,
        soil,
        df.loc[:, 54],
    ],
    axis='columns',
    ignore_index=True,
)
df_full.columns = COLUMN_NAMES
df_full

In [None]:

# Shift the labels down by one so they start at 0 (presumably the raw file
# numbers the cover types from 1 — confirm against the dataset description).
# Guarded so that re-running this cell does not shift the labels a second time.
if df_full['Cover_Type'].min() >= 1:
    df_full['Cover_Type'] = df_full['Cover_Type'] - 1

In [None]:
# Persist the reshaped dataset with a header row and without the index column.
df_full.to_csv(path_or_buf=FULL_DATASET, index=False, header=True)

In [None]:

!head $FULL_DATASET

In [None]:
# Reload the dataset from the CSV written above, keeping Soil_Type as strings.
df_full = pd.read_csv(FULL_DATASET, dtype={'Soil_Type': object})
# NOTE(review): `df` is rebound to the same frame, so the raw frame loaded at
# the top of the notebook is no longer reachable under that name and the
# earlier cells that slice `df` by integer column label cannot be re-run
# without reloading it. Consider dropping this alias.
df = df_full
df_full

In [None]:
# Number of rows per soil code, most frequent first.
df_full['Soil_Type'].value_counts()

In [None]:
# Split the data into the rows with soil code C5151 and all remaining rows.
df_5151 = df_full.loc[df_full['Soil_Type'].eq('C5151')]
df_no_5151 = df_full.loc[df_full['Soil_Type'].ne('C5151')]

In [None]:
# Inspect the rows whose Soil_Type is 'C5151'.
df_5151

In [None]:
# Inspect the rows whose Soil_Type is anything other than 'C5151'.
df_no_5151

In [None]:
# Draw a 100,000-row sample that keeps the Cover_Type class proportions.
# random_state is fixed so that Restart & Run All reproduces the same sample.
df_small, df_other = train_test_split(
    df_no_5151,
    train_size=100000,
    stratify=df_no_5151.Cover_Type,
    random_state=42,
)

In [None]:
# Split into training / evaluation / serving sets, stratified on Cover_Type.
# random_state is fixed on both splits so the three files are reproducible.
# NOTE(review): 431009 presumably equals len(df_no_5151) - 150000 — confirm.
df_train, df_other = train_test_split(
    df_no_5151,
    train_size=431009,
    stratify=df_no_5151.Cover_Type,
    random_state=42,
)
df_evaluate, df_serving = train_test_split(
    df_other,
    train_size=75000,
    stratify=df_other.Cover_Type,
    random_state=42,
)
# The serving set is written without the label column.
df_serving = df_serving.drop(columns=['Cover_Type'])
print(df_train.shape)
print(df_evaluate.shape)
print(df_serving.shape)

In [None]:
# Training-set variant with missing values injected into one feature.
# reset_index returns a new frame, so df_train itself is left untouched.
df_train_missing = df_train.reset_index(drop=True)

# Cast to float before blanking: an integer column cannot hold NaN, and
# relying on pandas to upcast implicitly during assignment is deprecated.
df_train_missing['Horizontal_Distance_To_Hydrology'] = (
    df_train_missing['Horizontal_Distance_To_Hydrology'].astype('float64')
)
# .loc slices are label-inclusive: this blanks the first 9000 rows.
df_train_missing.loc[0:8999, 'Horizontal_Distance_To_Hydrology'] = np.nan
df_train_missing

In [None]:
# Evaluation-set variant with injected anomalies.
# reset_index returns a new frame, so df_evaluate itself is left untouched.
df_evaluate_anomalies = df_evaluate.reset_index(drop=True)
# Label-inclusive slice: overwrites Slope in the first five rows.
df_evaluate_anomalies.loc[:4, 'Slope'] = 110
# Append the C5151 rows, which were excluded from every split above.
df_evaluate_anomalies = pd.concat(objs=[df_evaluate_anomalies, df_5151], axis=0)
df_evaluate_anomalies

In [None]:
# Number of rows per soil code in the anomalous evaluation set.
df_evaluate_anomalies['Soil_Type'].value_counts()

In [None]:
# Write every derived frame to its CSV file, with a header row and no index.
for frame, path in (
    (df_train, TRAINING_DATASET),
    (df_small, SMALL_DATASET),
    (df_train_missing, TRAINING_DATASET_WITH_MISSING),
    (df_evaluate, EVALUATION_DATASET),
    (df_evaluate_anomalies, EVALUATION_DATASET_WITH_ANOMALIES),
    (df_serving, SERVING_DATASET),
):
    frame.to_csv(path, header=True, index=False)

In [None]:
!gsutil cp $FULL_DATASET gs://workshop-datasets/covertype/full/dataset.csv
!gsutil cp $SMALL_DATASET gs://workshop-datasets/covertype/small/dataset.csv
!gsutil cp $TRAINING_DATASET gs://workshop-datasets/covertype/training/dataset.csv
!gsutil cp $TRAINING_DATASET_WITH_MISSING gs://workshop-datasets/covertype/training_missing/dataset.csv
!gsutil cp $EVALUATION_DATASET gs://workshop-datasets/covertype/evaluation/dataset.csv
!gsutil cp $EVALUATION_DATASET_WITH_ANOMALIES gs://workshop-datasets/covertype/evaluation_anomalies/dataset.csv
!gsutil cp $SERVING_DATASET gs://workshop-datasets/covertype/serving/dataset.csv

In [None]:
import os
import requests
# Directory of the raw data files
_data_root = './data/covertype'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv'

In [None]:
# Download the training CSV unless a local copy already exists.
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    # (presumably the upstream source of this data — confirm)
    # NOTE(review): the string is never formatted, so `{{VALUE}}` is sent
    # literally as the confirm token — confirm this is intended.
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    # Write to a temporary name and rename at the end, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_path = _data_filepath + '.part'
    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Fail before writing anything so an HTTP error page is not saved as data.
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            # Stream in 1 MiB chunks instead of holding the whole body in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_path, _data_filepath)