In [1]:
from collections import Counter
from pathlib import Path
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set folder paths for outputs

In [2]:
import sys
sys.path.append("../../..")

In [3]:
from utils.auxSamplingStudy import *

In [4]:
from analysis.data.forest.config import DATA, MODELS, REPORTS, idbin, idcat, idnum

# Read the file

In [5]:
# Load the gzip-compressed, header-less data file; with header=None the columns
# receive integer labels.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows and report them).
df = pd.read_csv(DATA / 'forest.data.gz', compression='gzip', header=None, sep=',', quotechar='"', on_bad_lines='warn')

In [6]:
# Preview the first rows; column labels are plain integers because the file was read with header=None.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


# The raw data is one-hot encoded. We reverse the encoding to recover the original categorical features.

### Prepare categorical column 'Wilderness_Area' from encoded columns 10, 11, 12 and 13

In [7]:
# Initialise the decoded column with the sentinel 999; any row whose four wilderness
# flag columns are all different from 1 keeps this value after the decoding below.
df['Wilderness_Area'] = 999

In [8]:
# Boolean row masks, one per one-hot wilderness column (positions 10 to 13):
# True where that column holds a 1.
is_10_1 = df.iloc[:, 10].eq(1)
is_11_1 = df.iloc[:, 11].eq(1)
is_12_1 = df.iloc[:, 12].eq(1)
is_13_1 = df.iloc[:, 13].eq(1)

In [9]:
# Write the level 0..3 into 'Wilderness_Area' for the rows selected by each mask,
# in the same order as the one-hot columns (10 -> 0, 11 -> 1, 12 -> 2, 13 -> 3).
for area_code, area_mask in enumerate((is_10_1, is_11_1, is_12_1, is_13_1)):
    df.loc[area_mask, 'Wilderness_Area'] = area_code

# Prepare categorical column 'Soil_Type' from encoded columns 14 to 53

In [10]:
# Slice out the 40 columns at positions 14..53, which hold the one-hot soil-type flags.
Soil_Type_df = df.iloc[:, 14:54]

In [11]:
def get_level(row):
    """Return the 0-based soil-type level of a single row.

    Walks the column labels of the module-level ``Soil_Type_df`` in order and
    returns ``label - 14`` for the first column whose value in ``row`` is 1.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        One row of the frame, indexable by the soil column labels.

    Returns
    -------
    int or None
        The level of the first flag set to 1, or None when no flag is set.
    """
    for c in Soil_Type_df.columns:
        if row[c] == 1:
            return c - 14
    return None


# Vectorized equivalent of ``df.apply(get_level, axis=1)``, which ran a Python-level
# loop over up to 40 columns for every row. argmax over the boolean flags gives the
# position of the first 1 in each row; this equals ``label - 14`` because the soil
# columns are the integer-labelled columns 14..53.
soil_flags = Soil_Type_df.eq(1)
soil_level = pd.Series(
    soil_flags.to_numpy().argmax(axis=1), index=Soil_Type_df.index, dtype='int64'
)
# Rows without any flag set become missing, matching get_level's None result.
df['Soil_Type'] = soil_level.where(soil_flags.any(axis=1))

In [12]:
# Frequency of each decoded wilderness level; a 999 key would reveal rows with no flag set.
Counter(df['Wilderness_Area'])

Counter({0: 260796, 2: 253364, 3: 36968, 1: 29884})

In [13]:
# Labels of the first ten columns (0..9), which are kept as they are.
columns_selected = list(range(10))

In [14]:
# Append the two decoded categorical columns and the target column (label 54).
columns_selected += ['Wilderness_Area', 'Soil_Type', 54]

In [15]:
# Keep only the selected columns; the one-hot wilderness and soil columns are dropped here.
df = df[columns_selected]

In [16]:
# Give the 13 remaining columns descriptive names, in positional order.
df.columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]


In [17]:
# One-way ANOVA of each decoded feature against the target.
# f_oneway expects one sample per group, so the feature values are split by
# Cover_Type class. Passing the feature column and the target column as the two
# samples would only compare their overall means and would not test association.
# NOTE(review): both features are nominal codes, so a chi-square test on the
# contingency table is the more appropriate check; treat this as a rough screen.
anovap_value = list()
for cols in ['Soil_Type', 'Wilderness_Area']:
    samples = [values.to_numpy() for _, values in df.groupby('Cover_Type')[cols]]
    statistic, p = stats.f_oneway(*samples)
    anovap_value.append(tuple([cols, p]))

print(anovap_value)

[('Soil_Type', 0.0), ('Wilderness_Area', 0.0)]


In [18]:
# Chi-square test of independence between each decoded categorical feature and
# the target, computed on their contingency table; collects (feature, p-value) pairs.
from scipy.stats import chi2_contingency

chi2p_value = []
for cols in ['Soil_Type', 'Wilderness_Area']:
    obs = pd.crosstab(index=df[cols], columns=df['Cover_Type'])
    g, p, dof, expctd = chi2_contingency(obs)
    chi2p_value.append((cols, p))

print(chi2p_value)

[('Soil_Type', 0.0), ('Wilderness_Area', 0.0)]


In [19]:
# Preview the frame after decoding the one-hot columns and renaming the columns.
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,0,28,5
1,2590,56,2,212,-6,390,220,235,151,6225,0,28,5
2,2804,139,9,268,65,3180,234,238,135,6121,0,11,2
3,2785,155,18,242,118,3090,238,238,122,6211,0,29,2
4,2595,45,2,153,-1,391,220,234,150,6172,0,28,5


In [20]:
# Remove exact duplicate rows, rebinding the name instead of mutating in place.
df = df.drop_duplicates()

In [21]:
# (rows, columns) after removing duplicate rows.
df.shape

(581012, 13)

In [22]:
# Count missing values per column.
df.isna().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area                       0
Soil_Type                             0
Cover_Type                            0
dtype: int64

In [23]:
# Class frequencies of the original multi-class target.
Counter(df['Cover_Type'])

Counter({5: 9493, 2: 283301, 1: 211840, 7: 20510, 3: 35754, 6: 17367, 4: 2747})

### The dataset has 7 target labels; labels 2 and 6 were randomly chosen from them (since this algorithm is built for binary classification)

In [24]:
# Keep only cover types 2 and 6 and recode them as a binary target (2 -> 0, 6 -> 1).
# .copy() makes the filtered frame independent of the frame it was selected from.
# The recoded column is assigned back explicitly: calling replace(..., inplace=True)
# on the selected column is a chained assignment, which leaves df unchanged when
# pandas Copy-on-Write is active.
df = df.loc[df['Cover_Type'].isin([2, 6])].copy()
df['Cover_Type'] = df['Cover_Type'].replace({2: 0, 6: 1})

In [25]:
# Number of distinct soil-type levels left in the two-class subset (missing values counted too).
df['Soil_Type'].nunique(dropna=False)

39

In [26]:
# Class frequencies of the recoded binary target.
Counter(df['Cover_Type'])

Counter({0: 283301, 1: 17367})

In [27]:
# Separate the predictors from the target and give both frames a fresh 0..n-1 index.
# The axis is named explicitly (columns=...): the positional form drop(label, 1)
# was removed in pandas 2.0.
df_feature = df.drop(columns='Cover_Type')
df_target = df[['Cover_Type']].rename(columns={'Cover_Type': 'target'})
df_feature.index = pd.RangeIndex(len(df_feature.index))
df_target.index = pd.RangeIndex(len(df_target.index))

In [28]:
# Feature names, in column order.
col_list = df_feature.columns.tolist()

In [29]:
# (rows, columns) of the feature frame before down-sampling.
df_feature.shape

(300668, 12)

In [30]:
# Fraction of rows to set aside so that 2000 rows remain in the working sample.
# Depends on the current size of df_feature, so it must run before the sampling split.
test_size = 1 - 2000 / len(df_feature)

In [31]:
# Share of positive (target == 1) rows in the full two-class data.
df_target.sum() / len(df_target)

target    0.057761
dtype: float64

In [32]:
# Down-sample the data: the "train" side of this split (2000 rows) replaces
# df_feature / df_target as the working sample, and the remainder goes to aux1 / aux2.
# The split is not stratified, so the positive rate of the sample may differ from
# that of the full data. random_state fixes the draw.
df_feature, aux1, df_target, aux2 = train_test_split(df_feature, df_target, test_size=test_size, random_state=12)

In [33]:
# (rows, columns) of the down-sampled feature frame.
df_feature.shape


(2000, 12)

In [34]:
# Number of positive (target == 1) rows in the sample.
df_target.sum()

target    130
dtype: int64

In [35]:
# Share of positive (target == 1) rows in the sample.
df_target.sum() / len(df_target)


target    0.065
dtype: float64

In [36]:
# NOTE(review): presumably a reference value (1 in 16) to compare with the sampled
# positive rate shown above — confirm the intent or remove this scratch cell.
1/16

0.0625

In [37]:
# Split the down-sampled data into train and test sets. Split ratio = 75:25.
# random_state fixes the shuffle so the split is reproducible; the split is not stratified.

X_train, X_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.25, random_state=12)

In [38]:
# Record the feature names and renumber every split 0..n-1, discarding the
# shuffled row labels left over from the split.
col_list = X_train.columns.tolist()
for split_part in (X_train, y_train, X_test, y_test):
    split_part.index = pd.RangeIndex(len(split_part))

In [39]:
## Standardise the features so that the euclidean distance calculation in SMOTE is not biased.
## The scaler is fitted on the training split only and then applied to both splits.

# idnum comes from the project config; presumably the positions of the numeric columns — confirm.
name_columns = X_train.columns[idnum]

sc = StandardScaler()
sc.fit(X_train[name_columns])
X_train[name_columns] = sc.transform(X_train[name_columns])
X_test[name_columns] = sc.transform(X_test[name_columns])

In [40]:
# Write the four splits as CSV files into the DATA folder, without the index column.
for split_frame, split_filename in (
    (X_train, "forest_X_train.csv"),
    (y_train, "forest_y_train.csv"),
    (X_test, "forest_X_test.csv"),
    (y_test, "forest_y_test.csv"),
):
    split_frame.to_csv(DATA / split_filename, index=False)