In [4]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgbimport
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# Load Data

In [5]:
df10_ha = pd.read_csv('Export_Raw10_ha.csv')
df10_ha.head()

Unnamed: 0,OBJECTID,SRC_AGENCY,FIRE_ID,FIRENAME,LATITUDE,LONGITUDE,YEAR,MONTH,DAY,REP_DATE,...,slope2,aspect2,LULC2,NDVI2,pcp2,soilmois2,DEM_masked,numeric_Di,numeric__1,DistMain_r
0,1,BC,1977-GI0012,,55.948,-121.928,1977,4,24,4/24/1977,...,0.532337,132.274,4,0.460273,528.667,48.2479,728.0,12.3392,20.0886,5.53803
1,2,BC,1977-GI0013,,55.789,-121.941,1977,4,24,4/24/1977,...,3.747,203.629,4,0.447064,580.974,52.1207,703.0,20.5101,8.59824,18.8258
2,3,BC,1978-RD0003,,53.684,-125.743,1978,6,5,6/5/1978,...,1.6486,343.926,5,0.443464,492.436,59.6288,927.0,0.0,47.3237,2.82843
3,4,BC,1978-G90010,,53.09,-124.456,1978,5,18,5/18/1978,...,0.243532,17.1027,11,0.478671,414.282,39.3045,985.0,32.562,104.125,50.6782
4,5,BC,1978-K80016,,50.811,-122.106,1978,7,23,7/23/1978,...,6.9081,32.3578,4,0.56584,541.718,40.1938,558.0,153.592,22.1409,12.8917


In [7]:
df10_ha.columns

Index(['OBJECTID', 'SRC_AGENCY', 'FIRE_ID', 'FIRENAME', 'LATITUDE',
       'LONGITUDE', 'YEAR', 'MONTH', 'DAY', 'REP_DATE', 'ATTK_DATE',
       'OUT_DATE', 'DECADE', 'SIZE_HA', 'CAUSE', 'PROTZONE', 'FIRE_TYPE',
       'MORE_INFO', 'CFS_REF_ID', 'CFS_NOTE1', 'CFS_NOTE2', 'ACQ_DATE',
       'SRC_AGY2', 'ECOZONE', 'ECOZ_REF', 'ECOZ_NAME', 'ECOZ_NOM', 'TWI2',
       'temp_july2', 'slope2', 'aspect2', 'LULC2', 'NDVI2', 'pcp2',
       'soilmois2', 'DEM_masked', 'numeric_Di', 'numeric__1', 'DistMain_r'],
      dtype='object')

In [10]:
df10_ha.shape

(1720, 39)

In [11]:
# List of columns to drop (corrected for exact names)
col_to_drop = [
    'OBJECTID', 'SRC_AGENCY', 'FIRE_ID', 'FIRENAME', 'DAY', 'REP_DATE', 'ATTK_DATE', 
    'OUT_DATE', 'DECADE', 'CAUSE', 'PROTZONE', 'FIRE_TYPE', 'MORE_INFO', 'CFS_REF_ID', 
    'CFS_NOTE1', 'CFS_NOTE2', 'ACQ_DATE', 'SRC_AGY2', 'ECOZONE', 
    'ECOZ_REF', 'ECOZ_NAME', 'ECOZ_NOM'
]

# Drop the columns
df10_ha = df10_ha.drop(columns=col_to_drop, axis=1)

In [12]:
df10_ha.head()

Unnamed: 0,LATITUDE,LONGITUDE,YEAR,MONTH,SIZE_HA,TWI2,temp_july2,slope2,aspect2,LULC2,NDVI2,pcp2,soilmois2,DEM_masked,numeric_Di,numeric__1,DistMain_r
0,55.948,-121.928,1977,4,10.0,11.3633,22.8,0.532337,132.274,4,0.460273,528.667,48.2479,728.0,12.3392,20.0886,5.53803
1,55.789,-121.941,1977,4,10.0,9.41046,22.3,3.747,203.629,4,0.447064,580.974,52.1207,703.0,20.5101,8.59824,18.8258
2,53.684,-125.743,1978,6,10.0,11.8421,21.3,1.6486,343.926,5,0.443464,492.436,59.6288,927.0,0.0,47.3237,2.82843
3,53.09,-124.456,1978,5,10.0,14.9786,23.5,0.243532,17.1027,11,0.478671,414.282,39.3045,985.0,32.562,104.125,50.6782
4,50.811,-122.106,1978,7,10.0,12.5565,21.7,6.9081,32.3578,4,0.56584,541.718,40.1938,558.0,153.592,22.1409,12.8917


In [13]:
# Display rows where the 'CAUSE' column is null
null_SIZE_rows = df10_ha[df10_ha['SIZE_HA'].isnull()]
null_SIZE_rows


Unnamed: 0,LATITUDE,LONGITUDE,YEAR,MONTH,SIZE_HA,TWI2,temp_july2,slope2,aspect2,LULC2,NDVI2,pcp2,soilmois2,DEM_masked,numeric_Di,numeric__1,DistMain_r


In [14]:
df10_ha[df10_ha.isnull().any(axis= 1)]

Unnamed: 0,LATITUDE,LONGITUDE,YEAR,MONTH,SIZE_HA,TWI2,temp_july2,slope2,aspect2,LULC2,NDVI2,pcp2,soilmois2,DEM_masked,numeric_Di,numeric__1,DistMain_r


In [15]:
null_LULC_rows = df10_ha[df10_ha['LULC2'].isnull()]
null_LULC_rows

Unnamed: 0,LATITUDE,LONGITUDE,YEAR,MONTH,SIZE_HA,TWI2,temp_july2,slope2,aspect2,LULC2,NDVI2,pcp2,soilmois2,DEM_masked,numeric_Di,numeric__1,DistMain_r


In [17]:
df10_ha['LULC2'].value_counts()

4     1146
5      173
2      112
11     103
6       77
8       57
3       45
0        5
9        2
Name: LULC2, dtype: int64

In [20]:
df10_ha.shape

(1720, 17)

In [21]:
df10_ha['LULC2'].value_counts()

4     1146
5      173
2      112
11     103
6       77
8       57
3       45
0        5
9        2
Name: LULC2, dtype: int64

In [22]:
df10_ha['LULC2'] = df10_ha['LULC2'].replace(11, 5)
df10_ha['LULC2'] = df10_ha['LULC2'].replace(0, 3)

In [23]:
replace_values_LULC = {
    1: 'Urban_areas',
    2: 'Crop_land',
    3: 'Grass_land',
    4: 'Tree_covered',
    5: 'Shrub_covered',
    6: 'Herbaceous',
    8: 'Sparse_vegetation',
    9: 'Bare_soil',
    10: 'Snow',
    11: 'Water_bodies'
}

df10_ha['LULC2'] = df10_ha['LULC2'].replace(replace_values_LULC)

In [24]:
df10_ha['LULC2'].value_counts()

Tree_covered         1146
Shrub_covered         276
Crop_land             112
Herbaceous             77
Sparse_vegetation      57
Grass_land             50
Bare_soil               2
Name: LULC2, dtype: int64

In [25]:
# List of new column names
new_col_name = [
    'Latitude', 'Longitude', 'Year', 'Month', 'Size_ha', 'TWI', 'Temp_july',
    'Slope', 'Aspect', 'LULC', 'NDVI', 'Precipitation', 'SoilMoisture',
    'Elevation', 'Dist_Lakes', 'Dist_Roads', 'Dist_Rivers'
]

# Assign the new column names
df10_ha.columns = new_col_name

# Print current column names to verify they match the keys in data_types
print(df10_ha.columns)

# Dictionary mapping columns to their new data types
data_types = {
    'Latitude': float,
    'Longitude': float,
    'Year': int,
    'Month': int,
    'Size_ha': float,
    'TWI': float,
    'Temp_july': float,
    'Slope': float,
    'Aspect': float,
    'LULC': 'category',
    'NDVI': float,
    'Precipitation': float,
    'SoilMoisture': float,
    'Elevation': float,
    'Dist_Lakes': float,
    'Dist_Roads': float,
    'Dist_Rivers': float
}

# Change the data types of the columns
df10_ha = df10_ha.astype(data_types)

# Display the first few rows to confirm changes
df10_ha.head()


Index(['Latitude', 'Longitude', 'Year', 'Month', 'Size_ha', 'TWI', 'Temp_july',
       'Slope', 'Aspect', 'LULC', 'NDVI', 'Precipitation', 'SoilMoisture',
       'Elevation', 'Dist_Lakes', 'Dist_Roads', 'Dist_Rivers'],
      dtype='object')


Unnamed: 0,Latitude,Longitude,Year,Month,Size_ha,TWI,Temp_july,Slope,Aspect,LULC,NDVI,Precipitation,SoilMoisture,Elevation,Dist_Lakes,Dist_Roads,Dist_Rivers
0,55.948,-121.928,1977,4,10.0,11.3633,22.8,0.532337,132.274,Tree_covered,0.460273,528.667,48.2479,728.0,12.3392,20.0886,5.53803
1,55.789,-121.941,1977,4,10.0,9.41046,22.3,3.747,203.629,Tree_covered,0.447064,580.974,52.1207,703.0,20.5101,8.59824,18.8258
2,53.684,-125.743,1978,6,10.0,11.8421,21.3,1.6486,343.926,Shrub_covered,0.443464,492.436,59.6288,927.0,0.0,47.3237,2.82843
3,53.09,-124.456,1978,5,10.0,14.9786,23.5,0.243532,17.1027,Shrub_covered,0.478671,414.282,39.3045,985.0,32.562,104.125,50.6782
4,50.811,-122.106,1978,7,10.0,12.5565,21.7,6.9081,32.3578,Tree_covered,0.56584,541.718,40.1938,558.0,153.592,22.1409,12.8917


## Multi-collineariy using VIF

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

def calculate_vif(df, exclude_columns=None):
    # Set default for exclude_columns if not provided
    if exclude_columns is None:
        exclude_columns = []

    # Always exclude 'Wildfire' in addition to other specified columns
    always_exclude = ['Wildfire', 'Year', 'Latitude', 'Longitude', 'Month', 'Size_ha']
    all_exclusions = set(exclude_columns).union(always_exclude)

    # Drop excluded columns
    df_filtered = df.drop(columns=all_exclusions, errors='ignore')  # 'errors=ignore' prevents errors if the column doesn't exist

    # Convert categorical columns to numerical codes
    df_filtered = df_filtered.apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)

    # Calculate VIF for each variable that remains
    vifs = [variance_inflation_factor(df_filtered.values, i) for i in range(df_filtered.shape[1])]
    tolerance = [1/vif for vif in vifs]

    # Prepare the VIF DataFrame
    vif_data = pd.DataFrame({
        'VIF': vifs,
        'Tolerance': tolerance
    }, index=df_filtered.columns)

    return vif_data

# Specify additional columns to exclude as needed
additional_excludes = []

# Calculate VIF excluding specified columns
vif_data = calculate_vif(df10_ha, exclude_columns=additional_excludes)

print(vif_data)


                     VIF  Tolerance
TWI            46.267719   0.021613
Temp_july      76.231710   0.013118
Slope           2.544621   0.392986
Aspect          3.969791   0.251902
LULC           12.846497   0.077842
NDVI           27.370094   0.036536
Precipitation  10.432576   0.095854
SoilMoisture   10.013582   0.099864
Elevation       4.505514   0.221950
Dist_Lakes      2.631608   0.379996
Dist_Roads      2.388849   0.418612
Dist_Rivers     2.659778   0.375971


In [28]:
df10_ha.head()

Unnamed: 0,Latitude,Longitude,Year,Month,Size_ha,TWI,Temp_july,Slope,Aspect,LULC,NDVI,Precipitation,SoilMoisture,Elevation,Dist_Lakes,Dist_Roads,Dist_Rivers
0,55.948,-121.928,1977,4,10.0,11.3633,22.8,0.532337,132.274,Tree_covered,0.460273,528.667,48.2479,728.0,12.3392,20.0886,5.53803
1,55.789,-121.941,1977,4,10.0,9.41046,22.3,3.747,203.629,Tree_covered,0.447064,580.974,52.1207,703.0,20.5101,8.59824,18.8258
2,53.684,-125.743,1978,6,10.0,11.8421,21.3,1.6486,343.926,Shrub_covered,0.443464,492.436,59.6288,927.0,0.0,47.3237,2.82843
3,53.09,-124.456,1978,5,10.0,14.9786,23.5,0.243532,17.1027,Shrub_covered,0.478671,414.282,39.3045,985.0,32.562,104.125,50.6782
4,50.811,-122.106,1978,7,10.0,12.5565,21.7,6.9081,32.3578,Tree_covered,0.56584,541.718,40.1938,558.0,153.592,22.1409,12.8917
