In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

fires = pd.read_csv("wildfires.csv").drop(["_id", "OBJECTID", "APN (parcel)", "x", "y"], axis=1).query("County == 'San Diego'")
fires["Incident Start Date"] = pd.to_datetime(fires["Incident Start Date"])
fires["Month"] = fires["Incident Start Date"].dt.month
fires["* City"] = fires["* City"].fillna("Non-city")

cat_cols = ["* Street Type (e.g. road, drive, lane, etc.)", "* City", "County", "* Structure Type",
            "Structure Category", "* Roof Construction", "* Eaves", "* Vent Screen", "* Exterior Siding",
            "* Window Pane", "* Deck/Porch On Grade", "* Deck/Porch Elevated",
            "* Patio Cover/Carport Attached to Structure", "* Fence Attached to Structure"]
num_cols = ["Year Built (parcel)", "Latitude", "Longitude", "Month"]

preproc = ColumnTransformer(
    transformers = [
        ("categorical", OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ("numerical", "passthrough", num_cols)
    ],
    remainder="drop"
)

data = fires[cat_cols + num_cols + ["* Damage"]].dropna()
# X_train, X_test, y_train, y_test = (
#     train_test_split(data[cat_cols + num_cols], data["* Damage"], test_size=0.25, random_state=1)
# )
# processing = Pipeline([("preproc", preproc), ("tree", RandomForestClassifier())])
# processing.fit(X_train, y_train)
# processing.score(X_test, y_test) # 0.8759124087591241

  fires = pd.read_csv("wildfires.csv").drop(["_id", "OBJECTID", "APN (parcel)", "x", "y"], axis=1).query("County == 'San Diego'")
  fires["Incident Start Date"] = pd.to_datetime(fires["Incident Start Date"])


In [44]:
cols = []
curr = []
for c in cat_cols:
    curr = list(np.unique(data[c]))
    if "Unknown" in curr:
        curr[curr.index("Unknown")] = f"Unknown {c}"
    cols += curr[1:]
    curr = []
cols += num_cols
vars = pd.DataFrame(preproc.fit_transform(data).toarray()).rename(columns={x:cols[x] for x in range(69)})

In [39]:
vars

Unnamed: 0,Drive,Lane,Other,Place,Road,Trail,Way,Alpine,Apline,Dulzura,Jamul,Non-city,Potrero,Tecate,Infrastructure,Mobile Home Double Wide,Mobile Home Single Wide,Mobile Home Triple Wide,Motor Home,Multi Family Residence Single Story,School,Single Family Residence Multi Story,Single Family Residence Single Story,Utility Misc Structure,Multiple Residence,Nonresidential Commercial,Other Minor Structure,Single Residence,Concrete,Metal,Other.1,Tile,Unknown * Roof Construction,Wood,No Eaves,Unenclosed,Unknown * Eaves,"Mesh Screen > 1/8""""",No Vents,Unknown * Vent Screen,Unscreened,Ignition Resistant,Metal.1,Other.2,Stucco Brick Cement,Unknown * Exterior Siding,Vinyl,Wood.1,No Windows,Single Pane,Unknown * Window Pane,Masonry/Concrete,No Deck/Porch,Unknown * Deck/Porch On Grade,Wood.2,Masonry/Concrete.1,No Deck/Porch.1,Unknown * Deck/Porch Elevated,Wood.3,No Patio Cover/Carport,Non Combustible,Unknown * Patio Cover/Carport Attached to Structure,No Fence,Non Combustible.1,Unknown * Fence Attached to Structure,Year Built (parcel),Latitude,Longitude,Month
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.757157,-116.702696,9.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,32.757366,-116.703405,9.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.756605,-116.704037,9.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,32.757064,-116.703795,9.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.757078,-116.698577,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,32.613359,-116.695157,8.0
543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,32.622430,-116.690357,8.0
544,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1987.0,32.639811,-116.632883,8.0
545,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1988.0,32.637856,-116.635110,8.0


In [54]:
X_train, X_test, y_train, y_test = (
    train_test_split(vars, data["* Damage"], test_size=0.25, random_state=1)
)
single = pd.DataFrame([RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]).T.rename(index={0: "Control"})
single.index.names = ['Dropped']
single.columns.names = ['Test No.']
for c in cols:
    X_train, X_test, y_train, y_test = (
        train_test_split(vars[[c]], data["* Damage"], test_size=0.25, random_state=1)
    )
    single.loc[c] = [RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]

X_train, X_test, y_train, y_test = (
    train_test_split(vars, data["* Damage"], test_size=0.25, random_state=1)
)
dropped = pd.DataFrame([RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]).T.rename(index={0: "Control"})
dropped.index.names = ['Dropped']
dropped.columns.names = ['Test No.']
for c in cols:
    X_train, X_test, y_train, y_test = (
        train_test_split(vars.drop(c, axis=1), data["* Damage"], test_size=0.25, random_state=1)
    )
    dropped.loc[c] = [RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]

X_train, X_test, y_train, y_test = (
    train_test_split(vars, data["* Damage"], test_size=0.25, random_state=1)
)
two = pd.DataFrame([RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]).T.rename(index={0: "Control"})
two.index.names = ['Dropped']
two.columns.names = ['Test No.']
for c in cols:
    for d in cols:
        X_train, X_test, y_train, y_test = (
            train_test_split(vars[[c, d]], data["* Damage"], test_size=0.25, random_state=1)
        )
        two.loc[f"{c} & {d}"] = [RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]

In [None]:
X_train, X_test, y_train, y_test = (
    train_test_split(vars, data["* Damage"], test_size=0.25, random_state=1)
)
two = pd.DataFrame([RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]).T.rename(index={0: "Control"})
two.index.names = ['Dropped']
two.columns.names = ['Test No.']
for a in cols:
    for b in cols:
        for c in cols:
            for d in cols:
                for e in cols:
                    X_train, X_test, y_train, y_test = (
                        train_test_split(vars[[a, b, c, d, e]], data["* Damage"], test_size=0.25, random_state=1)
                    )
                    two.loc[f"{a}, {b}, {c}, {d}, {e}"] = [RandomForestClassifier().fit(X_train, y_train).score(X_test, y_test) for x in range(5)]

In [61]:
pd.options.display.max_rows = 1000
dropped.mean(axis=1).sort_values(ascending=True)

Dropped
Longitude                                              0.843796
Latitude                                               0.865693
Unknown * Roof Construction                            0.867153
Place                                                  0.874453
Metal                                                  0.875912
Month                                                  0.875912
Wood                                                   0.878832
Unknown * Patio Cover/Carport Attached to Structure    0.878832
Multi Family Residence Single Story                    0.878832
School                                                 0.880292
Unknown * Deck/Porch On Grade                          0.880292
Tecate                                                 0.881752
No Eaves                                               0.881752
Tile                                                   0.881752
Other                                                  0.883212
Unknown * Eaves                 

In [62]:
pd.options.display.max_rows = 1000
single.mean(axis=1).sort_values(ascending=False)

Dropped
Control                                                0.887591
Unknown * Exterior Siding                              0.722628
Month                                                  0.722628
Unknown * Roof Construction                            0.700730
Ignition Resistant                                     0.700730
Longitude                                              0.700730
Alpine                                                 0.671533
No Patio Cover/Carport                                 0.664234
Unknown * Fence Attached to Structure                  0.662774
No Deck/Porch                                          0.649635
Latitude                                               0.642336
Year Built (parcel)                                    0.637956
Tile                                                   0.635036
Non Combustible                                        0.635036
Place                                                  0.591241
Motor Home                      

In [65]:
dropped.mean(axis=1).sort_values(ascending=False)[:15].index.intersection(dropped.mean(axis=1).sort_values(ascending=False)[:15].index)

Index(['Non Combustible', 'Road', 'Control', 'Infrastructure',
       'Unknown * Window Pane', 'Unknown * Fence Attached to Structure',
       'No Fence', 'Nonresidential Commercial',
       'Single Family Residence Single Story', 'Mobile Home Triple Wide',
       'Other Minor Structure', 'Trail', 'No Vents', 'Masonry/Concrete',
       'Mesh Screen > 1/8""'],
      dtype='object', name='Dropped')

In [None]:
'Non Combustible', 'Road', 'Control', 'Infrastructure',
       'Unknown * Window Pane', 'Unknown * Fence Attached to Structure',
       'No Fence', 'Nonresidential Commercial',
       'Single Family Residence Single Story', 'Mobile Home Triple Wide',
       'Other Minor Structure', 'Trail', 'No Vents', 'Masonry/Concrete',
       'Mesh Screen > 1/8""'

In [6]:
pd.options.display.max_columns = 1000
vars

Unnamed: 0,Drive,Lane,Other,Place,Road,Trail,Way,Alpine,Apline,Dulzura,Jamul,Non-city,Potrero,Tecate,Infrastructure,Mobile Home Double Wide,Mobile Home Single Wide,Mobile Home Triple Wide,Motor Home,Multi Family Residence Single Story,School,Single Family Residence Multi Story,Single Family Residence Single Story,Utility Misc Structure,Multiple Residence,Nonresidential Commercial,Other Minor Structure,Single Residence,Concrete,Metal,Other.1,Tile,Unknown,Wood,No Eaves,Unenclosed,Unknown.1,"Mesh Screen > 1/8""""",No Vents,Unknown.2,Unscreened,Ignition Resistant,Metal.1,Other.2,Stucco Brick Cement,Unknown.3,Vinyl,Wood.1,No Windows,Single Pane,Unknown.4,Masonry/Concrete,No Deck/Porch,Unknown.5,Wood.2,Masonry/Concrete.1,No Deck/Porch.1,Unknown.6,Wood.3,No Patio Cover/Carport,Non Combustible,Unknown.7,No Fence,Non Combustible.1,Unknown.8,Year Built (parcel),Latitude,Longitude,Month
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.757157,-116.702696,9.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,32.757366,-116.703405,9.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.756605,-116.704037,9.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,32.757064,-116.703795,9.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.757078,-116.698577,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,32.613359,-116.695157,8.0
543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,32.622430,-116.690357,8.0
544,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1987.0,32.639811,-116.632883,8.0
545,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1988.0,32.637856,-116.635110,8.0
