In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the wildfire records, discard the listed columns, and keep only the
# rows whose County is San Diego.
fires = (
    pd.read_csv("wildfires.csv")
    .drop(columns=["_id", "OBJECTID", "APN (parcel)", "x", "y"])
    .query("County == 'San Diego'")
)
fires["Incident Start Date"] = pd.to_datetime(fires["Incident Start Date"])
fires["Month"] = fires["Incident Start Date"].dt.month
# Give missing cities an explicit label so the dropna() below does not
# discard those rows just because "* City" is empty.
fires["* City"] = fires["* City"].fillna("Non-city")

# Feature columns, split by how the preprocessor treats them.
cat_cols = [
    "* Street Type (e.g. road, drive, lane, etc.)",
    "* City",
    "County",
    "* Structure Type",
    "Structure Category",
    "* Roof Construction",
    "* Eaves",
    "* Vent Screen",
    "* Exterior Siding",
    "* Window Pane",
    "* Deck/Porch On Grade",
    "* Deck/Porch Elevated",
    "* Patio Cover/Carport Attached to Structure",
    "* Fence Attached to Structure",
]
num_cols = ["Year Built (parcel)", "Latitude", "Longitude", "Month"]
feature_cols = cat_cols + num_cols
target_col = "* Damage"

# One-hot encode the categorical columns and pass the numeric ones through
# unchanged; every other column is dropped.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, i.e. the same as the dropped first
# category. "County" is constant after the filter above, so it yields no
# encoded columns.
preproc = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
        ("numerical", "passthrough", num_cols),
    ],
    remainder="drop",
)
# The preprocessor is deliberately NOT fitted on the full frame here: the
# pipeline below fits it on the training split only, and a separate
# fit_transform(fires) whose result is discarded is wasted work.

# Keep only rows that are complete in every feature and in the target.
data = fires[feature_cols + [target_col]].dropna()
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],
    data[target_col],
    test_size=0.25,
    random_state=1,
)

# Seed the forest as well as the split so the reported score is reproducible.
processing = Pipeline(
    [
        ("preproc", preproc),
        ("tree", RandomForestClassifier(random_state=1)),
    ]
)
processing.fit(X_train, y_train)
# Mean accuracy on the held-out 25%. An earlier unseeded run of this cell
# scored 0.8832116788321168; the seeded value may differ slightly.
tree_score = processing.score(X_test, y_test)

  # Read wildfires.csv, discard the listed columns, and keep only the rows
  # whose County is San Diego.
  fires = (
      pd.read_csv("wildfires.csv")
      .drop(columns=["_id", "OBJECTID", "APN (parcel)", "x", "y"])
      .query("County == 'San Diego'")
  )
  # Parse the incident start date column into pandas datetimes.
  fires["Incident Start Date"] = pd.to_datetime(fires["Incident Start Date"])
