In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score

In [6]:
%load_ext kedro.ipython

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [7]:
table = catalog.load('input_table')

In [4]:
def extract_sample(df, sample_size):
    # Calculate the fraction of the data to sample
    fraction = sample_size / len(df)

    # Extract a representative sample
    sample = df.sample(frac=fraction, random_state=42)  # Setting random_state for reproducibility

    return sample

sample_size = 10000
table_sample = extract_sample(table, sample_size)
table_sample.head()

In [None]:
def get_mode(series):
    mode = series.mode()
    if not mode.empty:
        return mode.iloc[0]
    else:
        return None  # or a default value like 'unknown'
    
mode = get_mode(table_sample['time_of_day'])

In [None]:
# Convert CREATION_DATE to datetime format
table_sample['CREATION_DATE'] = pd.to_datetime(table_sample['CREATION_DATE'])

# Add columns for year and month
table_sample['year'] = table_sample['CREATION_DATE'].dt.year

# Define categorical and numerical features
categorical_features = ['USE_CATEGORY', 'USE_CODE', 'YEAR_CONSTRUCTION', 'time_of_day', 'day_of_week']
numerical_features = ['grid_lat', 'grid_long', 'distance_to_fire_station', 'ABOVE_GROUND_FLOORS', 'AREA_BUILDING', 'AREA_LAND', 'building_age', 'year', 'month', 'time_of_day', 'day_of_week']

# Aggregate the data by grid_lat, grid_long, year, and month
aggregated_data = table_sample.groupby(['grid_lat', 'grid_long', 'year', 'month']).agg({
    'USE_CATEGORY': get_mode,
    'USE_CODE': get_mode,
    'YEAR_CONSTRUCTION': 'mean',
    'is_fire': 'sum',  # Sum to count the number of fire incidents
    'ABOVE_GROUND_FLOORS': 'mean',
    'AREA_BUILDING': 'mean',
    'AREA_LAND': 'mean',
    'building_age': 'mean',
    'time_of_day': get_mode,
    'day_of_week': get_mode,
    'month': get_mode,
    'year': get_mode,
    'distance_to_fire_station': 'mean',

}).reset_index()

# Create target labels indicating whether a fire occurred in the next month
aggregated_data['is_fire_next_month'] = aggregated_data.groupby(['grid_lat', 'grid_long'])['is_fire'].shift(-1).fillna(0).astype(int)

# Fill missing categorical values with 'missing' and numerical with the mean
for col in categorical_features:
    aggregated_data[col].fillna('missing', inplace=True)

for col in numerical_features:
    aggregated_data[col].fillna(aggregated_data[col].mean(), inplace=True)

# Ensure all data types are correctly set for the pipeline
for col in categorical_features:
    aggregated_data[col] = aggregated_data[col].astype(str)

# Drop rows with missing target
aggregated_data.dropna(subset=['is_fire_next_month'], inplace=True)

In [None]:
# Prepare the data
features = categorical_features + numerical_features + ['year', 'month']
X = aggregated_data[features]
y = aggregated_data['is_fire_next_month']

# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features + ['year', 'month']),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessing and Random Forest model
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])


In [None]:
# Split the data into training and testing sets using TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the Random Forest model
    pipeline_rf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline_rf.predict(X_test)
    y_prob = pipeline_rf.predict_proba(X_test)[:, 1]

    # Print out some evaluation metrics
    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

In [None]:
# Extract feature importance from the Random Forest model
feature_importances = pipeline_rf.named_steps['regressor'].feature_importances_

# Get the feature names from the preprocessing pipeline
transformed_feature_names = pipeline_rf.named_steps['preprocessor'].get_feature_names_out()

# Debug: Print lengths of feature_names and feature_importances
print(f"Length of transformed feature names: {len(transformed_feature_names)}")
print(f"Length of feature importances: {len(feature_importances)}")

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
}).sort_values(by='importance', ascending=False)

# Aggregate importance for categorical features
cat_feature_importance = {}
for col in categorical_features:
    cat_cols = [col_name for col_name in transformed_feature_names if col in col_name]
    total_importance = feature_importance_df[feature_importance_df['feature'].isin(cat_cols)]['importance'].sum()
    cat_feature_importance[col] = total_importance

# Aggregate importance for numerical features
num_feature_importance = {
    col: feature_importance_df[feature_importance_df['feature'] == col]['importance'].values[0]
    for col in numerical_features + ['year', 'month']
    if not feature_importance_df[feature_importance_df['feature'] == col].empty
}

# Combine categorical and numerical importances
all_feature_importances = {**cat_feature_importance, **num_feature_importance}

# Create a DataFrame for grouped feature importances
grouped_feature_importance = pd.DataFrame({
    'feature': list(all_feature_importances.keys()),
    'importance': list(all_feature_importances.values())
}).sort_values(by='importance', ascending=False)

# Plot grouped feature importances
plt.figure(figsize=(10, 8))
plt.barh(grouped_feature_importance['feature'], grouped_feature_importance['importance'])
plt.xlabel('Feature Importance')
plt.title('Grouped Feature Importance in Random Forest Model')
plt.gca().invert_yaxis()
plt.show()

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score

def get_mode(series):
    mode = series.mode()
    if not mode.empty:
        return mode.iloc[0]
    else:
        return None  # or a default value like 'unknown'

# Group by grid location and month
aggregated_data = table.groupby(['grid_lat', 'grid_long', 'year', 'month']).agg({
    'USE_CATEGORY': get_mode,
    'YEAR_CONSTRUCTION': 'mean',
    'is_fire': 'sum',  # Sum to count number of fire incidents
    'ABOVE_GROUND_FLOORS': 'mean',
    'AREA_BUILDING': 'mean',
    'AREA_LAND': 'mean',
    'HOUSING_UNITS': 'mean',
    'distance_to_fire_station': 'mean',
    'building_age': 'mean',
    'AVERAGE_FAMILY_SIZE': 'mean',
    'POPULATION_DENSITY': 'mean',
    '2021_POPULATION': 'mean'
}).reset_index()

# Fill missing categorical values with 'missing' and numerical with the mean
categorical_features = ['USE_CATEGORY']
numerical_features = ['grid_lat', 'grid_long', 'ABOVE_GROUND_FLOORS', 'AREA_BUILDING', 'AREA_LAND', 'YEAR_CONSTRUCTION', 'HOUSING_UNITS', 'building_age', 'distance_to_fire_station', 'AVERAGE_FAMILY_SIZE', 'POPULATION_DENSITY', '2021_POPULATION']

for col in categorical_features:
    aggregated_data[col].fillna('missing', inplace=True)

for col in numerical_features:
    aggregated_data[col].fillna(aggregated_data[col].mean(), inplace=True)

# Create lag features for previous months
for lag in range(1, 4):
    lagged_features = aggregated_data.copy()
    lagged_features['month'] += lag
    lagged_features.columns = [col + f'_lag{lag}' if col not in ['grid_lat', 'grid_long', 'year', 'month'] else col for col in lagged_features.columns]
    aggregated_data = pd.merge(aggregated_data, lagged_features, on=['grid_lat', 'grid_long', 'year', 'month'], how='left')

# Ensure all data types are correctly set for the pipeline
for col in categorical_features:
    aggregated_data[col] = aggregated_data[col].astype(str)

# Drop rows with missing target
aggregated_data.dropna(subset=['is_fire'], inplace=True)

# Convert 'is_fire' to a binary numerical column for classification purposes (e.g., threshold at 0.5)
aggregated_data['is_fire'] = (aggregated_data['is_fire'] > 0.5).astype(int)

# Prepare the data
features = [col for col in aggregated_data.columns if col not in ['is_fire', 'year', 'month']]
X = aggregated_data[features]
y = aggregated_data['is_fire']

# Create transformers for numerical and categorical features
numerical_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with SMOTE, preprocessing, and Random Forest model
pipeline_rf = imbpipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('regressor', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

# Time series split
tscv = TimeSeriesSplit(n_splits=5)
roc_auc_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit the Random Forest model
    pipeline_rf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline_rf.predict(X_test)
    y_prob = pipeline_rf.predict_proba(X_test)[:, 1]

    # Print out some evaluation metrics
    print(classification_report(y_test, y_pred))
    roc_auc = roc_auc_score(y_test, y_prob)
    roc_auc_scores.append(roc_auc)
    print("ROC AUC Score:", roc_auc)

print("Average ROC AUC Score:", np.mean(roc_auc_scores))


              precision    recall  f1-score   support

           0       0.90      0.55      0.68     88056
           1       0.14      0.53      0.22     11744

    accuracy                           0.55     99800
   macro avg       0.52      0.54      0.45     99800
weighted avg       0.81      0.55      0.63     99800

ROC AUC Score: 0.5574871639113913


              precision    recall  f1-score   support

           0       0.87      0.77      0.82     86864
           1       0.13      0.23      0.17     12936

    accuracy                           0.70     99800
   macro avg       0.50      0.50      0.49     99800
weighted avg       0.78      0.70      0.74     99800

ROC AUC Score: 0.5165537655527138


              precision    recall  f1-score   support

           0       0.86      0.53      0.65     83461
           1       0.19      0.56      0.28     16339

    accuracy                           0.53     99800
   macro avg       0.52      0.54      0.47     99800
weighted avg       0.75      0.53      0.59     99800

ROC AUC Score: 0.5562393438651337


              precision    recall  f1-score   support

           0       0.87      0.54      0.66     86451
           1       0.14      0.47      0.21     13349

    accuracy                           0.53     99800
   macro avg       0.50      0.51      0.44     99800
weighted avg       0.77      0.53      0.60     99800

ROC AUC Score: 0.5087010300634894


              precision    recall  f1-score   support

           0       0.87      0.59      0.70     86951
           1       0.12      0.39      0.19     12849

    accuracy                           0.56     99800
   macro avg       0.49      0.49      0.44     99800
weighted avg       0.77      0.56      0.64     99800

ROC AUC Score: 0.4877901645151229
Average ROC AUC Score: 0.5253542935815702


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score

def get_mode(series):
    mode = series.mode()
    if not mode.empty:
        return mode.iloc[0]
    else:
        return None  # or a default value like 'unknown'

# Group by grid location and month
aggregated_data = table.groupby(['grid_lat', 'grid_long', 'year', 'month']).agg({
    'USE_CATEGORY': get_mode,
    'YEAR_CONSTRUCTION': 'mean',
    'is_fire': 'sum',  # Sum to count number of fire incidents
    'ABOVE_GROUND_FLOORS': 'mean',
    'AREA_BUILDING': 'mean',
    'AREA_LAND': 'mean',
    'HOUSING_UNITS': 'mean',
    'distance_to_fire_station': 'mean',
    'building_age': 'mean',
    'AVERAGE_FAMILY_SIZE': 'mean',
    'POPULATION_DENSITY': 'mean',
    '2021_POPULATION': 'mean'
}).reset_index()

# Fill missing categorical values with 'missing' and numerical with the mean
categorical_features = ['USE_CATEGORY']
numerical_features = ['grid_lat', 'grid_long', 'ABOVE_GROUND_FLOORS', 'AREA_BUILDING', 'AREA_LAND', 'YEAR_CONSTRUCTION', 'HOUSING_UNITS', 'building_age', 'distance_to_fire_station', 'AVERAGE_FAMILY_SIZE', 'POPULATION_DENSITY', '2021_POPULATION']

for col in categorical_features:
    aggregated_data[col].fillna('missing', inplace=True)

for col in numerical_features:
    aggregated_data[col].fillna(aggregated_data[col].mean(), inplace=True)

# Create lag features for previous months
for lag in range(1, 4):
    lagged_features = aggregated_data.copy()
    lagged_features['month'] += lag
    lagged_features.columns = [col + f'_lag{lag}' if col not in ['grid_lat', 'grid_long', 'year', 'month'] else col for col in lagged_features.columns]
    aggregated_data = pd.merge(aggregated_data, lagged_features, on=['grid_lat', 'grid_long', 'year', 'month'], how='left')

# Ensure all data types are correctly set for the pipeline
for col in categorical_features:
    aggregated_data[col] = aggregated_data[col].astype(str)

# Drop rows with missing target
aggregated_data.dropna(subset=['is_fire'], inplace=True)

# Convert 'is_fire' to a binary numerical column for classification purposes (e.g., threshold at 0.5)
aggregated_data['is_fire'] = (aggregated_data['is_fire'] > 0.5).astype(int)

# Prepare the data
features = [col for col in aggregated_data.columns if col not in ['is_fire', 'year', 'month']]
X = aggregated_data[features]
y = aggregated_data['is_fire']

# Create transformers for numerical and categorical features
numerical_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with SMOTE, preprocessing, and Random Forest model
pipeline_rf = imbpipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('regressor', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Time series split
tscv = TimeSeriesSplit(n_splits=5)

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=tscv, scoring='roc_auc', n_jobs=-1)

# Fit the model
grid_search.fit(X, y)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best ROC AUC Score: ", grid_search.best_score_)

# Predict on the test set using the best model
best_model = grid_search.best_estimator_

# Print out some evaluation metrics using cross-validation results
results = grid_search.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Mean ROC AUC Score: {mean_score:.4f} | Parameters: {params}")
