## <HR>Creation of Geographical Features for Wildfires Incidents<HR>

#### Import the Required Packages

In [1]:
import os
import sys
import json
import time
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import xarray as xr
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

In [2]:
DATA_PATH = "data/"

#### 'GeographicalFeaturesGenerator' class to extract the environmental features

In [3]:
class GeographicalFeaturesGenerator:
    """Generate geographical features for wildfire incidents.

    For every incident in ``wildfire_data`` the generator derives:

    * ``land_type``, ``federal_subject``, ``district`` and ``population``,
      each predicted by a 5-nearest-neighbours vote over reference points;
    * the distance to the nearest nature forest, forest, field and city;
    * presence flags and counts of forests / fields / cities within
      several radii;
    * counts of incidents from earlier years (same calendar month) within
      several radii.

    Every neighbour search runs directly on the ``longitude`` / ``latitude``
    columns with scikit-learn's default metric, so distances and radii are
    expressed in the units of those columns (presumably degrees, not
    kilometres -- TODO confirm against the data).

    Russian-cities Data Source:
    https://github.com/pensnarik/russian-cities
    """

    # Coordinate columns every input table must provide, in the order used
    # for all neighbour searches.
    _COORDS = ['longitude', 'latitude']

    def __init__(self, wildfire_data, forest_data, field_data, land_data, nature_forest_data, cities_data):
        """Store the input tables.

        Each argument is a DataFrame with ``longitude`` and ``latitude``
        columns. In addition ``wildfire_data`` needs a datetime ``date``
        column, ``land_data`` a ``land_type`` column and ``cities_data``
        ``subject``, ``district`` and ``population`` columns (these are the
        columns read by ``start``).
        """
        self.wildfire_data = wildfire_data
        self.forest_data = forest_data
        self.field_data = field_data
        self.land_data = land_data
        self.nature_forest_data = nature_forest_data
        self.cities_data = cities_data

    def _predict_label(self, df_train, df_test, label=None):
        """Predict ``label`` for every row of ``df_test`` from its neighbours.

        A 5-nearest-neighbours classifier is fitted on the coordinates of
        ``df_train`` with ``df_train[label]`` as the target and then applied
        to the coordinates of ``df_test``.

        Returns the array of predicted labels, one per row of ``df_test``.
        """
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(df_train[self._COORDS], df_train[label])
        return neigh.predict(df_test[self._COORDS])

    def _get_dst(self, df_train, df_test):
        """Return the distance from each ``df_test`` point to its nearest ``df_train`` point.

        The result is the distance array returned by
        ``NearestNeighbors.kneighbors`` for a single neighbour, i.e. one row
        per ``df_test`` row and one column.
        """
        # n_neighbors must be passed by keyword: scikit-learn made the
        # constructor parameters keyword-only, so ``NearestNeighbors(1)``
        # raises TypeError on current releases.
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(df_train[self._COORDS])
        distances, _ = neigh.kneighbors(df_test[self._COORDS])
        return distances

    def _get_info_radius(self, df_train, df_test, radii, _type):
        """Count the ``df_train`` points lying within each radius of every ``df_test`` point.

        For each ``radius`` in ``radii`` two columns are produced:
        ``has_<_type>_radius_<radius>`` (1 if at least one point is in range,
        else 0) and ``num_<_type>_radius_<radius>`` (the number of points in
        range).

        Returns a DataFrame indexed like ``df_test`` so that it can be
        concatenated column-wise without misaligning rows.
        """
        result = pd.DataFrame(index=df_test.index)
        neigh = NearestNeighbors()
        neigh.fit(df_train[self._COORDS])
        query_points = df_test[self._COORDS]
        for radius in radii:
            # radius_neighbors yields one variable-length array per query point
            distances, _ = neigh.radius_neighbors(query_points, radius=radius)
            count = np.array([len(d) for d in distances], dtype=int)
            result['has_{0}_radius_{1}'.format(_type, radius)] = (count > 0).astype(int)
            result['num_{0}_radius_{1}'.format(_type, radius)] = count
        return result

    def _get_event_count_lastyear(self, wildfire_data, radii=(1.0, 1.5, 2, 2.5)):
        """Count earlier incidents around each incident, month by month.

        For every incident the number of incidents that happened in the same
        calendar month of any *earlier* year (not only the immediately
        preceding one, despite the method name) within each radius is
        counted. Incidents of the first year in the data, and incidents whose
        month has no earlier data, keep a count of 0.

        Side effect: ``year`` and ``month`` columns derived from ``date`` are
        added to ``wildfire_data`` in place.

        Returns a DataFrame indexed like ``wildfire_data`` with one
        ``num_event_lastyear_radius_<radius>`` column per radius.
        """
        wildfire_data['year'] = wildfire_data.date.dt.year
        wildfire_data['month'] = wildfire_data.date.dt.month
        start_year = wildfire_data.year.min()
        end_year = wildfire_data.year.max()

        # One zero-initialised counter per radius, aligned with the input index
        # so the boolean-mask assignments below always line up.
        counters = [pd.Series(np.zeros(wildfire_data.shape[0]), index=wildfire_data.index)
                    for _ in radii]

        for cur_year, month in itertools.product(range(start_year + 1, end_year + 1), range(1, 13)):
            same_month = wildfire_data.month == month
            mask_prev = (wildfire_data.year < cur_year) & same_month
            mask_cur = (wildfire_data.year == cur_year) & same_month
            if not (mask_prev.any() and mask_cur.any()):
                continue
            # The reference set does not depend on the radius: fit once and
            # query it with every radius.
            neigh = NearestNeighbors()
            neigh.fit(wildfire_data.loc[mask_prev, self._COORDS])
            current_points = wildfire_data.loc[mask_cur, self._COORDS]
            for radius, counter in zip(radii, counters):
                distances, _ = neigh.radius_neighbors(current_points, radius=radius)
                counter.loc[mask_cur] = np.array([len(d) for d in distances])

        result = pd.DataFrame(index=wildfire_data.index)
        for radius, counter in zip(radii, counters):
            result['num_event_lastyear_radius_{0}'.format(radius)] = counter
        return result

    def start(self):
        """Run the whole feature extraction and write the result to disk.

        ``self.wildfire_data`` is enriched with every feature described in the
        class docstring and then saved as ``geo_features.csv`` under
        ``DATA_PATH``. Progress and the elapsed time are printed.
        """
        print("="*5,"Features Extraction has started","="*5)
        started_at = time.time()

        # Predict land_type, federal_subject, district and population for the
        # fire incident area. Note that population is handled like any other
        # class label by the classifier.
        label_specs = (
            ('land_type', self.land_data, 'land_type'),
            ('federal_subject', self.cities_data, 'subject'),
            ('district', self.cities_data, 'district'),
            ('population', self.cities_data, 'population'),
        )
        for column, reference, label in label_specs:
            self.wildfire_data[column] = self._predict_label(reference, self.wildfire_data, label=label)

        # Minimum distance from each fire incident to a nature forest, forest,
        # field and city.
        distance_specs = (
            ('nature_forest_dst', self.nature_forest_data),
            ('forest_dst', self.forest_data),
            ('field_dst', self.field_data),
            ('city_dst', self.cities_data),
        )
        for column, reference in distance_specs:
            self.wildfire_data[column] = self._get_dst(reference, self.wildfire_data)

        # Presence flags and counts of forests / fields / cities within radii.
        radius_specs = (
            (self.forest_data, [0.2, 0.5, 1.0], 'forest'),
            (self.field_data, [0.2, 0.5, 1.0], 'field'),
            (self.cities_data, [5, 10, 15], 'cities'),
        )
        for reference, radii, kind in radius_specs:
            self.wildfire_data = pd.concat(
                [self.wildfire_data,
                 self._get_info_radius(reference, self.wildfire_data, radii, kind)],
                axis=1)

        # Counts of incidents from earlier years around each incident.
        self.wildfire_data = pd.concat(
            [self.wildfire_data,
             self._get_event_count_lastyear(self.wildfire_data)],
            axis=1)

        self.wildfire_data.to_csv(DATA_PATH + 'geo_features.csv', index=False)
        finished_at = time.time()
        print("\n","="*5,"Features Extraction has finished","="*5)
        print("Total time taken", finished_at-started_at, " sec")

#### 'load_Dataset' method to load the data into the memory

In [4]:
def load_Dataset(data_path=None):
    """Load every table consumed by ``GeographicalFeaturesGenerator``.

    Parameters
    ----------
    data_path : str, optional
        Directory holding ``wildfires_train.csv`` and the ``additional``
        sub-directory. Defaults to the module-level ``DATA_PATH``. A trailing
        path separator is not required.

    Returns
    -------
    tuple of DataFrame
        ``(wildfire_data, forest_data, field_data, land_data,
        nature_forest_data, cities_data)`` in the order expected by the
        ``GeographicalFeaturesGenerator`` constructor.
    """
    if data_path is None:
        data_path = DATA_PATH
    additional_dir = os.path.join(data_path, 'additional')

    # Wildfire incidents; 'date' is parsed to datetime on load.
    wildfire_data = pd.read_csv(os.path.join(data_path, 'wildfires_train.csv'), parse_dates=['date'])

    # Forest and field reference points.
    forest_data = pd.read_csv(os.path.join(additional_dir, 'forest_coords.csv'))
    field_data = pd.read_csv(os.path.join(additional_dir, 'field_coords.csv'))
    # Every field row gets the code 2 so fields can be told apart from forest
    # rows once both are merged (presumably forest rows carry 1 -- TODO confirm).
    field_data['field'] = 2

    # Land data: forest and field points stacked under one 'land_type' column.
    land_data = pd.concat(
        [forest_data.rename(columns={'forest': 'land_type'}),
         field_data.rename(columns={'field': 'land_type'})],
        ignore_index=True)

    # Nature forests.
    nature_forest_data = pd.read_csv(os.path.join(additional_dir, 'nature_forests.csv'))

    # Cities: flatten the nested 'coords' object into latitude / longitude
    # columns placed in front of the remaining city attributes.
    cities_raw = pd.read_json(os.path.join(additional_dir, 'russian-cities.json'))
    coords = pd.json_normalize(cities_raw.coords)
    cities_data = (pd.concat([coords, cities_raw], axis=1)
                   .drop(columns='coords')
                   .rename(columns={"lat": "latitude", "lon": "longitude"}))
    # Replace the text values with integer category codes.
    for column in ('district', 'subject'):
        cities_data[column] = cities_data[column].astype('category').cat.codes

    return wildfire_data, forest_data, field_data, land_data, nature_forest_data, cities_data

#### Extract Geographical Features

In [5]:
# Load all input tables, then run the complete feature extraction,
# which also writes geo_features.csv under DATA_PATH.
datasets = load_Dataset()
generator = GeographicalFeaturesGenerator(*datasets)
generator.start()

===== Features Extraction has started =====

 ===== Features Extraction has finished =====
Total time taken 83.87114191055298  sec


#### Missing Values in features data

In [12]:
# Reload the feature table written by the extraction step.
geo_features_path = DATA_PATH + 'geo_features.csv'
df_geo_features = pd.read_csv(geo_features_path)

In [13]:
# Per-column summary of missing values: absolute count and percentage of rows.
null_counts = df_geo_features.isnull().sum(axis=0)
missing_value = null_counts.to_frame(name='Count')
missing_value['Pert'] = missing_value['Count'] * 100 / len(df_geo_features)
# Worst columns first; only columns that actually have gaps are displayed.
missing_value = missing_value.sort_values(by='Count', ascending=False)
missing_value.loc[missing_value['Count'] > 0]

Unnamed: 0,Count,Pert


In [11]:
#there is no missing values