# 2. Notebook : Dataset Balance

In [None]:
#%pip uninstall azure-ai-ml
#%pip install azure-ai-ml
#%pip show azure-ai-ml #(the version value needs to be 1.5.0 or later)
#%pip install -U mltable azureml-dataprep[pandas]
#%pip install plotly==5.17.0
#%pip install seaborn
#%pip install --upgrade nbformat
#%pip install tensorflow
#%conda install graphviz

In [1]:
import warnings
# Silence every Python warning for the rest of the session. This also hides pandas
# deprecation and SettingWithCopy warnings that later cells would otherwise raise.
warnings.filterwarnings('ignore')
import pandas as pd
# Remove the display truncation limits so frames render with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
import numpy as np
from azureml.core import Workspace, Dataset
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
from datetime import datetime, timedelta
import matplotlib as mpl
import matplotlib.pyplot as plt
import random


##### Forest Fire Data

In [4]:
# Azure ML workspace coordinates; `workspace` is reused by the Dataset.get_by_name calls
# further down in this notebook.
# NOTE(review): the subscription id is hard-coded here; consider reading it from
# configuration or an environment variable before sharing this notebook.
subscription_id = 'f0458613-2441-4d07-bf43-8d17d9e90a56'
resource_group = 'capstone-project'
workspace_name = 'ML_forestfireprediction'

workspace = Workspace(subscription_id, resource_group, workspace_name)

# Fetch the registered 'forestfire' dataset and materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='forestfire')
forest_fire_dataset=dataset.to_pandas_dataframe()

In [None]:
#forest_fire_dataset.head()

In [12]:
#forest_fire_dataset.info()
#forest_fire_dataset['SIZE_HA'].isnull().sum()


In [5]:
# Fires with no recorded OUT_DATE fall back to their REP_DATE.
reported_dates = forest_fire_dataset['REP_DATE']
forest_fire_dataset['OUT_DATE'] = forest_fire_dataset['OUT_DATE'].fillna(reported_dates)

# Round-trip SIZE_HA through str so the column ends up with a float dtype.
size_as_text = forest_fire_dataset['SIZE_HA'].astype(str)
forest_fire_dataset['SIZE_HA'] = size_as_text.astype(float)

##### Weather Station Info
Used to find the closest weather station for each forest fire.

In [6]:
# Fetch the registered 'weatherforestfire' dataset (fire / weather-station pairs, judging by
# the FOREST_FIRE_* and WEATHER_STATION_* columns used below) as a pandas DataFrame.
# Note: `dataset` is rebound here; it no longer refers to the 'forestfire' dataset.
dataset = Dataset.get_by_name(workspace, name='weatherforestfire')
forest_fire_station_point=dataset.to_pandas_dataframe()

Message: rslex failed, falling back to clex.
Payload: {"pid": 4694, "source": "azureml.dataprep", "version": "4.11.3", "trace": "azureml|data|tabular_dataset.py, line 169 in function <lambda>.\nazureml|data|dataset_error_handling.py, line 107 in function _try_execute.\nazureml|data|tabular_dataset.py, line 169 in function to_pandas_dataframe.", "subscription": "", "run_id": "", "resource_group": "", "workspace_name": "", "experiment_id": "", "location": "", "rslex_version": "2.18.3"}
thread 'tokio-runtime-worker' panicked at 'send should succeed: SendError { .. }', /src/dataprep/Core/rust_lex/rslex-http-stream/src/http_client/hyper_client/execution.rs:78:37
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


In [None]:
#forest_fire_station_point.info()

In [None]:
#forest_fire_station_point.head()

In [7]:
# Cast the station id to float; the weather table's CLIMATE_IDENTIFIER is cast the same way
# before both are renamed to ClimateID and used as a merge key.
forest_fire_station_point['CLIMATE_IDENTIFIER']=forest_fire_station_point['CLIMATE_IDENTIFIER'].astype(float)

##### Distance Calculation

In [8]:
# Straight-line distance between each fire and each candidate station, computed directly
# on the latitude/longitude values (i.e. in degrees, not a geodesic distance).
lat_gap = forest_fire_station_point['FOREST_FIRE_LATITUDE'] - forest_fire_station_point['WEATHER_STATION_LATITUDE']
lon_gap = forest_fire_station_point['FOREST_FIRE_LONGITUDE'] - forest_fire_station_point['WEATHER_STATION_LONGITUDE']
forest_fire_station_point['Euclideandist'] = np.sqrt(lat_gap**2 + lon_gap**2)

# Parse the report date and the station's daily-record coverage bounds as datetimes
# so they can be compared with each other.
for date_column in ('REP_DATE', 'DLY_FIRST_DATE', 'DLY_LAST_DATE'):
    forest_fire_station_point[date_column] = pd.to_datetime(forest_fire_station_point[date_column])

# Distinct fire ids to iterate over, and an empty accumulator for the matched rows.
unique_fid = forest_fire_station_point['FID'].unique()
mapped_df = pd.DataFrame()

###### Assign forest fire days weather data

In [9]:
# For every fire id, keep the candidate station row(s) at minimum distance among the
# stations whose daily record covers the fire's report date.
#
# The per-fire results are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all accumulated rows on every iteration, and
# appending to a frame created in another cell duplicated rows whenever this cell was re-run.
closest_station_frames = []
for fid in unique_fid:
    candidates = forest_fire_station_point.loc[forest_fire_station_point['FID'] == fid]
    # Drop stations whose daily record ended before, or started after, the report date.
    candidates = candidates.drop(candidates[candidates.REP_DATE > candidates.DLY_LAST_DATE].index)
    candidates = candidates.drop(candidates[candidates.REP_DATE < candidates.DLY_FIRST_DATE].index)
    # Ties on the minimum distance are all kept; a fire with no remaining candidate
    # contributes an empty frame.
    closest_station_frames.append(
        candidates.loc[candidates['Euclideandist'] == candidates['Euclideandist'].min()]
    )

# pd.concat raises on an empty list, so fall back to an empty frame when there are no fires.
mapped_df = pd.concat(closest_station_frames, ignore_index=True) if closest_station_frames else pd.DataFrame()

In [None]:
#mapped_df.sample(n=5)

In [13]:
# Align the key column names with the weather table (ClimateID / Date), parse OUT_DATE,
# and drop the station coverage bounds that were only needed to pick a station.
merge_key_names = {'CLIMATE_IDENTIFIER': 'ClimateID', 'REP_DATE': 'Date'}
coverage_columns = ['DLY_FIRST_DATE', 'DLY_LAST_DATE']

mapped_df = (
    mapped_df
    .rename(columns=merge_key_names)
    .assign(OUT_DATE=lambda frame: pd.to_datetime(frame['OUT_DATE'], format='%Y-%m-%d'))
    .drop(columns=coverage_columns)
)

##### Weather Data

In [14]:
# Fetch the registered 'weatherdata' dataset and materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='weatherdata')
weather=dataset.to_pandas_dataframe()

In [15]:
# Work on a copy so the frame loaded from Azure (`weather`) stays untouched.
weather_df=weather.copy()
# Cast the station id to float, matching the cast applied to the fire/station table.
weather_df['CLIMATE_IDENTIFIER']=weather_df['CLIMATE_IDENTIFIER'].astype(float)
#weather_df.info()

###### Data Pre-Process

1. Remove trailing spaces in object dtype

In [16]:
def _strip_whitespace(series):
    """Return the series with leading and trailing whitespace removed from each string."""
    return series.str.strip()


# Apply the strip to every object-dtype column of the weather table.
cols = weather_df.select_dtypes(include=['object']).columns
weather_df[cols] = weather_df[cols].apply(_strip_whitespace)

2. Replace None/M values in columns

In [17]:
def replaceString(column):
    """Fill the missing entries of ``weather_df[column]`` with ``np.nan``, in place.

    Parameters
    ----------
    column : str
        Name of the ``weather_df`` column to normalise.

    NOTE(review): the section heading mentions 'M' markers, but only missing values
    are touched here; literal 'M' strings would be left as they are. Confirm whether
    the data contains them.
    """
    normalised = weather_df[column].fillna(value=np.nan)
    weather_df[column] = normalised


# Measurement columns to normalise, processed in this order.
for measurement_column in (
    'MEAN_TEMPERATURE',
    'MAX_REL_HUMIDITY',
    'COOLING_DEGREE_DAYS',
    'MIN_REL_HUMIDITY',
    'TOTAL_PRECIPITATION',
    'MAX_TEMPERATURE',
    'SNOW_ON_GROUND',
    'TOTAL_SNOW',
    'HEATING_DEGREE_DAYS',
    'DIRECTION_MAX_GUST',
    'MIN_TEMPERATURE',
    'SPEED_MAX_GUST',
    'TOTAL_RAIN',
):
    replaceString(measurement_column)


3. Fill missing values with the previous value, or with '0' where no previous value exists (early 1960s)

In [18]:
# Forward-fill gaps from the previous row, then zero-fill whatever is still missing
# (entries that have no earlier row to copy from).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling; the result
# is the same.
# NOTE(review): the forward fill runs over the whole frame in its current row order, so a
# gap at the start of one station's records can inherit the previous station's value.
# Confirm the row ordering makes this acceptable.
weather_df = weather_df.ffill()
weather_df = weather_df.fillna(0)

In [None]:
#weather_df.isnull().any()

In [19]:
# Give the weather table the same key names as mapped_df (ClimateID / Date) and parse
# the date key as datetime.
weather_key_names = {'CLIMATE_IDENTIFIER': 'ClimateID', 'LOCAL_DATE': 'Date'}
weather_df = weather_df.rename(columns=weather_key_names)

parsed_dates = pd.to_datetime(weather_df['Date'])
weather_df['Date'] = parsed_dates

##### Merge Datasets

In [None]:
#weather_df.info()

In [20]:
# Attach the matched station's weather for the fire's report date. The inner join keeps
# only fires that have a weather record for the same ClimateID and Date.
merged_df = pd.merge(mapped_df, weather_df, on=['ClimateID','Date'], how='inner')

In [None]:
#merged_df[['FID','FIRE_ID','FOREST_FIRE_LATITUDE','FOREST_FIRE_LONGITUDE','Date','OUT_DATE','SIZE_HA','YEAR','MONTH','DAY','ClimateID','WEATHER_STATION_LATITUDE','WEATHER_STATION_LONGITUDE','MEAN_TEMPERATURE',	'STATION_NAME',
#'MAX_REL_HUMIDITY',	'COOLING_DEGREE_DAYS','MIN_REL_HUMIDITY','TOTAL_SNOW','TOTAL_PRECIPITATION']].sample(n=7)

##### Merge Data Validation

In [None]:
#mapped_df.loc[mapped_df['FID']==221761]

In [None]:
#mapped_df.loc[mapped_df['FID']==221779]

<b> As seen the data for a given Fire ID is associated with weather data for that day captured by the weather station.</b>

##### Balance Dataset

In [21]:
# Every row of merged_df comes from a recorded fire, so label it as the positive class.
merged_df['FIRE_EVENT']=1
#merged_df.head(3)

##### Include non-fire event weather data

In [23]:
# Number of fire rows in merged_df; the sampling loop in the next cell iterates over
# this many row positions.
length=len(merged_df)
#n_days=10

In [28]:
# Build the FIRE_EVENT == 0 rows: for every fire after 1960, take the matched station's
# weather for a random window of 1-15 days leading up to the fire and tag it as "no fire".
#
# Changes from the previous version of this cell:
#   * the fire date itself is excluded from the window; it used to be included, so the
#     same station/day weather appeared with both FIRE_EVENT == 1 and FIRE_EVENT == 0;
#   * columns are added with .assign() on the filtered rows instead of item assignment
#     on a slice (which triggers SettingWithCopy);
#   * frames are collected in a list and concatenated once instead of inside the loop;
#   * the random draws come from a dedicated seeded generator, so re-runs are reproducible.
NON_FIRE_SEED = 42
non_fire_rng = random.Random(NON_FIRE_SEED)

# Sentinel OUT_DATE stamped on every non-fire row.
NON_FIRE_OUT_DATE = pd.to_datetime('1800-12-31', format='%Y-%m-%d')

non_fire_frames = []
for i in range(length):
    # `not (... > 1960)` also skips rows whose YEAR is missing, as before.
    if not (merged_df["YEAR"].values[i] > 1960):
        continue

    latestDate = pd.to_datetime(merged_df["Date"].values[i])
    n_days_ago = latestDate - timedelta(days=non_fire_rng.randint(1, 15))
    climate_id = merged_df['ClimateID'].values[i]

    # Weather of the matched station only, restricted to [n_days_ago, latestDate).
    station_weather = weather_df[weather_df['ClimateID'] == climate_id]
    in_window = (station_weather['Date'] >= n_days_ago) & (station_weather['Date'] < latestDate)

    non_fire_frames.append(
        station_weather[in_window].assign(
            # Identification and location are copied from the fire this window belongs to.
            FID=merged_df['FID'].values[i],
            FIRE_ID=merged_df['FIRE_ID'].values[i],
            FOREST_FIRE_LATITUDE=merged_df['FOREST_FIRE_LATITUDE'].values[i],
            FOREST_FIRE_LONGITUDE=merged_df['FOREST_FIRE_LONGITUDE'].values[i],
            WEATHER_STATION_LATITUDE=merged_df['WEATHER_STATION_LATITUDE'].values[i],
            WEATHER_STATION_LONGITUDE=merged_df['WEATHER_STATION_LONGITUDE'].values[i],
            OUT_DATE=NON_FIRE_OUT_DATE,
            # One random value in {-1, 0, 1} per fire, shared by all rows of its window.
            # NOTE(review): presumably a placeholder size for "no fire" - confirm intent.
            SIZE_HA=non_fire_rng.randint(-1, 1),
            # YEAR/MONTH/DAY are those of the fire, not of the sampled weather row.
            YEAR=merged_df['YEAR'].values[i],
            MONTH=merged_df['MONTH'].values[i],
            DAY=merged_df['DAY'].values[i],
            Euclideandist=-1,
            FIRE_EVENT=0,
        )
    )

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was sampled.
balanced_dataset_tmp = pd.concat(non_fire_frames, ignore_index=True) if non_fire_frames else pd.DataFrame()

In [29]:
# Stack the fire rows (FIRE_EVENT == 1) on top of the sampled non-fire rows
# (FIRE_EVENT == 0) and renumber the index.
balanced_dataset=pd.concat([merged_df,balanced_dataset_tmp], ignore_index=True)

In [None]:
#balanced_dataset.head()

In [30]:
# Rows that still have no OUT_DATE fall back to their own Date.
fallback_dates = balanced_dataset['Date']
balanced_dataset['OUT_DATE'] = balanced_dataset['OUT_DATE'].fillna(fallback_dates)

# Split OUT_DATE into separate day / month / year columns.
out_date_parts = {
    'LOCAL_OUT_DAY': 'day',
    'LOCAL_OUT_MONTH': 'month',
    'LOCAL_OUT_YEAR': 'year',
}
for part_column, dt_attribute in out_date_parts.items():
    balanced_dataset[part_column] = getattr(balanced_dataset['OUT_DATE'].dt, dt_attribute)

In [34]:
# Persist the combined dataset to dataset_final.csv in the working directory. The frame's
# own column names are passed as the header; the row index is written as well because
# `index` is left at its default.
balanced_dataset.to_csv("dataset_final.csv", header=balanced_dataset.columns)


#### The dataset was saved to CSV because the storage space available to this notebook was exceeded.