# Análise dos dados de queimadas nos estados brasileiros

O objetivo deste trabalho consiste em estimar a quantidade de focos de incêndio nos estados brasileiros em anos futuros, considerando os dados do ano atual e dos anos anteriores.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# from matplotlib import rcParams
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# from sklearn import metrics as m
# from sklearn.metrics import accuracy_score 
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from enum import Enum

#Teste
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

In [2]:
class DataSourceEnum(Enum):
    """Identifiers for the remote locations a CSV can be fetched from."""
    # Google Drive share link; the only source currently declared.
    GOOGLE_DRIVE = 'google drive'
    
class DataSourceNotFoundException(Exception):
    """Error raised when the requested data source is not a supported one."""

    def __init__(self):
        # Fixed, user-facing message (kept in Portuguese on purpose).
        mensagem = "Fonte para buscar dados é inválida."
        super().__init__(mensagem)
    
    
    
    
def fetchCSV(url: str, source: DataSourceEnum, encoding: str = 'UTF-8'):
    """Download a CSV from a supported source and load it into a DataFrame.

    Args:
        url: Share link of the file. The file id is taken from the
            second-to-last '/'-separated segment, so the link must look
            like ``.../<id>/<suffix>``.
        source: Where the file is hosted; only
            ``DataSourceEnum.GOOGLE_DRIVE`` is accepted.
        encoding: Text encoding passed to ``pd.read_csv``. A falsy value
            lets pandas use its own default.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        DataSourceNotFoundException: If ``source`` is not supported.
    """
    # Guard clause: reject anything that is not a known source.
    if source != DataSourceEnum.GOOGLE_DRIVE:
        raise DataSourceNotFoundException()

    fileId = url.split('/')[-2]
    csvData = "https://drive.google.com/uc?id=" + fileId

    return pd.read_csv(csvData, encoding=encoding) if encoding else pd.read_csv(csvData)


def cleanData(dataframe, inplace=False):
    """Drop rows with missing values and then duplicated rows.

    Args:
        dataframe: The pandas DataFrame to clean.
        inplace: When True the given frame itself is modified; when False
            (default) it is left untouched and a cleaned copy is built.

    Returns:
        The cleaned DataFrame. With ``inplace=True`` this is the same object
        that was passed in, so ``df = cleanData(df, inplace=True)`` is safe.
    """
    if inplace:
        # pandas in-place operations return None, so they cannot be chained.
        dataframe.dropna(inplace=True)
        dataframe.drop_duplicates(inplace=True)
        return dataframe
    return dataframe.dropna().drop_duplicates()

class DateMonthEnum(Enum):
    """Months of the year; each member's value is the Portuguese month name."""
    JAN = "Janeiro"
    FEB = "Fevereiro"
    MAR = "Março"
    APR = "Abril"
    MAY = "Maio"
    JUN = "Junho"
    JUL = "Julho"
    AUG = "Agosto"
    SEP = "Setembro"
    OCT = "Outubro"
    NOV = "Novembro"
    DEC = "Dezembro"

    @staticmethod
    def getMonths():
        """Return a view of all members, in declaration order (JAN..DEC).

        Declared as a static method so it can be called both on the class
        and on a member without raising a TypeError.
        """
        return DateMonthEnum.__members__.values()

# Maps each DateMonthEnum member to its calendar number (JAN -> 1 ... DEC -> 12).
# Iterating an Enum yields members in declaration order, which is calendar order here.
numericDateMonth = {
    month: position
    for position, month in enumerate(DateMonthEnum, start=1)
}
    
def altermonth(dataframe):
    """Replace month names in the ``month`` column with their numbers.

    The ``month`` column of the given frame is reassigned, so the caller's
    DataFrame is modified; the same frame is also returned for convenience.
    Values that are not a known month name are left unchanged.

    Args:
        dataframe: DataFrame with a ``month`` column holding the month names
            defined by ``DateMonthEnum``.

    Returns:
        The same DataFrame, with ``month`` converted.
    """
    month_numbers = {
        month.value: numericDateMonth[month]
        for month in DateMonthEnum.getMonths()
    }
    # Assign the whole column at once instead of writing through
    # ``dataframe.month[mask]``: that chained assignment may act on a
    # temporary copy and raises pandas' SettingWithCopyWarning.
    dataframe["month"] = dataframe["month"].map(
        lambda name: month_numbers.get(name, name)
    )
    return dataframe

   
def mergeDataBases(db1, db2):
    """Combine two databases into one.

    Placeholder: not implemented yet. Both arguments are ignored and
    None is returned.
    """
    return None

def selection_variables_train_test(x, y):
    """Split features and target into train and test subsets.

    Uses ``train_test_split`` with its default settings.

    Args:
        x: Feature matrix.
        y: Target values aligned with ``x``.

    Returns:
        Tuple ``(xTrain, xTest, yTrain, yTest)``.
    """
    # Split the argument ``x``; the previous code read a global ``X`` and
    # silently ignored whatever the caller passed in.
    return tuple(train_test_split(x, y))


def model_regression_linear(xTrain, yTrain):
    """Fit and return the regressor used for the "linear regression" step.

    NOTE(review): despite the function name, the model fitted here is a
    RandomForestRegressor with default parameters, not a LinearRegression.

    Args:
        xTrain: Training features.
        yTrain: Training target values.

    Returns:
        The fitted estimator.
    """
    regr_linear = RandomForestRegressor()
    regr_linear.fit(xTrain, yTrain)
    return regr_linear
   
def model_neural_network(xTrain, yTrain):
    """Fit an MLPRegressor with default parameters and return it.

    Args:
        xTrain: Training features.
        yTrain: Training target values.

    Returns:
        The fitted MLPRegressor.
    """
    # scikit-learn's fit() returns the estimator itself.
    return MLPRegressor().fit(xTrain, yTrain)

def model_Elastic_Net(xTrain, yTrain):
    """Fit an ElasticNet with default parameters and return it.

    Args:
        xTrain: Training features.
        yTrain: Training target values.

    Returns:
        The fitted ElasticNet.
    """
    # scikit-learn's fit() returns the estimator itself.
    return ElasticNet().fit(xTrain, yTrain)

def model_lasso(xTrain, yTrain):
    """Fit a Lasso with default parameters and return it.

    Args:
        xTrain: Training features.
        yTrain: Training target values.

    Returns:
        The fitted Lasso.
    """
    # scikit-learn's fit() returns the estimator itself.
    return Lasso().fit(xTrain, yTrain)

def fit_model(classifier, xTrain, yTrain):
    """Fit the given estimator on the training data and hand it back.

    Args:
        classifier: Any object exposing ``fit(x, y)``.
        xTrain: Training features.
        yTrain: Training target values.

    Returns:
        The very same ``classifier`` object (whatever ``fit`` returns is
        ignored).
    """
    estimator = classifier
    estimator.fit(xTrain, yTrain)
    return estimator
#fit_model(LinearRegression(),XTrain,yTrain)

def plot_graph(dataframe, Titulo: str = "Gráfico"):
    """Plot the frame's columns as a 10x6 chart with a title.

    Args:
        dataframe: Object exposing ``plot(figsize=...)`` (e.g. a DataFrame).
        Titulo: Chart title. Defaults to "Gráfico"; previously the string
            was written as an annotation, which made the argument mandatory.

    Returns:
        Whatever ``set_title`` returns for the created plot.
    """
    axes = dataframe.plot(figsize=(10, 6))
    return axes.set_title(Titulo, fontsize=15)

def cross_validation(x_axis, y_axis):
    """Compare several regressors with shuffled 10-fold cross-validation.

    Each model is scored with ``cross_val_score`` using its default scorer
    (labelled R^2 in the printed report). The mean score of every model and
    the name of the best one are printed; nothing is returned.

    Args:
        x_axis: Feature matrix.
        y_axis: Target values.
    """
    # No random_state is set, so the folds differ between runs.
    kfold = KFold(n_splits=10, shuffle=True)

    # Candidate models, in the order they are evaluated and reported.
    candidates = {
        "LinearRegression": LinearRegression(),
        "ElasticNet": ElasticNet(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "NeuralNetwork": MLPRegressor(),
    }

    # Mean cross-validated score per model name.
    dic_models = {
        label: cross_val_score(estimator, x_axis, y_axis, cv=kfold).mean()
        for label, estimator in candidates.items()
    }

    # The highest mean score wins.
    bestModel = max(dic_models, key=dic_models.get)

    print("Linear Regression Média (R^2): {0}\nElastic Net Média (R^2): {1}\nRidge Média (R^2): {2}\nLasso Média (R^2): {3}\nNeural (R^2): {4}".format(*dic_models.values()))
    print("O melhor modelo é: {0} com Valor: {1}".format(bestModel, dic_models[bestModel]))








In [3]:
# Fire records - Legal Amazon
forestFiresAM = cleanData(
    fetchCSV(
        url='https://drive.google.com/file/d/1vRVra-yLlZxknweY-6YIQQJjWCqK75mT/view?usp=sharing',
        source=DataSourceEnum.GOOGLE_DRIVE,
    )
)

# Fire records - Brazil (this file is latin-1 encoded)
forestFires = cleanData(
    fetchCSV(
        url='https://drive.google.com/file/d/1VdFxMxEWzUuqYLTfv2grmFD2tDRdOez-/view?usp=sharing',
        source=DataSourceEnum.GOOGLE_DRIVE,
        encoding='latin-1',
    )
)
forestFires



Unnamed: 0,year,state,month,number,date
0,1998,Acre,Janeiro,0.0,1998-01-01
1,1999,Acre,Janeiro,0.0,1999-01-01
2,2000,Acre,Janeiro,0.0,2000-01-01
3,2001,Acre,Janeiro,0.0,2001-01-01
4,2002,Acre,Janeiro,0.0,2002-01-01
...,...,...,...,...,...
6449,2012,Tocantins,Dezembro,128.0,2012-01-01
6450,2013,Tocantins,Dezembro,85.0,2013-01-01
6451,2014,Tocantins,Dezembro,223.0,2014-01-01
6452,2015,Tocantins,Dezembro,373.0,2015-01-01


In [4]:
# Convert the month names in the `month` column to their numbers (1-12)
forestFires = altermonth(forestFires)
forestFires

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,year,state,month,number,date
0,1998,Acre,1,0.0,1998-01-01
1,1999,Acre,1,0.0,1999-01-01
2,2000,Acre,1,0.0,2000-01-01
3,2001,Acre,1,0.0,2001-01-01
4,2002,Acre,1,0.0,2002-01-01
...,...,...,...,...,...
6449,2012,Tocantins,12,128.0,2012-01-01
6450,2013,Tocantins,12,85.0,2013-01-01
6451,2014,Tocantins,12,223.0,2014-01-01
6452,2015,Tocantins,12,373.0,2015-01-01


In [5]:
# One-hot encode every categorical column (currently only `state`):
# the original column is dropped and replaced by one indicator column per value.
vars_categoricas = ['state']

df2 = forestFires
for i in vars_categoricas:
    dados_estados_binarios = pd.get_dummies(forestFires[i])
    df2 = df2.drop(columns=i).join(dados_estados_binarios)

# Keep only the records from the year 2000 onwards.
df2 = df2.loc[df2['year'] >= 2000]

# Indicator columns produced for the states; shared by the column
# selection and by the grouping below.
state_columns = [
    'Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia', 'Ceara',
    'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
    'Mato Grosso', 'Minas Gerais', 'Paraiba', 'Pará', 'Pernambuco',
    'Piau', 'Rio', 'Rondonia', 'Roraima', 'Santa Catarina',
    'Sao Paulo', 'Sergipe', 'Tocantins',
]
columns_df2 = ['year', 'number'] + state_columns
df2 = df2[columns_df2]

# Sum `number` for each (year, state) combination.
df2 = df2.groupby(['year'] + state_columns).sum().reset_index()


In [6]:
# TEST: build the regressor using every available variable.

target = "number"

X = df2.drop(columns=[target])
y = df2[target]

xTrain, xTest, yTrain, yTest = selection_variables_train_test(X, y)
regr_linear = model_regression_linear(xTrain, yTrain)
yPred = regr_linear.predict(xTest)

# Relative error of each prediction, in percent of the observed value.
prediction_relative_error = [
    100 * abs(predicted - actual) / actual
    for predicted, actual in zip(yPred, yTest)
]

# Test features plus prediction, error and observed value, side by side.
df_regressao_linear = xTest.assign(
    prediction=yPred,
    relative_error=prediction_relative_error,
    **{'Focos de incêndio': yTest},
)
df_regressao_linear

Unnamed: 0,year,Acre,Alagoas,Amapa,Amazonas,Bahia,Ceara,Distrito Federal,Espirito Santo,Goias,...,Rio,Rondonia,Roraima,Santa Catarina,Sao Paulo,Sergipe,Tocantins,prediction,relative_error,Focos de incêndio
364,2015,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2029.07430,70.511305,1189.994
82,2003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1443.92246,50.026959,962.442
112,2004,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1369.71403,39.419990,2261.000
224,2009,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1433.69808,54.055067,930.640
194,2008,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2850.14000,68.747188,1689.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,2004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1302.20868,8.772552,1197.185
186,2008,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1702.75923,42.729189,1193.000
282,2012,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2382.02111,7.346603,2219.000
338,2014,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,158.03000,37.039841,251.000


In [None]:
# Score the candidate regressors with 10-fold cross-validation and print the best one
cross_validation(X,y)

In [None]:
# Order the results chronologically and use the year as the index (x axis of the plot).
df_regressao_linear = (
    df_regressao_linear
    .sort_values(by=['year'])
    .set_index(keys=['year'])
)

In [None]:
# Plot predicted vs. observed fire counts per year.
# NOTE(review): model_regression_linear fits a RandomForestRegressor, so the "Regressão Linear" title may be misleading - confirm.
plot_graph(df_regressao_linear[['prediction','Focos de incêndio']],"Predição vs Queimadas - Regressão Linear")

In [None]:
# Neural network: fresh train/test split, fit the MLP regressor and predict.
xTrain_neural, xTest_neural, yTrain_neural, yTest_neural = selection_variables_train_test(X, y)
# Use the neural-network builder here; this cell previously called
# model_regression_linear, so the "Rede Neural" results came from a
# different model.
classifier_RN = model_neural_network(xTrain_neural, yTrain_neural)

yPred_neural = classifier_RN.predict(xTest_neural)

In [None]:
# Relative error of each prediction, in percent of the observed value.
prediction_relative_error_neural = [
    100 * abs(predicted - actual) / actual
    for predicted, actual in zip(yPred_neural, yTest_neural)
]

# Test features plus prediction, error and observed value, side by side.
df_rede_neural = xTest_neural.assign(
    prediction=yPred_neural,
    relative_error=prediction_relative_error_neural,
    **{'Focos de incêndio': yTest_neural},
)
df_rede_neural

In [None]:
# Order the results chronologically and use the year as the index (x axis of the plot).
df_rede_neural = (
    df_rede_neural
    .sort_values(by=['year'])
    .set_index(keys=['year'])
)

In [None]:
# Plot predicted vs. observed fire counts per year for the neural-network run
plot_graph(df_rede_neural[['prediction','Focos de incêndio']],"Predição vs Queimadas - Rede Neural")

In [None]:
## Elastic Net: fresh train/test split, fit the model, predict on the test set
xTrain_elastic, xTest_elastic, yTrain_elastic, yTest_elastic =  selection_variables_train_test(X,y)
classifier_EN = model_Elastic_Net(xTrain_elastic,yTrain_elastic)
yPred_elastic = classifier_EN.predict(xTest_elastic)

In [None]:
# Relative error of each prediction, in percent of the observed value.
prediction_relative_error_elastic = [
    100 * abs(predicted - actual) / actual
    for predicted, actual in zip(yPred_elastic, yTest_elastic)
]

# Test features plus prediction, error and observed value, side by side.
df_en = xTest_elastic.assign(
    prediction=yPred_elastic,
    relative_error=prediction_relative_error_elastic,
    **{'Focos de incêndio': yTest_elastic},
)
df_en

In [None]:
# Order the results chronologically and use the year as the index (x axis of the plot).
df_en = (
    df_en
    .sort_values(by=['year'])
    .set_index(keys=['year'])
)

In [None]:
# Plot predicted vs. observed fire counts per year for the Elastic Net run
plot_graph(df_en[['prediction','Focos de incêndio']],"Predição vs Queimadas - Elastic Net")

In [None]:
# Lasso: fresh train/test split, fit the model, predict on the test set
xTrain_lasso, xTest_lasso, yTrain_lasso, yTest_lasso =  selection_variables_train_test(X,y)
classifier_lasso = model_lasso(xTrain_lasso,yTrain_lasso)
yPred_lasso = classifier_lasso.predict(xTest_lasso)

In [None]:
# Relative error of each prediction, in percent of the observed value.
prediction_relative_error_lasso = [
    100 * abs(predicted - actual) / actual
    for predicted, actual in zip(yPred_lasso, yTest_lasso)
]

# Test features plus prediction, error and observed value, side by side.
df_lasso = xTest_lasso.assign(
    prediction=yPred_lasso,
    relative_error=prediction_relative_error_lasso,
    **{'Focos de incêndio': yTest_lasso},
)
df_lasso

In [None]:
# Order the results chronologically and use the year as the index (x axis of the plot).
df_lasso = (
    df_lasso
    .sort_values(by=['year'])
    .set_index(keys=['year'])
)

In [None]:
# Plot predicted vs. observed fire counts per year for the Lasso run
plot_graph(df_lasso[['prediction','Focos de incêndio']],"Predição vs Queimadas - Lasso")