In [1]:
from IPython.core.display import HTML
css_file = "style.css"
HTML(open(css_file, 'r').read())

In [92]:
#first let's import all necessery libraries for this analysis
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [106]:
#using pandas library and 'read_csv' function to read amazon csv file as file already formated for us from Kaggle
amazon_df=pd.read_csv('./amazon.csv', encoding='latin1')

#examining head of the dataset
amazon_df.head(10)

Unnamed: 0,year,state,month,number,date
0,1998,Acre,Janeiro,0.0,1998-01-01
1,1999,Acre,Janeiro,0.0,1999-01-01
2,2000,Acre,Janeiro,0.0,2000-01-01
3,2001,Acre,Janeiro,0.0,2001-01-01
4,2002,Acre,Janeiro,0.0,2002-01-01
5,2003,Acre,Janeiro,10.0,2003-01-01
6,2004,Acre,Janeiro,0.0,2004-01-01
7,2005,Acre,Janeiro,12.0,2005-01-01
8,2006,Acre,Janeiro,4.0,2006-01-01
9,2007,Acre,Janeiro,0.0,2007-01-01


In [107]:
amazon.dtypes

year        int64
state      object
month      object
number    float64
date       object
dtype: object

# Understanding Data
Going over features presented in the dataset for analysis. Explaining the process and the results

In [108]:
#checking the length of the dataset
len(amazon_df)

6454

In [97]:
#checking if there are any nulls we are dealing with (missing data)
amazon_df.isna().sum()

year      0
state     0
month     0
number    0
date      0
dtype: int64

In [109]:
amazon_df.isnull().sum()

#cheking unique values in the state column
amazon_df.state.unique()

array(['Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia', 'Ceara',
       'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
       'Mato Grosso', 'Minas Gerais', 'Pará', 'Paraiba', 'Pernambuco',
       'Piau', 'Rio', 'Rondonia', 'Roraima', 'Santa Catarina',
       'Sao Paulo', 'Sergipe', 'Tocantins'], dtype=object)

In [113]:
#checking unique values in the month column
amazon_df.month.unique()

array(['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
       'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'],
      dtype=object)

In [114]:
month_portugess = list(amazon_df.month.unique())
month_english = ["January", "Feburary", "March", "April", "May", "June", "July", "August",
                "September", "October", "November", "December"]
dict_month = dict(zip(month_portugess, month_english))
dict_month

{'Janeiro': 'January',
 'Fevereiro': 'Feburary',
 'Março': 'March',
 'Abril': 'April',
 'Maio': 'May',
 'Junho': 'June',
 'Julho': 'July',
 'Agosto': 'August',
 'Setembro': 'September',
 'Outubro': 'October',
 'Novembro': 'November',
 'Dezembro': 'December'}

In [115]:
amazon_df.month = amazon_df.month.map(dict_month)
amazon_df.month.unique()

array(['January', 'Feburary', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December'],
      dtype=object)

In [116]:
#cheking the numeric percentile distribution for the fires reported

amazon_df.number.describe()

count    6454.000000
mean      108.293163
std       190.812242
min         0.000000
25%         3.000000
50%        24.000000
75%       113.000000
max       998.000000
Name: number, dtype: float64

In [117]:
#chekcing how many fires were reported in 20 years

amazon_df.number.sum()

698924.073

In [118]:
#we are already given the year column, however for good practice we can also extract it from the date one
amazon_df['Year']=pd.DatetimeIndex(amazon_df['date']).year
#cheking unique years in new created column 
amazon_df.Year.unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], dtype=int64)

# Exploring and Visualizing Data
Exploring the data by analyzing its statistics and visualizing the values of features and correlations between different features. Explaining the process and the results

In [119]:
amazon_df

Unnamed: 0,year,state,month,number,date,Year
0,1998,Acre,January,0.0,1998-01-01,1998
1,1999,Acre,January,0.0,1999-01-01,1999
2,2000,Acre,January,0.0,2000-01-01,2000
3,2001,Acre,January,0.0,2001-01-01,2001
4,2002,Acre,January,0.0,2002-01-01,2002
...,...,...,...,...,...,...
6449,2012,Tocantins,December,128.0,2012-01-01,2012
6450,2013,Tocantins,December,85.0,2013-01-01,2013
6451,2014,Tocantins,December,223.0,2014-01-01,2014
6452,2015,Tocantins,December,373.0,2015-01-01,2015


In [120]:
#we are not going to be using old year column and date column as they serve no significant purpose anymore 
amazon_df.drop(columns=['date', 'year'], axis=1, inplace=True)

#changing order of columns for preffered format
amazon_df=amazon_df[['state','number','month','Year']]

#changing names of columns for preffered format
amazon_df.rename(columns={'state': 'State', 'number': 'Fire_Number', 'month': 'Month'}, inplace=True)

#checking changes made
amazon_df.head()

Unnamed: 0,State,Fire_Number,Month,Year
0,Acre,0.0,January,1998
1,Acre,0.0,January,1999
2,Acre,0.0,January,2000
3,Acre,0.0,January,2001
4,Acre,0.0,January,2002


First, it will be interesting to look at the trend of fires beings reported over 20 years. 

In [127]:
# creating an empty list, which will be populated later with amount of fires reported

time_plot_1_df = amazon_df.groupby("Year").sum()["Fire_Number"].round(0).reset_index()

In [128]:
time_plot_1_df.head(5)

Unnamed: 0,Year,Fire_Number
0,1998,20014.0
1,1999,26883.0
2,2000,27351.0
3,2001,29072.0
4,2002,37391.0


In [131]:
#using plotly Scatter 
time_plot_1=go.Figure(go.Scatter(x = time_plot_1_df.Year, y = time_plot_1_df.Fire_Number,
                                 mode='lines + markers', line={'color': 'red'}))

#layout changes
time_plot_1.update_layout(title='Brazil Fires per 1998-2017 Years',
                   xaxis_title='Year',
                   yaxis_title='Fires')

#showing the figure
time_plot_1.show()

In [132]:
amazon_df

Unnamed: 0,State,Fire_Number,Month,Year
0,Acre,0.0,January,1998
1,Acre,0.0,January,1999
2,Acre,0.0,January,2000
3,Acre,0.0,January,2001
4,Acre,0.0,January,2002
...,...,...,...,...
6449,Tocantins,128.0,December,2012
6450,Tocantins,85.0,December,2013
6451,Tocantins,223.0,December,2014
6452,Tocantins,373.0,December,2015


In [153]:
time_plot_2_df = pd.pivot_table(amazon_df, values = "Fire_Number", index = "Year", columns = "State", aggfunc = np.sum)

In [155]:
time_plot_2_df = pd.pivot_table(amazon_df, values = "Fire_Number", index = "Year", columns = "State", aggfunc = np.sum)
time_plot_2_df = time_plot_2_df.round(0)
time_plot_2_df.head(5)
time_plot_2_df = time_plot_2_df.reset_index().reset_index(drop=True)
time_plot_2_df


State,Year,Acre,Alagoas,Amapa,Amazonas,Bahia,Ceara,Distrito Federal,Espirito Santo,Goias,...,Pará,Pernambuco,Piau,Rio,Rondonia,Roraima,Santa Catarina,Sao Paulo,Sergipe,Tocantins
0,1998,730.0,86.0,278.0,946.0,1225.0,1612.0,103.0,218.0,750.0,...,716.0,767.0,1494.0,1149.0,916.0,21.0,59.0,3196.0,20.0,913.0
1,1999,333.0,172.0,101.0,1061.0,1198.0,1688.0,46.0,240.0,1910.0,...,2192.0,463.0,2015.0,1030.0,209.0,220.0,364.0,4249.0,93.0,608.0
2,2000,434.0,123.0,253.0,853.0,1379.0,2211.0,48.0,175.0,2516.0,...,794.0,691.0,1112.0,650.0,868.0,362.0,306.0,4128.0,17.0,994.0
3,2001,828.0,86.0,1301.0,1297.0,2428.0,1848.0,64.0,130.0,2066.0,...,835.0,1080.0,731.0,857.0,1004.0,1309.0,200.0,2926.0,24.0,2039.0
4,2002,1544.0,258.0,862.0,2852.0,2281.0,454.0,149.0,297.0,1619.0,...,1266.0,1909.0,1504.0,2945.0,1452.0,2224.0,1715.0,3539.0,208.0,1919.0
5,2003,947.0,299.0,1652.0,1524.0,3076.0,796.0,96.0,708.0,2079.0,...,1073.0,2431.0,1964.0,3438.0,1465.0,1789.0,1609.0,3306.0,403.0,1582.0
6,2004,1184.0,159.0,2261.0,2298.0,1644.0,383.0,279.0,174.0,1799.0,...,1075.0,1683.0,1778.0,3437.0,920.0,1457.0,2955.0,2864.0,190.0,1003.0
7,2005,984.0,217.0,1271.0,1657.0,2358.0,680.0,92.0,121.0,1709.0,...,501.0,1520.0,2211.0,2297.0,1678.0,860.0,1483.0,2123.0,200.0,1398.0
8,2006,1221.0,161.0,817.0,998.0,2509.0,1109.0,76.0,215.0,1294.0,...,1183.0,1268.0,2358.0,2501.0,680.0,1245.0,1855.0,2209.0,147.0,1002.0
9,2007,1213.0,128.0,440.0,590.0,2210.0,1712.0,274.0,382.0,888.0,...,1833.0,1631.0,2714.0,2067.0,396.0,1863.0,1230.0,1877.0,124.0,2160.0


In [137]:
#examining top 10 states with the most fires reported (please igone the year observation, will be removed later)
time_plot_2_df.sum().nlargest(11)

State
Mato Grosso     96245.0
Paraiba         52437.0
Sao Paulo       51120.0
Rio             45160.0
Bahia           44747.0
Piau            37805.0
Goias           37694.0
Minas Gerais    37475.0
Tocantins       33707.0
Amazonas        30652.0
Ceara           30429.0
dtype: float64

Now, we know which states (top 10) are generating the most fire reports. Let's visualize those numbers to get even a better understanding!

In [138]:
#creating a dataframe for bar plot visualization
bar_plot_df = pd.DataFrame(time_plot_2_df.sum().nlargest(11))
#reseting index for first column
bar_plot_df=bar_plot_df.reset_index()

In [139]:
bar_plot_df

Unnamed: 0,State,0
0,Mato Grosso,96245.0
1,Paraiba,52437.0
2,Sao Paulo,51120.0
3,Rio,45160.0
4,Bahia,44747.0
5,Piau,37805.0
6,Goias,37694.0
7,Minas Gerais,37475.0
8,Tocantins,33707.0
9,Amazonas,30652.0


In [140]:
#renaming
bar_plot_df.rename(columns={'index':'State', 0:'Reported_Fires'}, inplace=True)

In [141]:
bar_plot_df

Unnamed: 0,State,Reported_Fires
0,Mato Grosso,96245.0
1,Paraiba,52437.0
2,Sao Paulo,51120.0
3,Rio,45160.0
4,Bahia,44747.0
5,Piau,37805.0
6,Goias,37694.0
7,Minas Gerais,37475.0
8,Tocantins,33707.0
9,Amazonas,30652.0


In [142]:
#removing Year observation
bar_plot_df.drop(bar_plot_df[bar_plot_df.State == 'Year'].index, inplace=True)
#checking dataframe
bar_plot_df

Unnamed: 0,State,Reported_Fires
0,Mato Grosso,96245.0
1,Paraiba,52437.0
2,Sao Paulo,51120.0
3,Rio,45160.0
4,Bahia,44747.0
5,Piau,37805.0
6,Goias,37694.0
7,Minas Gerais,37475.0
8,Tocantins,33707.0
9,Amazonas,30652.0


In [143]:
#making barplot
bar_plot=px.bar(bar_plot_df, x='State', y='Reported_Fires', color='Reported_Fires',
           labels={'Reported_Fires':'Count of reported fires ', 'State':'States'}, color_continuous_scale='Reds')
#making layout changes
bar_plot.update_layout(xaxis_tickangle=-45, title_text='Top 10 States for Amount of Reported Fires per 1998-2017 Years')
#outputing plot
bar_plot.show()

Take a moment to hover over the graph to explore the dynamic features of Plotly.

In [156]:
#preparing a figure that will be populated 
time_plot_2 = go.Figure()
#adding individual graphs to the figure
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Mato Grosso'],
                                 mode='lines+markers', name='Mato Grosso', line={'color': 'red'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Paraiba'],
                                 mode='lines+markers', name='Paraiba', line={'color': 'yellow'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Sao Paulo'],
                                 mode='lines+markers', name='Sao Paulo', line={'color': 'green'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Rio'],
                                 mode='lines+markers', name='Rio', line={'color': 'blue'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Bahia'],
                                 mode='lines+markers', name='Bahia', line={'color': 'pink'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Piau'],
                                 mode='lines+markers', name='Piau', line={'color': 'brown'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Goias'],
                                 mode='lines+markers', name='Goias', line={'color': 'grey'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Minas Gerais'],
                                 mode='lines+markers', name='Minas Gerais', line={'color': 'purple'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Tocantins'],
                                 mode='lines+markers', name='Tocantins', line={'color': 'orange'}))
time_plot_2.add_trace(go.Scatter(x=time_plot_2_df.Year, y=time_plot_2_df['Amazonas'],
                                 mode='lines+markers', name='Amazonas', line={'color': 'gold'}))
#making changes to layout
time_plot_2.update_layout(title='Brazil Fires in Top-10 (frequent) regions per 1998-2017 Years',
                   xaxis_title='Year',
                   yaxis_title='Fires')
#outputing plot
time_plot_2.show()

In [157]:
#creating subdataframe for visualizing this states geographically
geo_plot_df=pd.DataFrame(time_plot_2_df.sum().nlargest(11))
#formatting new dataframe
geo_plot_df.rename(columns={0:'Count'}, inplace=True)
geo_plot_df.reset_index(inplace=True)
geo_plot_df.rename(columns={'index':'State'}, inplace=True)
geo_plot_df

Unnamed: 0,State,Count
0,Mato Grosso,96245.0
1,Paraiba,52437.0
2,Sao Paulo,51120.0
3,Rio,45160.0
4,Bahia,44747.0
5,Year,40150.0
6,Piau,37805.0
7,Goias,37694.0
8,Minas Gerais,37475.0
9,Tocantins,33707.0


In [158]:
geo_plot_df.drop(geo_plot_df.index[5], inplace=True)
#cheking new sub dataframe 
geo_plot_df

Unnamed: 0,State,Count
0,Mato Grosso,96245.0
1,Paraiba,52437.0
2,Sao Paulo,51120.0
3,Rio,45160.0
4,Bahia,44747.0
6,Piau,37805.0
7,Goias,37694.0
8,Minas Gerais,37475.0
9,Tocantins,33707.0
10,Amazonas,30652.0


In [159]:
#taking my time and adding all coordinates (latitude and longitude) for this top 10 states
lat=[-16.350000, -22.15847, -23.533773, -22.908333, -11.409874, -21.5089, -16.328547,
     -19.841644, -21.175, -3.416843]
long=[-56.666668, -43.29321, -46.625290, -43.196388, -41.280857, -43.3228, -48.953403,
     -43.986511, -43.01778, -65.856064]
#adding new coordinates as columns to subdataframe above
geo_plot_df['Lat']=lat
geo_plot_df['Long']=long
#checking changes in subdataframe for geo visualization
geo_plot_df

Unnamed: 0,State,Count,Lat,Long
0,Mato Grosso,96245.0,-16.35,-56.666668
1,Paraiba,52437.0,-22.15847,-43.29321
2,Sao Paulo,51120.0,-23.533773,-46.62529
3,Rio,45160.0,-22.908333,-43.196388
4,Bahia,44747.0,-11.409874,-41.280857
6,Piau,37805.0,-21.5089,-43.3228
7,Goias,37694.0,-16.328547,-48.953403
8,Minas Gerais,37475.0,-19.841644,-43.986511
9,Tocantins,33707.0,-21.175,-43.01778
10,Amazonas,30652.0,-3.416843,-65.856064


In [160]:
#using scatter geo with above created subdataframe
fig = px.scatter_geo(data_frame=geo_plot_df, scope='south america',lat='Lat',lon='Long',
                     size='Count', color='State', projection='hammer')
fig.update_layout(
        title_text = '1998-2017 Top-10 States in Brazil with reported fires')
fig.show()

In [161]:
#according to different sources, months from June - November are the hottes in Brazil

#isolating the hottest months by season
month_array_summer=['June','July','August']
month_array_fall=['September','October','November']
#leaving data only for hottest months
box_plot_df_summer=amazon_df.loc[amazon_df['Month'].isin(month_array_summer)]
box_plot_df_fall=amazon_df.loc[amazon_df['Month'].isin(month_array_fall)]
#visualizing reports
box_plot=go.Figure()

box_plot.add_trace(go.Box(y=box_plot_df_summer.Fire_Number, x=box_plot_df_summer.Month,
                          name='Summer', marker_color='#3D9970',
                          boxpoints='all', jitter=0.5, whiskerwidth=0.2,
                          marker_size=2,line_width=2))
box_plot.add_trace(go.Box(y=box_plot_df_fall.Fire_Number, x=box_plot_df_fall.Month,
                         name='Fall', marker_color='#FF851B',
                         boxpoints='all', jitter=0.5, whiskerwidth=0.2,
                          marker_size=2,line_width=2))

box_plot.update_layout(
        title_text = 'Distribution of Fire Reports from 1998-2017 in the hottest months')
box_plot.show()