In [1]:
import pandas as pd
#amazon fire dataset
url='https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv'
df= pd.read_csv(url , encoding= "ISO-8859-1")

In [3]:
new_columns = {'ano': 'year',
                  'estado': 'state',
                  'mes': 'month',
                  'numero': 'no_of_fires',
                  'encontro': 'date'}

df.rename(columns = new_columns, inplace= True)
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


# 1. removing unncessary text from columns 

In [4]:
#to replace column with cleaned column

df['no_of_fires'].str.strip("Fires") #str.strip() method is used to remove spaces from both left and right side of the string.
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [6]:
#we  need to no_of_fires to float
#alternative way for strig manipulation

#df["no_of_fires"].str.replace('','0').astype(float)


# 2. handling missing data

In [7]:
#reloading the data frame

import pandas as pd
url='https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv'
df= pd.read_csv(url , encoding= "ISO-8859-1")

new_columns = {'ano': 'year',
                  'estado': 'state',
                  'mes': 'month',
                  'numero': 'no_of_fires',
                  'encontro': 'date'}

df.rename(columns = new_columns, inplace= True)

df['no_of_fires'] = df['no_of_fires'].str.strip('Fires')

#creating a true copy of our dataframe

df_copy = df.copy()
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,Janeiro,Acre,0,1/1/1998
1,1999,Janeiro,Acre,0,1/1/1999
2,2000,Janeiro,Acre,0,1/1/2000
3,2001,Janeiro,Acre,0,1/1/2001
4,2002,Janeiro,Acre,0,1/1/2002



# What to do with missing data?
1. remove them via .dropna(axis=0)
2. replace them with an arbitrary number(for e.g. avg)
3. replace with zeros, or forward fill/back fill

In [8]:
#using fillna with zeros

df['no_of_fires'].fillna(0).head()

0    0 
1    0 
2    0 
3    0 
4    0 
Name: no_of_fires, dtype: object

In [11]:
#backfill

df['no_of_fires'] = df['no_of_fires'].fillna(method='backfill').head(70)


In [12]:
#forward fill

df['no_of_fires'] = df['no_of_fires'].fillna(method='ffill').head(70)

# 3. assigning data types to our columns

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
year           6454 non-null int64
month          6454 non-null object
state          6454 non-null object
no_of_fires    70 non-null object
date           6454 non-null object
dtypes: int64(1), object(4)
memory usage: 252.2+ KB


In [16]:
df['no_of_fires'] = df['no_of_fires'].str.replace('','0').astype(int)

ValueError: invalid literal for int() with base 10: '000 0'

In [19]:
#replacing text in columns 

df['month'].unique()

array(['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
       'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'],
      dtype=object)

In [20]:
month_translations = {  'Janeiro': 'January'
                      , 'Fevereiro':'February'
                      , 'Março':"March"
                      , 'Abril':'April'
                      , 'Maio':'May'
                      , 'Junho':'June'
                      , 'Julho':'July'
                      ,'Agosto':'August'
                      ,'Setembro':'September'
                      ,'Outubro':'October'
                      ,'Novembro':'November'
                      ,'Dezembro':'December'
    
}

df['month'] = df['month'].map(month_translations)
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,January,Acre,0,1/1/1998
1,1999,January,Acre,0,1/1/1999
2,2000,January,Acre,0,1/1/2000
3,2001,January,Acre,0,1/1/2001
4,2002,January,Acre,0,1/1/2002


In [22]:
#further string operations on columns

df['state']= df['state'].str.title()
df['state'].unique()

array(['Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia', 'Ceara',
       'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
       'Mato Grosso', 'Minas Gerais', 'Pará', 'Paraiba', 'Pernambuco',
       'Piau', 'Rio', 'Rondonia', 'Roraima', 'Santa Catarina',
       'Sao Paulo', 'Sergipe', 'Tocantins'], dtype=object)

In [24]:
#removing columns

df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,January,Acre,0,1/1/1998
1,1999,January,Acre,0,1/1/1999
2,2000,January,Acre,0,1/1/2000
3,2001,January,Acre,0,1/1/2001
4,2002,January,Acre,0,1/1/2002


In [25]:
#dropping multiple columns

df = df.drop('date',axis=1)

df.head()

Unnamed: 0,year,month,state,no_of_fires
0,1998,January,Acre,0
1,1999,January,Acre,0
2,2000,January,Acre,0
3,2001,January,Acre,0
4,2002,January,Acre,0


In [32]:
df = df.drop(['state','month'],axis=1)

In [33]:
df.head()

Unnamed: 0,year,no_of_fires
0,1998,0
1,1999,0
2,2000,0
3,2001,0
4,2002,0


# 4. droppig rows

In [34]:
import pandas as pd
#amazon fire dataset
url='https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/amazon_fires.csv'
df= pd.read_csv(url , encoding= "ISO-8859-1")

In [35]:
new_columns = {'ano': 'year',
                  'estado': 'state',
                  'mes': 'month',
                  'numero': 'no_of_fires',
                  'encontro': 'date'}

df.rename(columns = new_columns, inplace= True)
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
0,1998,Janeiro,Acre,0 Fires,1/1/1998
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002


In [36]:
#dropping the 1st row

df = df.drop(df.index[0])

In [37]:
df.head()

Unnamed: 0,year,month,state,no_of_fires,date
1,1999,Janeiro,Acre,0 Fires,1/1/1999
2,2000,Janeiro,Acre,0 Fires,1/1/2000
3,2001,Janeiro,Acre,0 Fires,1/1/2001
4,2002,Janeiro,Acre,0 Fires,1/1/2002
5,2003,Janeiro,Acre,10 Fires,1/1/2003


In [41]:
#droppping multiple rows

df = df.drop(df.index[[2,10]])
df= df.reset_index()
df.head(10)

Unnamed: 0,index,year,month,state,no_of_fires,date
0,1,1999,Janeiro,Acre,0 Fires,1/1/1999
1,2,2000,Janeiro,Acre,0 Fires,1/1/2000
2,6,2004,Janeiro,Acre,0 Fires,1/1/2004
3,7,2005,Janeiro,Acre,12 Fires,1/1/2005
4,8,2006,Janeiro,Acre,4 Fires,1/1/2006
5,9,2007,Janeiro,Acre,0 Fires,1/1/2007
6,10,2008,Janeiro,Acre,0 Fires,1/1/2008
7,12,2010,Janeiro,Acre,1 Fires,1/1/2010
8,14,2012,Janeiro,Acre,0 Fires,1/1/2012
9,16,2014,Janeiro,Acre,0 Fires,1/1/2014


In [48]:
#droppinng range of rows

df = df.drop(df.index[2:10], axis=0)
df.head(10)

Unnamed: 0,level_0,index,year,month,state,no_of_fires,date
0,0,1,1999,Janeiro,Acre,0 Fires,1/1/1999
1,1,2,2000,Janeiro,Acre,0 Fires,1/1/2000
28,30,37,2015,Fevereiro,Acre,2 Fires,1/1/2015
29,31,38,2016,Fevereiro,Acre,5 Fires,1/1/2016
30,32,39,2017,Fevereiro,Acre,1 Fires,1/1/2017
31,33,40,1998,Março,Acre,0 Fires,1/1/1998
32,34,41,1999,Março,Acre,0 Fires,1/1/1999
33,35,42,2000,Março,Acre,11 Fires,1/1/2000
34,36,43,2001,Março,Acre,0 Fires,1/1/2001
35,37,44,2002,Março,Acre,0 Fires,1/1/2002
