In [1]:
import pandas as pd
import my_functions

## Importing data

In [3]:
path = './data'  # Relative path handed to the CSV import helper

# Maps the original column headers to their English names
column_dic = dict(
    DataHora='Date',
    Satelite='Satellite',
    Pais='Country',
    Estado='State',
    Municipi='County',
    Bioma='Biome',
    DiaSemCh='Days_without_rain',
    Precipit='Precipitation',
    RiscoFog='Fire_risk',
    AreaIndu='Industrial_area',
)

# Load the data with a helper defined in the local my_functions module.
# NOTE(review): col_dtypes_dic presumably forces 'Precipit' to be read as
# text, which would explain Precipitation's object dtype - confirm in helper.
result = my_functions.import_multiple_csv(
    path,
    '/Focos',
    column_sep=';',
    col_name_dic=column_dic,
    col_dtypes_dic=dict(Precipit=str),
)

# Exploring the data
These are the methods I usually use to explore a new dataset:
    1. df.shape 
    2. df.head() or df.sample(5) 
    3. df.dtypes or df.info()
    4. df.isnull().sum()
    5. df.describe()

In [6]:
result.shape  # (number of rows, number of columns) of the loaded DataFrame

(9022220, 13)

In [7]:
result.head(n=5)  # Preview the first 5 rows

Unnamed: 0,Date,Satellite,Country,State,County,Biome,Days_without_rain,Precipitation,Fire_risk,Latitude,Longitud,Industrial_area,FRP
0,2007/01/01 02:51:00,TERRA_M-M,Brasil,RORAIMA,CANTA,Amazonia,,,,2.291,-60.247,,
1,2007/01/01 02:51:00,TERRA_M-M,Brasil,RORAIMA,CARACARAI,Amazonia,,,,1.937,-61.062,,
2,2007/01/01 04:09:00,AQUA_M-M,Brasil,MARANHAO,CANDIDO MENDES,Amazonia,,,,-1.387,-45.614,,
3,2007/01/01 13:34:00,TERRA_M-T,Brasil,PARA,AUGUSTO CORREA,Amazonia,,,,-1.266,-46.501,,
4,2007/01/01 13:34:00,TERRA_M-T,Brasil,MARANHAO,CANDIDO MENDES,Amazonia,,,,-1.404,-45.637,,


In [8]:
result.dtypes  # Per-column data types; compare against what each column should hold

Date                  object
Satellite             object
Country               object
State                 object
County                object
Biome                 object
Days_without_rain    float64
Precipitation         object
Fire_risk            float64
Latitude             float64
Longitud             float64
Industrial_area      float64
FRP                  float64
dtype: object

In [9]:
result.isna().sum()  # Count the missing values in each column

Date                       0
Satellite                  0
Country                    0
State                      0
County                     0
Biome                      0
Days_without_rain    5961261
Precipitation        3589306
Fire_risk            3707901
Latitude                   0
Longitud                   0
Industrial_area      9022220
FRP                  8087233
dtype: int64

In [10]:
# Summary (count / unique / top / freq) of the object-dtype columns
result.select_dtypes(include='object').describe()

Unnamed: 0,Date,Satellite,Country,State,County,Biome,Precipitation
count,9022220,9022220,9022220,9022220,9022220,9022220,5432914
unique,562618,28,1,9,547,1,5382
top,2019/08/14 17:18:00,NPP-375,Brasil,PARA,SAO FELIX DO XINGU,Amazonia,0
freq,8273,4324187,9022220,3540028,336070,9022220,4301659


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the float64 columns
result.select_dtypes(include='float64').describe()

Unnamed: 0,Days_without_rain,Fire_risk,Latitude,Longitud,Industrial_area,FRP
count,3060959.0,5314319.0,9022220.0,9022220.0,0.0,934987.0
mean,5.372951,0.6822999,-6.779385,-55.5544,,20.689653
std,11.85245,0.3772227,4.155135,6.253647,,62.18685
min,0.0,0.0,-16.29,-73.93146,,0.0
25%,0.0,0.35,-9.86667,-60.40891,,3.2
50%,0.0,0.89,-7.4,-54.857,,7.6
75%,5.0,1.0,-3.57,-50.642,,18.1
max,118.0,1.0,5.23,-43.64537,,5270.4


## Problems found:
The `Date` column is stored as `object` (plain strings) instead of a datetime dtype.

In [13]:
# Convert the Date column from 'YYYY/MM/DD HH:MM:SS' text to datetime values
result['Date'] = pd.to_datetime(
    result['Date'],
    format='%Y/%m/%d %H:%M:%S',
)

## Creating Auxiliary columns

In [14]:
# Derive integer Month (1-12) and Year columns from the datetime Date column.
# The .dt accessors read the components directly, avoiding the slow round
# trip of formatting every timestamp as text and parsing it back to int.
# .astype(int) keeps the same integer dtype the string-based version produced.
result['Month'] = result['Date'].dt.month.astype(int)
result['Year'] = result['Date'].dt.year.astype(int)

## Counting observations

In [15]:
# Report how many distinct states, counties and satellites appear in the data.
# The label text is kept exactly as it was so the printed output is unchanged.
print(
    f'Number of states {result["State"].nunique()}',
    f'Number of counties {result["County"].nunique()}',
    f'Number of sattelites {result["Satellite"].nunique()}',
    sep='\n',
)

Number of states 9
Number of counties 547
Number of sattelites 28


In [17]:
# Number of observations recorded by each satellite, most frequent first
result.Satellite.value_counts()

NPP-375      4324187
AQUA_M-T     1209183
GOES-16       743387
TERRA_M-T     401606
GOES-13       330726
TERRA_M-M     316313
GOES-12       284286
NOAA-15       274926
NOAA-18       261378
GOES-10       152552
AQUA_M-M      142343
NOAA-19       134193
METOP-B        80291
NOAA-16        59856
MSG-03         57998
NOAA-19D       49494
NOAA-18D       45282
TRMM           44189
MSG-02         43496
ATSR           17345
NOAA-17        16202
NOAA-12        11991
NOAA-16N       10045
NOAA-15D        7049
NOAA-20         3006
NOAA-20D         382
NOAA-12D         296
NOAA-14          218
Name: Satellite, dtype: int64

In [18]:
# Number of observations falling in each calendar month, most frequent first
result.Month.value_counts()

9     2491734
8     2154649
10    1473295
11    1047277
12     572259
7      460571
3      200431
6      197719
1      163112
2       90450
4       87751
5       82972
Name: Month, dtype: int64

In [19]:
# Number of observations recorded in each year, most frequent first
result.Year.value_counts()

2017    1434113
2015    1257284
2012     921492
2018     917148
2016     892350
2014     770698
2019     710145
2007     548336
2013     485744
2010     427791
2008     286255
2009     207717
2011     163147
Name: Year, dtype: int64