In [1]:
import pandas as pd
import numpy as np
import re
from zipfile import ZipFile

In [2]:
#Fix Numbers
def fix_numbers(x, min_valid=-100):
    """Parse a decimal-comma number and blank out implausible values.

    Parameters
    ----------
    x : any
        Raw cell value. It is converted with ``str`` and any ',' is
        replaced by '.' before parsing.
    min_valid : float, default -100
        Parsed values strictly below this threshold are treated as missing.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the text is not a number or
        the number is below ``min_valid``.
    """
    text = str(x).replace(',', '.')
    try:
        value = float(text)
    except ValueError:
        # Only unparsable text counts as missing; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return np.nan
    if value < min_valid:
        return np.nan
    return value

def sum2(x):
    """Sum a Series, returning NaN when it holds no non-null value.

    A plain ``sum`` yields 0 for an all-null (or empty) Series, which would
    be indistinguishable from a genuine total of zero. ``min_count=1`` makes
    pandas return NaN unless at least one valid value is present; nulls mixed
    with valid values are skipped as usual.

    Parameters
    ----------
    x : pandas.Series
        Values to add up.

    Returns
    -------
    scalar
        The sum of the non-null values, or NaN if there are none.
    """
    return x.sum(min_count=1)
    
#Fix date
def fix_date(x):
    """Return ``x`` as a string with every '/' replaced by '-'."""
    return str(x).replace('/','-')

In [9]:
# Map the INMET CSV headers onto short column names. Date and hour each
# appear under two different header spellings, so both are listed.
# NOTE(review): the temperature keys are the "TEMPERATURA ORVALHO" columns
# ("orvalho" is Portuguese for dew), yet they are renamed temp_max/temp_min -
# confirm that this is really the intended source column.
Column_Names={'DATA (YYYY-MM-DD)':'date',
              'Data':'date',
              'HORA (UTC)':'hour',
              'Hora UTC':'hour',
              'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)':'rain_mm',
              'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)':'temp_max',
              'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)':'temp_min'}

years=range(2000,2021)

# Raw string so that \w and \d reach the regex engine as regex escapes instead
# of being treated (and warned about) as string escapes.
# Group 1 is stored as the state; groups 2 and 3 together form the station label.
station_pattern=re.compile(r'CO_(\w\w)_([ACFS]\d+)_(.*?)_')
kept_columns=['state','station','date','hour','rain_mm','temp_max','temp_min']
value_columns=['rain_mm','temp_max','temp_min']

# Collect one daily frame per station file and concatenate once at the end;
# concatenating inside the loop re-copies every accumulated row each time.
daily_frames=[]
for year in years:
    print(year)
    with ZipFile('Weather History/'+str(year)+'.zip') as myzip:
        path_documents=[name for name in myzip.namelist() if 'INMET_CO' in name]
        for path in path_documents:
            base_name_match=station_pattern.search(path)
            if base_name_match is None:
                # Fail with the offending name rather than an opaque
                # "'NoneType' object is not subscriptable".
                raise ValueError('Cannot parse state/station from archive member: '+path)

            with myzip.open(path) as myfile:
                # The first 8 lines precede the header row - presumably
                # station metadata; verify against a sample file.
                DF=pd.read_csv(myfile,delimiter=';',skiprows=8,encoding='latin')

            # .copy() gives an independent frame so the column assignments
            # below do not write into a slice of the raw table.
            DF=(DF.rename(columns=Column_Names)
                  .assign(state=base_name_match[1],
                          station=base_name_match[2]+' '+base_name_match[3])
                  [kept_columns]
                  .copy())

            for column in value_columns:
                DF[column]=DF[column].apply(fix_numbers)

            # Normalise '/' separators to '-' before parsing the dates.
            DF['date']=pd.to_datetime(DF['date'].apply(fix_date))

            # Collapse the hourly rows to one row per station and day:
            # total rain (NaN if no reading), highest max, lowest min.
            DF=DF.groupby(['state','station','date']).\
                  agg({'rain_mm':sum2,'temp_max':'max','temp_min':'min'}).reset_index()

            daily_frames.append(DF)

if daily_frames:
    All_DF=pd.concat(daily_frames,ignore_index=True)
else:
    # No matching file in any archive: keep All_DF a DataFrame with the
    # expected columns so the following cells still work.
    All_DF=pd.DataFrame(columns=['state','station','date','rain_mm','temp_max','temp_min'])
All_DF.head()

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


Unnamed: 0,state,station,date,rain_mm,temp_max,temp_min
0,DF,A001 BRASILIA,2000-05-07,0.0,16.1,11.5
1,DF,A001 BRASILIA,2000-05-08,0.0,17.0,9.4
2,DF,A001 BRASILIA,2000-05-09,0.0,16.2,11.9
3,DF,A001 BRASILIA,2000-05-10,0.0,16.6,11.3
4,DF,A001 BRASILIA,2000-05-11,0.0,14.8,9.8


In [10]:
# Share of missing values per station and column, as a whole percentage
# (rounded down), with the stations missing the most rainfall data first.
Missing=(
    All_DF.groupby('station')
          .agg(lambda col: np.floor(col.isnull().sum()/len(col)*100))
          .sort_values('rain_mm',ascending=False)
)
Missing

Unnamed: 0_level_0,state,date,rain_mm,temp_max,temp_min
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S703 BANDEIRANTES,0.0,0.0,73.0,73.0,73.0
S708 FATIMA DO SUL,0.0,0.0,65.0,83.0,83.0
S713 NOVA ANDRADINA,0.0,0.0,64.0,64.0,64.0
S707 CAMAPUA,0.0,0.0,63.0,96.0,96.0
A944 ROSARIO OESTE,0.0,0.0,58.0,98.0,98.0
...,...,...,...,...,...
A036 CRISTALINA,0.0,0.0,0.0,0.0,0.0
A042 BRAZLANDIA,0.0,0.0,0.0,0.0,0.0
A056 CRISTALINA (FAZENDA SANTA MONICA),0.0,0.0,0.0,0.0,0.0
A047 PARANOA (COOPA-DF),0.0,0.0,0.0,0.0,0.0


In [11]:
# Check the column dtypes of the combined daily table.
All_DF.dtypes

state               object
station             object
date        datetime64[ns]
rain_mm            float64
temp_max           float64
temp_min           float64
dtype: object

In [12]:
# Summary statistics of the combined daily table, as a quick sanity check
# on value ranges after cleaning.
All_DF.describe()

Unnamed: 0,rain_mm,temp_max,temp_min
count,400241.0,418716.0,418471.0
mean,3.63774,19.763458,14.340233
std,10.067716,4.355617,5.491874
min,0.0,-10.0,-10.0
25%,0.0,17.2,10.7
50%,0.0,20.8,15.7
75%,1.2,23.0,18.7
max,223.8,44.9,35.9


In [14]:
# Save the daily table without the row index. pandas infers zip compression
# from the '.zip' suffix of the target path (default compression='infer').
All_DF.to_csv('ima2021_project/Data/historic_weather_all_CO_daily.csv.zip',index=False)

In [3]:
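# Optional: uncomment to reload the saved daily table instead of rebuilding it from the yearly archives.
# NOTE(review): read back this way, 'date' would presumably be loaded as strings unless parse_dates=['date'] is passed - confirm before relying on it.
#All_DF=pd.read_csv('ima2021_project/Data/historic_weather_all_CO_daily.csv.zip')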

In [15]:
# Absolute number of missing values per station and column, with the largest
# rainfall gaps first. This rebinds Missing, which previously held the
# percentage version of the same table.
# np.floor leaves the integer counts unchanged but turns them into floats,
# which keeps the displayed values in the same form as before.
Missing=(
    All_DF.groupby('station')
          .agg(lambda col: np.floor(col.isnull().sum()))
          .sort_values('rain_mm',ascending=False)
)
# Already sorted by 'rain_mm' above, so no second sort is needed before head().
Missing.head(20)

Unnamed: 0_level_0,state,date,rain_mm,temp_max,temp_min
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A905 CAMPO NOVO DOS PARECIS,0.0,0.0,1910.0,1759.0,1795.0
A915 PARANATINGA,0.0,0.0,1746.0,1563.0,1563.0
A935 PORTO ESTRELA,0.0,0.0,1621.0,677.0,677.0
A704 TRES LAGOAS,0.0,0.0,1560.0,1688.0,1688.0
A906 GUARANTA DO NORTE,0.0,0.0,1470.0,1076.0,1104.0
A709 IVINHEMA,0.0,0.0,1470.0,1166.0,1166.0
A904 SORRISO,0.0,0.0,1447.0,678.0,710.0
A921 SAO FELIX DO ARAGUAIA,0.0,0.0,1374.0,1744.0,1750.0
A723 PORTO MURTINHO,0.0,0.0,1262.0,349.0,349.0
A929 NOVA UBIRATA,0.0,0.0,1226.0,408.0,408.0
