In [2]:
import pandas as pd
import numpy as np
import re
from zipfile import ZipFile

In [24]:
#Fix Numbers
def fix_numbers(x):
    """Parse a decimal-comma value into a float, mapping bad values to NaN.

    The input is converted to a string, every ',' is replaced by '.', and the
    result is parsed with ``float``.

    Parameters
    ----------
    x : any
        Raw cell value (string, number, None, ...).

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the text cannot be parsed or the
        parsed value is below -100.
    """
    text = str(x).replace(',', '.')
    try:
        value = float(text)
    except ValueError:
        # Only a failed parse is expected here; a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit during a long ``apply``.
        return np.nan
    # Values below -100 are treated as missing
    # (presumably sentinel codes such as -9999 -- TODO confirm against the data source).
    if value < -100:
        return np.nan
    return value

def sum2(x):
    """Sum a Series, but return NaN when it holds no non-null value.

    Parameters
    ----------
    x : pandas.Series
        Values to add up.

    Returns
    -------
    float
        ``np.sum(x)`` when at least one entry is non-null, otherwise ``np.nan``
        (this includes the empty Series).
    """
    has_any_value = x.notnull().any()
    return np.sum(x) if has_any_value else np.nan
    
#Fix date
def fix_date(x):
    """Return ``x`` as a string with every '/' replaced by '-'."""
    return str(x).replace('/', '-')

In [25]:
# Map the raw column headers (two header spellings are handled for date and
# hour) onto short snake_case names.
# NOTE(review): the two temperature source columns are the "ORVALHO" max/min,
# which looks like dew-point temperature rather than air temperature, yet they
# are exposed as temp_max/temp_min -- confirm this is the intended source.
Column_Names = {
    'DATA (YYYY-MM-DD)': 'date',
    'Data': 'date',
    'HORA (UTC)': 'hour',
    'Hora UTC': 'hour',
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': 'rain_mm',
    'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)': 'temp_max',
    'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)': 'temp_min',
}

years = range(2000, 2021)

# One aggregated frame per station file; concatenated once after the loop so
# the accumulated data is not re-copied on every iteration.
daily_frames = []
for year in years:
    print(year)
    with ZipFile('Weather History/' + str(year) + '.zip') as myzip:
        # Only the archive members whose name contains 'INMET_SE' are read.
        path_documents = [name for name in myzip.namelist() if 'INMET_SE' in name]
        for path in path_documents:
            # Groups: 1 = two-character state, 2 = station code, 3 = station name.
            base_name_match = re.search(r'SE_(\w\w)_([ACFS]\d+)_(.*?)_', path)
            if base_name_match is None:
                # Fail with the offending path instead of an opaque
                # "'NoneType' object is not subscriptable".
                raise ValueError(
                    'Cannot extract state/station from file name: {!r}'.format(path)
                )

            with myzip.open(path) as myfile:
                # The first 8 lines of each file are skipped before the header row.
                DF = pd.read_csv(myfile, delimiter=';', skiprows=8, encoding='latin')

            DF = DF.rename(columns=Column_Names)
            DF['state'] = base_name_match[1]
            DF['station'] = base_name_match[2] + ' ' + base_name_match[3]

            # .copy() makes the selection an independent frame so the column
            # assignments below do not trigger SettingWithCopyWarning.
            DF = DF[['state', 'station', 'date', 'hour',
                     'rain_mm', 'temp_max', 'temp_min']].copy()

            for column in ('rain_mm', 'temp_max', 'temp_min'):
                DF[column] = DF[column].apply(fix_numbers)

            DF['date'] = pd.to_datetime(DF['date'].apply(fix_date))

            # Collapse the hourly rows to one row per station and day.
            DF = (
                DF.groupby(['state', 'station', 'date'])
                  .agg({'rain_mm': sum2, 'temp_max': 'max', 'temp_min': 'min'})
                  .reset_index()
            )
            daily_frames.append(DF)

if daily_frames:
    All_DF = pd.concat(daily_frames, ignore_index=True)
else:
    # No matching file was found: keep All_DF a DataFrame with the expected columns.
    All_DF = pd.DataFrame(
        columns=['state', 'station', 'date', 'rain_mm', 'temp_max', 'temp_min']
    )
All_DF.head()

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


Unnamed: 0,state,station,date,rain_mm,temp_max,temp_min
0,RJ,A601 ECOLOGIA AGRICOLA,2000-05-07,0.0,18.9,16.4
1,RJ,A601 ECOLOGIA AGRICOLA,2000-05-08,0.0,19.3,16.4
2,RJ,A601 ECOLOGIA AGRICOLA,2000-05-09,1.2,21.8,16.1
3,RJ,A601 ECOLOGIA AGRICOLA,2000-05-10,0.0,17.5,11.7
4,RJ,A601 ECOLOGIA AGRICOLA,2000-05-11,25.6,18.6,13.1


In [1]:
# Percentage of missing values (rounded down) per column for each station,
# ordered so the stations with the most missing rain_mm come first.
Missing = (
    All_DF
    .groupby('station')
    .agg(lambda col: np.floor(col.isnull().sum() / len(col) * 100))
    .sort_values('rain_mm', ascending=False)
)
Missing

NameError: name 'All_DF' is not defined

In [34]:
# Drop the CRIOSFERA and EB stations because they have too much missing data.
All_DF = All_DF.loc[
    ~((All_DF['station'] == 'C891 CRIOSFERA') | (All_DF['station'] == 'S122 EB'))
]

In [36]:
# Inspect the dtype of each column in the combined frame.
All_DF.dtypes

state               object
station             object
date        datetime64[ns]
rain_mm            float64
temp_max           float64
temp_min           float64
dtype: object

In [37]:
# Summary statistics of the combined frame (sanity check on value ranges).
All_DF.describe()

Unnamed: 0,rain_mm,temp_max,temp_min
count,582477.0,591452.0,591107.0
mean,3.500777,18.449717,13.162906
std,10.098886,3.770241,4.886949
min,0.0,-9.9,-25.0
25%,0.0,16.0,10.2
50%,0.0,19.0,14.0
75%,1.2,21.2,16.8
max,318.8,44.8,30.2


In [39]:
# Save the daily per-station frame to disk without the row index.
All_DF.to_csv('ima2021_project/Data/historic_weather_all_SE_daily.csv.zip',index=False)

In [3]:
# Reload the saved daily data. `date` is parsed on read so it comes back as
# datetime64 (as it was before saving) instead of plain strings.
All_DF=pd.read_csv('ima2021_project/Data/historic_weather_all_SE_daily.csv.zip',parse_dates=['date'])

In [8]:
# Number of missing values per column for each station, ordered so the
# stations with the most missing rain_mm come first.
# np.floor does not change the whole-number counts; it is kept so the result
# stays floating point as before.
Missing = (
    All_DF
    .groupby('station')
    .agg(lambda col: np.floor(col.isnull().sum()))
    .sort_values('rain_mm', ascending=False)
)
# Missing is already sorted above, so no second sort is needed here.
Missing.head(20)

Unnamed: 0_level_0,state,date,rain_mm,temp_max,temp_min
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A706 CAMPOS DO JORDAO,0.0,0.0,1585.0,1539.0,1539.0
A746 BARRA DO TURVO,0.0,0.0,1293.0,809.0,809.0
A708 FRANCA,0.0,0.0,949.0,657.0,657.0
A737 IBITINGA,0.0,0.0,936.0,259.0,259.0
A741 BARRA BONITA,0.0,0.0,900.0,112.0,112.0
A603 XEREM,0.0,0.0,852.0,1215.0,1215.0
A604 CAMBUCI,0.0,0.0,828.0,771.0,771.0
A705 BAURU,0.0,0.0,794.0,758.0,758.0
A610 PICO DO COUTO,0.0,0.0,752.0,759.0,760.0
A725 AVARE,0.0,0.0,667.0,272.0,272.0


In [15]:
# List the distinct station labels that contain the literal text 'BARRA BONITA'.
# regex=False makes it a plain substring match; na=False treats a missing
# label as "no match" instead of raising.
All_DF.loc[All_DF.station.str.contains('BARRA BONITA', regex=False, na=False), 'station'].unique()

array(['A741 BARRA BONITA'], dtype=object)