In [1]:
from zipfile import ZipFile
import pandas as pd
import calendar

In [2]:
def extract_from_zip_folder(file_type, zip_folder_path, new_folder_path):
    '''This function extracts files from a zip folder.
    It just extracts the files of a certain type
    '''
    with ZipFile(zip_folder_path, 'r') as zip_obj:
       # Get a list of all archived file names from the zip
       list_of_file_names = zip_obj.namelist()
       # Iterate over the file names
       for file_name in list_of_file_names:
           # Check filename endswith csv
           if file_name.endswith(f'.{file_type}'):
               # Extract a single file from zip
               zip_obj.extract(file_name, new_folder_path)

In [3]:
extract_from_zip_folder('csv', 'data/Anio201810.zip', 'csv_data')

In [4]:
data = pd.read_csv('csv_data/abr_mo18.csv', sep=';')
data.head()

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,PUNTO_MUESTREO,ANO,MES,DIA,H01,V01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
0,28,79,4,1,28079004_1_38,2018,4,1,2.0,V,...,2.0,V,2.0,V,3.0,V,4.0,V,3.0,V
1,28,79,4,1,28079004_1_38,2018,4,2,2.0,V,...,2.0,V,2.0,V,2.0,V,2.0,V,2.0,V
2,28,79,4,1,28079004_1_38,2018,4,3,2.0,V,...,2.0,V,2.0,V,2.0,V,2.0,V,2.0,V
3,28,79,4,1,28079004_1_38,2018,4,4,2.0,V,...,2.0,V,2.0,V,2.0,V,2.0,V,2.0,V
4,28,79,4,1,28079004_1_38,2018,4,5,2.0,V,...,2.0,V,2.0,V,4.0,V,5.0,V,4.0,V


In [5]:
data.shape

(4490, 56)

In [6]:
len(set(data[data['MAGNITUD'] == 8]['PUNTO_MUESTREO']))

24

In [7]:
def add_missing_days(dataframe):
    '''This function takes the monthly dataframe and checks if there are missing days
    for a specific sample spot. If so, it appends a row to the original dataframe with
    the info of that day and the validation columns set to N (non validated), to know that
    info is not correct (we will correct it later).
    '''
    year = dataframe.loc[0, 'ANO']
    month = dataframe.loc[0, 'MES']
    
    # First we have to know how many days a specific month has:
    _, number_days_month = calendar.monthrange(year, month)
    
    # We create a list with all the days of that month
    list_of_days_of_the_month = list(range(1, number_days_month + 1))
    
    # We create a list with all the sample spots
    sample_spots_list = list(set(dataframe['PUNTO_MUESTREO']))
    
    for sample_spot in sample_spots_list:
        print(f'Checking sample spot: {sample_spot}')
        # We create a df with just the info of one spot
        sample_spot_df = dataframe[dataframe['PUNTO_MUESTREO'] == sample_spot].reset_index()

        # We check if all that days are contained in the spot df
        isin_df = pd.Series(list_of_days_of_the_month).isin(list(sample_spot_df['DIA']))
        isin_df.index = list_of_days_of_the_month

        # Now, if a day is not included, we append a row with its data to the original df
        for day, isin in isin_df.iteritems():
            if isin == False:
                print(f'Day {day}-{month}-{year} missing')
                
                # We take the first row of the df, but we change the day and the validation columns to 'N'
                row_to_append = [sample_spot_df.loc[0, column] for column in sample_spot_df.columns]
                row_to_append[8] = day
                for i, e in enumerate(row_to_append):
                    if e == 'V':
                        row_to_append[i] = 'N'
                print(row_to_append)
                
                # We append the row
                dataframe = dataframe.append(pd.Series(row_to_append, index=sample_spot_df.columns), ignore_index=True)
                print(f'Day {day}-{month}-{year} row added to original dataframe')
            
    return dataframe

In [8]:
def get_stacked_dataframe(dataframe, cols_to_drop, cols_remain):
    '''This function applies the pandas stack method to make data that is
    spread in columns collapse in a single column.
    First drops the columns that would not let the stack work properly.
    Then sets the columns that do not have to be stacked as the index.
    Applies stack method. Finally, resets index.
    '''
    dataframe = dataframe.drop(columns=cols_to_drop)
    dataframe = dataframe.set_index(cols_remain)
    dataframe = dataframe.stack().reset_index()
    dataframe = dataframe[dataframe[dataframe.columns[-2]] != 'index'].reset_index()
    dataframe = dataframe.drop(columns='index')
    return dataframe

In [9]:
def add_last_col_to_df(df1, df2):
    ''' Adds the last column from a dataframe to another dataframe with the same number of rows'''
    df1['new_col'] = df2.iloc[:,-1]
    return df1

In [10]:
def get_reshaped_df(dataframe):
    '''Gets a df, keeps just the NO2 info, splits it into 2 dataframes,
    each of them with one of the columns that we want to stack,
    joins them into a single dataframe, renames columns and formats HORA column.
    The result is a much easier to use dataframe'''
    
    print(f'Dataframe shape: {dataframe.shape}')
    
    print('Keeping just NO2 data')
    dataframe = dataframe[dataframe['MAGNITUD'] == 8].drop(columns=['MAGNITUD']).reset_index(drop=True)
    
    print(f'Dataframe shape: {dataframe.shape}')
    
    print('Adding missing days rows')
    # We need a list of the sample spots
    list_of_sample_spots = list(set(dataframe['PUNTO_MUESTREO']))
    print(f'{len(list_of_sample_spots)} sample spots')
    
    # We apply the function that add records of missing days
    dataframe = add_missing_days(dataframe)
    
    print(f'Dataframe shape: {dataframe.shape}')
    
    cols_dimensiones = ['PROVINCIA', 'MUNICIPIO', 'ESTACION', 'PUNTO_MUESTREO', 'ANO', 'MES', 'DIA']
    
    print('Stacking dataframes')
    df_h = get_stacked_dataframe(
        dataframe,
        cols_remain=cols_dimensiones,
        cols_to_drop=[col for col in list(dataframe.columns) if col[0] == 'V']
    )
    
    df_v = get_stacked_dataframe(
        dataframe,
        cols_remain=cols_dimensiones,
        cols_to_drop=[col for col in list(dataframe.columns) if col[0] == 'H']
    )
    
    print('Joining dataframes')
    final_df = add_last_col_to_df(df_h, df_v)
    
    print('Renaming columns')
    final_df = final_df.rename(columns={'level_7': 'HORA', 0: 'NIVEL_NO2', 'new_col': 'VALIDADO'})
    
    print('Formatting HORA column')
    final_df['HORA'] = final_df['HORA'].apply(lambda x: int(x[-2:]))
    
    print('Sorting dataframe by year, month and day')
    
    print(f'Final dataframe shape: {final_df.shape}')
    
    return final_df

In [11]:
reshaped_df = get_reshaped_df(data)

Dataframe shape: (4490, 56)
Keeping just NO2 data
Dataframe shape: (719, 55)
Adding missing days rows
24 sample spots
Checking sample spot: 28079036_8_8
Checking sample spot: 28079060_8_8
Checking sample spot: 28079035_8_8
Checking sample spot: 28079049_8_8
Checking sample spot: 28079057_8_8
Checking sample spot: 28079016_8_8
Checking sample spot: 28079008_8_8
Checking sample spot: 28079050_8_8
Checking sample spot: 28079018_8_8
Day 18-4-2018 missing
[150, 28, 79, 18, '28079018_8_8', 2018, 4, 1, 18, 'N', 13.0, 'N', 13.0, 'N', 29.0, 'N', 40.0, 'N', 39.0, 'N', 48.0, 'N', 49.0, 'N', 34.0, 'N', 36.0, 'N', 33.0, 'N', 21.0, 'N', 17.0, 'N', 16.0, 'N', 14.0, 'N', 10.0, 'N', 11.0, 'N', 11.0, 'N', 15.0, 'N', 24.0, 'N', 57.0, 'N', 73.0, 'N', 74.0, 'N', 104.0, 'N']
Day 18-4-2018 row added to original dataframe
Checking sample spot: 28079027_8_8
Checking sample spot: 28079024_8_8
Checking sample spot: 28079040_8_8
Checking sample spot: 28079004_8_8
Checking sample spot: 28079059_8_8
Checking sample

In [25]:
reshaped_df.head()

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,PUNTO_MUESTREO,ANO,MES,DIA,HORA,NIVEL_NO2,VALIDADO
0,28,79,4,28079004_8_8,2018,4,1,1,21.0,V
1,28,79,4,28079004_8_8,2018,4,1,2,19.0,V
2,28,79,4,28079004_8_8,2018,4,1,3,17.0,V
3,28,79,4,28079004_8_8,2018,4,1,4,24.0,V
4,28,79,4,28079004_8_8,2018,4,1,5,17.0,V


In [13]:
# Ideas: para rellenar los valores que faltan
# comprobar si faltan datos de días e incluirlos
# para valores no validados:
    # media del nivel de NO2 de ese punto de muestreo en cada hora
    # append de todos los df, ordenar por pm, año, mes y hora y rellenar con la media de los valores contíguos

In [20]:
reshaped_df['VALIDADO'].value_counts()

V    17186
N       94
Name: VALIDADO, dtype: int64

In [28]:
# We create a list with all the sample spots
sample_spots_list = list(set(reshaped_df['PUNTO_MUESTREO']))
sample_1 = '28079004_8_8'
sample_df = reshaped_df[reshaped_df['PUNTO_MUESTREO'] == '28079004_8_8']
sample_df = sample_df.sort_values(by=['DIA', 'HORA'])
sample_df.shape

(720, 10)

In [29]:
sample_df['VALIDADO'].value_counts()

V    719
N      1
Name: VALIDADO, dtype: int64

In [47]:
sample_df_y = sample_df[sample_df['VALIDADO'] == 'Y']
sample_df_n = sample_df[sample_df['VALIDADO'] == 'N']
max_index = max(sample_df.index)
min_index = min(sample_df.index)
max_index

719

In [33]:
sample_df_n

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,PUNTO_MUESTREO,ANO,MES,DIA,HORA,NIVEL_NO2,VALIDADO
203,28,79,4,28079004_8_8,2018,4,9,12,57.0,N


In [42]:
def get_next_validated_value(dataframe, index, get_next=True):
    iterator = 0
    next_validated = 'N'
    
    while next_validated == 'N':
        iterator += 1
        if get_next:
            next_validated = dataframe.loc[index + iterator, 'VALIDADO']
            wanted_index = index + iterator
        else:
            next_validated = dataframe.loc[index - iterator, 'VALIDADO']
            wanted_index = index - iterator

    return wanted_index

In [48]:
for index, row in sample_df_n.iterrows():
    
    if index == min_index:
        next_validated_index = get_next_validated_value(sample_df, index)
        sample_df.loc[index, 'NIVEL_NO2'] = sample_df.loc[next_validated_index, 'NIVEL_NO2']
        sample_df.loc[index, 'VALIDADO'] = 'R'
        
    elif index == max_index:
        next_validated_index = get_next_validated_value(sample_df, index, get_next=False)
        sample_df.loc[index, 'NIVEL_NO2'] = sample_df.loc[next_validated_index, 'NIVEL_NO2']
        sample_df.loc[index, 'VALIDADO'] = 'R'
    
    else:
        next_validated_index = get_next_validated_value(sample_df, index)
        previous_validated_index = get_next_validated_value(sample_df, index, get_next=False)
        sample_df.loc[index, 'NIVEL_NO2'] = (
            sample_df.loc[next_validated_index, 'NIVEL_NO2'] + sample_df.loc[previous_validated_index, 'NIVEL_NO2']
        ) / 2
        sample_df.loc[index, 'VALIDADO'] = 'R'


In [49]:
get_next_validated_value(sample_df, 203, get_next=True)

204

In [50]:
get_next_validated_value(sample_df, 203, get_next=False)

202

In [51]:
sample_df[200:210]

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,PUNTO_MUESTREO,ANO,MES,DIA,HORA,NIVEL_NO2,VALIDADO
200,28,79,4,28079004_8_8,2018,4,9,9,56.0,V
201,28,79,4,28079004_8_8,2018,4,9,10,56.0,V
202,28,79,4,28079004_8_8,2018,4,9,11,48.0,V
203,28,79,4,28079004_8_8,2018,4,9,12,45.5,R
204,28,79,4,28079004_8_8,2018,4,9,13,43.0,V
205,28,79,4,28079004_8_8,2018,4,9,14,29.0,V
206,28,79,4,28079004_8_8,2018,4,9,15,26.0,V
207,28,79,4,28079004_8_8,2018,4,9,16,23.0,V
208,28,79,4,28079004_8_8,2018,4,9,17,21.0,V
209,28,79,4,28079004_8_8,2018,4,9,18,25.0,V
