In [1]:
cd '/tmp'

/tmp


In [2]:
!pip install madrigalWeb h5py tables



In [3]:
import os
import subprocess
import h5py
import pandas as pd
from tqdm import tqdm

In [None]:
def collecter_annee(annee):
    """Download one calendar year of Madrigal HDF5 files into ``/tmp/{annee}``.

    Runs the ``globalDownload.py`` command-line script (it must be reachable
    on PATH) against http://cedar.openmadrigal.org for instrument code 8000
    and kind-of-data code 3500, from 1 January to 31 December of ``annee``.

    Args:
        annee: Year to download; interpolated into the output directory and
            into the start/end dates (MM/DD/YYYY).

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
    """
    # Only these three arguments depend on the requested year.
    arg_sortie = f"--outputDir=/tmp/{annee}"
    arg_debut = f"--startDate=01/01/{annee}"
    arg_fin = f"--endDate=12/31/{annee}"

    subprocess.run(
        [
            "globalDownload.py",
            "--verbose",
            "--url=http://cedar.openmadrigal.org",
            arg_sortie,
            "--user_fullname=Prénom+Nom",
            "--user_email=exemple@mail.fr",
            "--user_affiliation=None",
            "--format=hdf5",
            arg_debut,
            arg_fin,
            "--inst=8000",
            "--kindat=3500",
        ],
        check=True,
    )

In [5]:
def collecter_fichiers(annee):
    """List the files of ``/tmp/{annee}`` that belong to that year, sorted.

    A file is kept when characters 3-4 of its name equal the last two digits
    of ``annee`` (presumably names look like ``gpsYYMMDD...`` -- confirm
    against the downloaded files).

    Args:
        annee: Year whose download directory is scanned.

    Returns:
        The matching file names (no directory part) in ascending order.
        The year and the number of matches are also printed, so the count can
        be checked by eye.
    """
    deux_chiffres = str(annee)[2:4]
    filenames = sorted(
        nom for nom in os.listdir(f"/tmp/{annee}") if nom[3:5] == deux_chiffres
    )
    print(annee, len(filenames))
    return filenames

In [6]:
def separer_par_mois(filenames):
    """Group file names by month, always returning exactly 12 lists.

    The month is read from characters 5-6 of each name as a zero-padded
    two-digit string (presumably names look like ``gpsYYMMDD...`` -- confirm
    against the downloaded files).

    Each name is placed directly into the bucket of its own month instead of
    scanning for the first file of the following month. A year that stops
    before December, a month without any file, or an empty input therefore
    still produces 12 correctly aligned lists.

    Args:
        filenames: File names for a single year. Input order is preserved
            inside each month.

    Returns:
        A list of 12 lists; index 0 is January and index 11 is December.
        Months without files are empty lists. Names whose month field is not
        ``"01"``..``"12"`` are ignored.
    """
    # Dict insertion order gives January..December when converted back to a list.
    par_mois = {str(mois).zfill(2): [] for mois in range(1, 13)}
    for filename in filenames:
        groupe = par_mois.get(filename[5:7])
        if groupe is not None:
            groupe.append(filename)
    return list(par_mois.values())

In [7]:
def creer_dataframe_mois(annee, filenames_du_mois):
    """Load a month of HDF5 files into one DataFrame.

    For every file, the ``Data/Table Layout`` dataset is read in full and
    reduced to the columns ``time``, ``gdlat``, ``glon`` and ``tec``, where
    ``time`` is the midpoint of ``ut1_unix`` and ``ut2_unix``.

    Args:
        annee: Year of the files; they are read from ``/tmp/{annee}``.
        filenames_du_mois: File names (no directory part) to load.

    Returns:
        The per-file frames concatenated in input order, each keeping its own
        row index. When ``filenames_du_mois`` is empty, an empty frame with
        the same four columns is returned instead of letting ``pd.concat``
        raise ``ValueError``.
    """
    colonnes_sortie = ['time', 'gdlat', 'glon', 'tec']
    if not filenames_du_mois:
        # pd.concat([]) raises; an empty frame lets callers treat "no data" uniformly.
        return pd.DataFrame(columns=colonnes_sortie, dtype='float64')

    dfs_mois = []
    for filename in filenames_du_mois:
        with h5py.File(f'/tmp/{annee}/{filename}', 'r') as file:
            # [:] copies the dataset into memory, so the file can be closed right after.
            table = pd.DataFrame(file['Data/Table Layout'][:])[['ut1_unix', 'ut2_unix', 'gdlat', 'glon', 'tec']]
        table = table.assign(time=0.5 * (table['ut1_unix'] + table['ut2_unix']))
        dfs_mois.append(table[colonnes_sortie])
    return pd.concat(dfs_mois)

In [None]:
# Years to process. Each year is downloaded, converted to Parquet and then
# deleted before the next one starts.
annees = list(range(2001, 2017))
# Maximum number of rows per Parquet file; chunking keeps each file under ~100 MB.
chunk_size = 12500000
# Relative path, resolved against the kernel's current working directory.
dossier_sortie = 'mva-projet-series-temp/data'
# to_parquet does not create missing parent directories.
os.makedirs(dossier_sortie, exist_ok=True)

for annee in tqdm(annees):
    # Download the year into its own directory.
    collecter_annee(annee)
    # Collect and sort the file names; the printed count lets you check for
    # anomalies (expect 365 files, 366 in leap years).
    filenames = collecter_fichiers(annee)
    filenames_par_mois = separer_par_mois(filenames)
    # total= is needed because tqdm cannot infer a length from enumerate().
    for index_mois, filenames_du_mois in tqdm(enumerate(filenames_par_mois), total=len(filenames_par_mois)):
        if not filenames_du_mois:
            # Nothing was downloaded for this month; skip it instead of failing.
            continue
        mois = str(index_mois+1).zfill(2)
        # Build one DataFrame per month and write it as Parquet.
        df_mois = creer_dataframe_mois(annee, filenames_du_mois)
        for i in range(0, len(df_mois), chunk_size):
            chunk = df_mois.iloc[i:i+chunk_size]
            chunk.to_parquet(f'{dossier_sortie}/tec_{annee}_{mois}_chunk_{i//chunk_size}.parquet.gzip', compression='gzip', index=False)
    # Delete the downloaded files to free space for the next year.
    for filename in os.listdir(f"/tmp/{annee}"):
        file_path = os.path.join(f"/tmp/{annee}", filename)
        os.unlink(file_path)
    os.rmdir(f"/tmp/{annee}")
    

  0%|          | 0/5 [00:00<?, ?it/s]

Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2011/gps/31dec11
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/01jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/02jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/03jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/04jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/05jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/06jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/07jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/08jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/09jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/10jan12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/

12it [13:15, 66.31s/it]
 20%|██        | 1/5 [1:11:42<4:46:50, 4302.64s/it]

Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2012/gps/31dec12
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/01jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/02jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/03jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/04jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/05jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/06jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/07jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/08jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/09jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/10jan13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/

12it [18:28, 92.37s/it]
 40%|████      | 2/5 [2:32:34<3:51:16, 4625.53s/it]

Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments3/2013/gps/31dec13
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/01jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/02jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/03jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/04jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/05jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/06jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/07jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/08jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/09jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/10jan14
Analyzed exp url http://cedar.openmadrigal.org/madtoc/experiments4/2014/gps/