In [1]:
# Import the required modules
from pyesgf.search import SearchConnection
import os
import sys
import importlib
import pandas as pd
import requests
from tqdm import tqdm

# Set ESGF_PYCLIENT_NO_FACETS_STAR_WARNING before any search is run.
# esgf-pyclient consults this variable when deciding whether to print its
# "defaulting to facets=*" warning (see the pyesgf documentation).
os.environ.update({'ESGF_PYCLIENT_NO_FACETS_STAR_WARNING': "on"})

In [2]:
# Make the directory holding the download helpers importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run.
functions_dir = '/home/users/benhutch/downloading-data/DAMIP/'
if functions_dir not in sys.path:
    sys.path.append(functions_dir)

# Import the helpers by name rather than with `import *`, so it is clear
# where each name used in this notebook comes from. These are the same names
# that the reload cell further down re-imports.
from testing_download_functions import find_valid_nodes, create_results_list, \
                                        extract_file_context, check_file_exists_jasmin

In [3]:
# Import the dictionaries
import dictionaries as dicts

In [4]:
# Reload the helper module in place so that edits made to it on disk are
# picked up without restarting the kernel, then rebind the helper names to
# the freshly loaded versions.
download_functions_module = sys.modules['testing_download_functions']
importlib.reload(download_functions_module)
from testing_download_functions import (
    find_valid_nodes,
    create_results_list,
    extract_file_context,
    check_file_exists_jasmin,
)

# Same treatment for the dictionaries module.
dictionaries_module = sys.modules['dictionaries']
importlib.reload(dictionaries_module)
import dictionaries as dicts

In [5]:
# Set up the search connection.
# dicts.search_connection is passed as SearchConnection's first positional
# argument (the search service URL in pyesgf - value defined in dictionaries.py).
# distrib=True asks for a distributed search, i.e. one that is not limited to
# the node being contacted (per the pyesgf docs).
connection = SearchConnection(dicts.search_connection, distrib=True)

In [6]:
# Base search facets shared by the queries in this notebook. Later cells add
# 'variable_id' (and 'source_id') to this same dictionary before searching.
# NOTE(review): no start-year facet (e.g. s1961) is set in this dictionary -
# add one here if the search needs to be restricted to speed it up.
params = dict(
    activity_id='DCPP',
    experiment_id='dcppA-hindcast',
    latest=True,
    project='CMIP6',
    table_id='Amon',
)

In [7]:
# Check, for every (variable, model) pair, whether the ESGF search returns
# any results.
# The full variable list is available as dicts.variables; a single model and
# variable are used here.
models = ["CMCC-CM2-SR5"]

# Test variable: 'rsds'
variables = ['rsds']

# One record per (model, variable) pair. The dataframe is built once after
# the loop instead of being re-concatenated on every iteration.
existence_records = []

for variable in variables:
    print("Checking whether models exist for variable: " + variable)

    for model in models:
        print("Checking whether model: " + model + " exists for variable: " + variable)

        # Narrow the shared search params to this model/variable pair
        # (this deliberately updates the notebook-level `params` dict).
        params['variable_id'] = variable
        params['source_id'] = model

        # Query the database
        ctx = connection.new_context(**params)

        try:
            # Get the results from the query
            results = ctx.search()
        except Exception as err:
            # A failed search is recorded as "does not exist", but the cause is
            # reported so that it is not mistaken for a genuinely empty result.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt
            # still stop the cell.
            print("Search failed for model: " + model + ", variable: " + variable + " - " + repr(err))
            results = []

        model_exists = len(results) > 0
        if model_exists:
            print("Model: " + model + " exists for variable: " + variable)
        else:
            print("Model: " + model + " does not exist for variable: " + variable)

        existence_records.append({'model': model,
                                  'variable': variable,
                                  'does_model_exist': model_exists})

# Dataframe with three columns: model, variable, does_model_exist
results_df = pd.DataFrame(existence_records,
                          columns=['model', 'variable', 'does_model_exist'])

# Display the dataframe
results_df

Checking whether models exist for variable: rsds
Checking whether model: CMCC-CM2-SR5 exists for variable: rsds


Model: CMCC-CM2-SR5 exists for variable: rsds


Unnamed: 0,model,variable,does_model_exist
0,CMCC-CM2-SR5,rsds,True


In [8]:
# Valid data nodes found for each variable, keyed by variable name
variable_nodes = {}

# dicts.var_models_test_rsds maps each variable to the list of models to
# look up for it
for var, models_list in dicts.var_models_test_rsds.items():
    print(f"Finding the valid nodes for variable: {var}")
    print(f"Models list: {models_list}")

    # Point the shared search params at this variable
    params['variable_id'] = var

    # Register the variable first, then fill in its entry with whatever
    # find_valid_nodes returns for this variable and list of models
    variable_nodes[var] = {}
    valid_nodes = find_valid_nodes(
        params=params,
        models_list=models_list,
        conn=connection,
    )
    variable_nodes[var] = valid_nodes


Finding the valid nodes for variable: rsds
Models list: ['CMCC-CM2-SR5']
trying to find valid nodes for model: CMCC-CM2-SR5
{'activity_id': 'DCPP', 'experiment_id': 'dcppA-hindcast', 'latest': True, 'project': 'CMIP6', 'table_id': 'Amon', 'variable_id': 'rsds', 'source_id': 'CMCC-CM2-SR5'}
2720
{'esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf-node2.cmcc.it'}
trying to find valid files for node: esgf-data1.llnl.gov
1200
trying to find valid files for node: esgf.ceda.ac.uk
320
trying to find valid files for node: esgf-node2.cmcc.it
1200
[{'source_id': 'CMCC-CM2-SR5', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 1200}]


In [9]:
import importlib

# Reload the helper module so that on-disk edits take effect in this kernel,
# then rebind the helper names to the reloaded versions
download_functions_module = sys.modules['testing_download_functions']
importlib.reload(download_functions_module)

from testing_download_functions import (
    find_valid_nodes,
    create_results_list,
    extract_file_context,
    check_file_exists_jasmin,
)

In [10]:
# Search results for each variable, keyed by variable name
results = {}

for var in ['rsds']:
    print(f"Finding the results for variable: {var}")

    # Nodes stored for this variable in variable_nodes
    valid_nodes = variable_nodes[var]

    # Point the shared search params at this variable
    params['variable_id'] = var

    # Debug output: show what is being handed to create_results_list
    print(f"Valid nodes type: {type(valid_nodes)}")
    print(f"Valid nodes: {valid_nodes}")

    # Store whatever create_results_list returns for this variable
    results[var] = create_results_list(
        params=params,
        max_results_list=valid_nodes,
        connection=connection,
    )

Finding the results for variable: rsds
Valid nodes type: <class 'list'>
Valid nodes: [{'source_id': 'CMCC-CM2-SR5', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 1200}]


  0%|          | 0/1 [00:00<?, ?it/s]

Querying for source_id: CMCC-CM2-SR5 and data_node: esgf-data1.llnl.gov


100%|██████████| 1/1 [00:02<00:00,  2.25s/it]

Found 1200 results.





In [12]:
# Inspect the results dictionary: its contents, how many variables it holds,
# and the type of the entry stored for 'rsds'
print(f"Results: {results}")
print(f"Length of results: {len(results)}")
print(type(results['rsds']))

Results: {'rsds': [<pyesgf.search.results.ResultSet object at 0x7fe423cb11e0>]}
Length of results: 1
<class 'list'>


In [13]:
# File contexts extracted for each variable: {variable: [file_context, ...]}
results_file_context = {}

# The loop variable is named `var_results`, not `results`: reusing `results`
# would rebind the dictionary being iterated to its last value, leaving
# `results` as a list and making this cell fail when it is re-run.
for var, var_results in results.items():
    print("Variable: " + var)
    print("Results: " + str(var_results))

    # File contexts for this variable, one per result
    file_context_list = []

    for result in var_results:
        print("Result: " + str(result))

        # Extract the file context for this result and keep it
        file_context_list.append(extract_file_context(result))

    # Store the list of file contexts under the variable name
    results_file_context[var] = file_context_list

Variable: rsds
Results: [<pyesgf.search.results.ResultSet object at 0x7fe423cb11e0>]
Result: <pyesgf.search.results.ResultSet object at 0x7fe423cb11e0>
Extracting file context for 1200 datasets...


100%|██████████| 1200/1200 [35:14<00:00,  1.76s/it]


In [None]:
# Build one dataframe from every file context, labelled by variable.
# The per-context frames are collected in a list and concatenated once at the
# end; concatenating inside the loop re-copies the accumulated frame on every
# iteration.
file_context_frames = []

for var, file_context_lists in results_file_context.items():
    print("Variable: " + var)

    for file_context in file_context_lists:
        # Convert the file context to a dataframe
        file_context_df = pd.DataFrame.from_dict(file_context)

        # Add the variable name as the left-most column
        file_context_df.insert(0, 'variable', var)

        file_context_frames.append(file_context_df)

# pd.concat raises on an empty list, so fall back to an empty dataframe when
# there were no file contexts at all
if file_context_frames:
    df = pd.concat(file_context_frames, ignore_index=True)
else:
    df = pd.DataFrame()

# Display the dataframe
df

In [None]:
# Extract the dictionaries in the first row of the dataframe into a new dataframe
# containing only the first row, with headers 'filename' and 'url'
# {'filename': 'pr_Amon_MIROC6_dcppA-hindcast_s1992-r2i1p1f1_gn_199211-200212.nc', 'url': 'http://esgf-data02.diasjp.net/thredds/fileServer/esg_dataroot/CMIP6/DCPP/MIROC/MIROC6/dcppA-hindcast/s1992-r2i1p1f1/Amon/pr/gn/v20190821/pr_Amon_MIROC6_dcppA-hindcast_s1992-r2i1p1f1_gn_199211-200212.nc'}

In [None]:
# Work on a copy so that `df` itself is left untouched
df_copy = df.copy()

# Only the first row of the dataframe is processed in this cell
first_row = df_copy.iloc[0]

# Turn the row (a Series) back into a one-row dataframe and drop the
# 'variable' label column
first_row_df = pd.DataFrame(first_row).T.drop(columns='variable')

# Filenames and urls pulled out of the first row, in column order
filenames = []
urls = []

for column in first_row_df.columns:
    cell = first_row_df[column].values[0]

    # Only cells holding a dictionary are used; each one is read for its
    # 'filename' and 'url' keys
    if isinstance(cell, dict):
        print("Cell is a dictionary")

        filenames.append(cell['filename'])
        urls.append(cell['url'])

In [None]:
# Pair each extracted filename with its url, one row per file
df_filenames_urls = pd.DataFrame(dict(filename=filenames, url=urls))

# Display the resulting table
df_filenames_urls

In [None]:
# Directory holding the DCPP data on JASMIN
dcpp_dir_badc = "/badc/cmip6/data/CMIP6/DCPP/"

# Check whether the files listed in df_filenames_urls exist on JASMIN under
# that directory
jasmin_files_df = check_file_exists_jasmin(
    directory=dcpp_dir_badc,
    df=df_filenames_urls,
)

In [None]:
# Group workspace directory on JASMIN
dcpp_dir_gws = "/gws/nopw/j04/canari/users/benhutch/"

# Check whether the files listed in df_filenames_urls exist on JASMIN under
# the group workspace
jasmin_files_df_gws = check_file_exists_jasmin(
    directory=dcpp_dir_gws,
    df=df_filenames_urls,
)

In [None]:
# Display the result of the group-workspace existence check
jasmin_files_df_gws

In [None]:
# Collect the size in bytes of every file flagged as existing, in row order,
# with None for rows whose file does not exist. The list is attached to the
# dataframe as a 'file_size' column in a later cell.
file_sizes = []

for index, row in jasmin_files_df_gws.iterrows():
    print("Index: " + str(index))
    print("Row: " + str(row))

    # Missing file: record a placeholder and move on to the next row
    if not (row['file_exists'] == True):
        print("File does not exist")
        file_sizes.append(None)
        continue

    print("File exists")

    # Size on disk of the file at the recorded path
    file_sizes.append(os.path.getsize(row['filepath']))

In [None]:
# /gws/nopw/j04/canari/users/benhutch/pr_Amon_MIROC6_dcppA-hindcast_s1991-r8i1p1f1_gn_199111-200112.nc - doesn't exist
# /gws/nopw/j04/canari/users/benhutch/dcppA-hindcast/data/pr/MIROC6/pr_Amon_MIROC6_dcppA-hindcast_s1991-r8i1p1f1_gn_199111-200112.nc
# /gws/nopw/j04/canari/users/benhutch/dcppA-hindcast/data/pr/MIROC6/pr_Amon_MIROC6_dcppA-hindcast_s1991-r8i1p1f1_gn_199111-200112.nc

In [None]:
# Attach the collected sizes as a new 'file_size' column (one entry per row;
# None where the file does not exist)
jasmin_files_df_gws['file_size'] = file_sizes

In [None]:
# Display the dataframe, now including the 'file_size' column
jasmin_files_df_gws

In [None]:
# A file smaller than this many bytes is treated as too small to be a valid
# download
min_valid_file_size = 1000000

# Coerce the sizes to numbers first: missing sizes (None) become NaN and
# compare as False. Comparing the raw column raises a TypeError when every
# size is None, i.e. when none of the files exist yet.
numeric_file_sizes = pd.to_numeric(jasmin_files_df_gws['file_size'], errors='coerce')
is_too_small = (numeric_file_sizes < min_valid_file_size).to_numpy()

# For each undersized file, mark it as not existing and clear its filepath
for index in jasmin_files_df_gws.index[is_too_small]:
    print("Index: " + str(index))
    print("File size is less than 1,000,000 bytes")

    # Set the file_exists value to False
    jasmin_files_df_gws.at[index, 'file_exists'] = False

    # Set the filepath value to None
    jasmin_files_df_gws.at[index, 'filepath'] = None

In [None]:
# Display the dataframe after undersized files have been marked as missing
jasmin_files_df_gws

In [None]:
# Reload the helper module so that on-disk edits take effect in this kernel,
# then import the download helper from the reloaded module
download_functions_module = sys.modules['testing_download_functions']
importlib.reload(download_functions_module)

from testing_download_functions import download_files

In [None]:
# Try out download_files on the group-workspace dataframe, using the download
# directory configured in the dictionaries module
# TODO: Reduce the number of print statements used in this
download_files(df=jasmin_files_df_gws, download_dir=dicts.dcpp_dir_gws)

In [6]:
# Verify that the files have been downloaded.
# Files have ended up in two places under /gws/nopw/j04/canari/users/benhutch:
#   dcppA-hindcast/pr/MIROC6/       - to be used as the main data store
#   dcppA-hindcast/data/pr/MIROC6/  - files here are to be moved into the store
# Before moving anything, files in the data/ location that are smaller than
# 1,000,000 bytes are to be removed.
# NOTE(review): the plan above is written for pr/MIROC6, but old_dir below
# points at data/sfcWind/CMCC-CM2-SR5 - TODO confirm which is intended.
old_dir = "/gws/nopw/j04/canari/users/benhutch/dcppA-hindcast/data/sfcWind/CMCC-CM2-SR5/"

# Names and sizes (in bytes) of the entries in the old directory, kept in
# step with each other
filenames = []
file_sizes = []

for entry in os.listdir(old_dir):
    print("File: " + entry)

    # old_dir ends with a slash, so plain concatenation gives the full path
    size_in_bytes = os.path.getsize(old_dir + entry)

    file_sizes.append(size_in_bytes)
    filenames.append(entry)

# One row per entry in the old directory
old_dir_df = pd.DataFrame({'filename': filenames, 'file_size': file_sizes})

File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r10i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r11i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r12i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r13i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r14i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r15i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r16i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r17i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r18i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r19i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r1i1p1f1_gn_196011-197012.nc
File: sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960-r20i1p1f1_gn_196011-197012.nc
File:

In [7]:
# Flag the files to delete: 'remove_file' is a boolean column that is True
# where the file size is below 1,000,000 bytes
old_dir_df['remove_file'] = old_dir_df['file_size'].lt(1000000)

In [8]:
# Display the old-directory listing with its 'remove_file' flags
old_dir_df

Unnamed: 0,filename,file_size,remove_file
0,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...,0,True
1,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...,0,True
2,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...,0,True
3,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...,0,True
4,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...,0,True
...,...,...,...
752,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1997...,0,True
753,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1997...,0,True
754,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1997...,0,True
755,sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1997...,0,True


In [9]:
# Delete from the old directory every file flagged in 'remove_file'
# (i.e. those smaller than 1,000,000 bytes)
for index, row in old_dir_df.iterrows():
    print("Index: " + str(index))
    print("Row: " + str(row))

    # Leave files that are not flagged for removal untouched
    if not (row['remove_file'] == True):
        continue

    print("File size is less than 1,000,000 bytes")

    # old_dir ends with a slash, so concatenation gives the full path
    os.remove(old_dir + row['filename'])

Index: 0
Row: filename       sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...
file_size                                                      0
remove_file                                                 True
Name: 0, dtype: object
File size is less than 1,000,000 bytes
Index: 1
Row: filename       sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...
file_size                                                      0
remove_file                                                 True
Name: 1, dtype: object
File size is less than 1,000,000 bytes
Index: 2
Row: filename       sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...
file_size                                                      0
remove_file                                                 True
Name: 2, dtype: object
File size is less than 1,000,000 bytes
Index: 3
Row: filename       sfcWind_Amon_CMCC-CM2-SR5_dcppA-hindcast_s1960...
file_size                                                      0
remove_file                                

In [10]:
# Move the files from the old directory to the new directory
new_dir = "/gws/nopw/j04/canari/users/benhutch/dcppA-hindcast/pr/MIROC6/"

# The last two components of new_dir name the variable and the model it
# holds, and the file names seen in these directories start with
# "<variable>_<table>_<model>_". Use that to make sure only files that belong
# in new_dir are moved there.
# NOTE(review): old_dir is set (in an earlier cell) to a sfcWind/CMCC-CM2-SR5
# directory while new_dir is pr/MIROC6 - without this check every file in
# old_dir would be moved into the pr/MIROC6 store. Confirm which pair of
# directories is intended.
expected_variable, expected_model = os.path.normpath(new_dir).split(os.sep)[-2:]

# Loop over the files in the old directory
for file in os.listdir(old_dir):
    print("File: " + file)

    # Skip anything whose variable/model does not match the destination
    name_parts = file.split('_')
    if (len(name_parts) < 3
            or name_parts[0] != expected_variable
            or name_parts[2] != expected_model):
        print("Skipping file, does not belong in " + new_dir + ": " + file)
        continue

    # Set up the old and new file paths
    old_file_path = os.path.join(old_dir, file)
    new_file_path = os.path.join(new_dir, file)

    # Move the file
    os.rename(old_file_path, new_file_path)

In [11]:
# Names and sizes (in bytes) of the entries in the new directory, kept in
# step with each other
new_dir_filenames = []
new_dir_file_sizes = []

for entry in os.listdir(new_dir):
    print("File: " + entry)

    # new_dir ends with a slash, so plain concatenation gives the full path
    size_in_bytes = os.path.getsize(new_dir + entry)

    new_dir_file_sizes.append(size_in_bytes)
    new_dir_filenames.append(entry)

# One row per entry in the new directory
new_dir_df = pd.DataFrame({'filename': new_dir_filenames, 'file_size': new_dir_file_sizes})

File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r10i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r1i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r2i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r3i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r4i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r5i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r6i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r7i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r8i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1960-r9i1p1f1_gn_196011-197012.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1961-r10i1p1f1_gn_196111-197112.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1961-r1i1p1f1_gn_196111-197112.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1961-r2i1p1f1_gn_196111-197112.nc
File: pr_Amon_MIROC6_dcppA-hindcast_s1961-r3i1p1f1_gn_196111-197112.nc
File

In [12]:
# Display the new-directory listing (filename and size in bytes)
new_dir_df

Unnamed: 0,filename,file_size
0,pr_Amon_MIROC6_dcppA-hindcast_s1960-r10i1p1f1_...,12525328
1,pr_Amon_MIROC6_dcppA-hindcast_s1960-r1i1p1f1_g...,12532444
2,pr_Amon_MIROC6_dcppA-hindcast_s1960-r2i1p1f1_g...,12531685
3,pr_Amon_MIROC6_dcppA-hindcast_s1960-r3i1p1f1_g...,12528156
4,pr_Amon_MIROC6_dcppA-hindcast_s1960-r4i1p1f1_g...,12525625
...,...,...
615,pr_Amon_MIROC6_dcppA-hindcast_s2021-r5i1p1f1_g...,12521820
616,pr_Amon_MIROC6_dcppA-hindcast_s2021-r6i1p1f1_g...,12526603
617,pr_Amon_MIROC6_dcppA-hindcast_s2021-r7i1p1f1_g...,12528688
618,pr_Amon_MIROC6_dcppA-hindcast_s2021-r8i1p1f1_g...,12521252


In [13]:
# Verify that there are 10 files available for each start year s{year}
# from 1960 to 2021
expected_files_per_year = 10

# Start years whose file count is wrong, mapped to the count that was found.
# Collecting them all means one run reports every problem year rather than
# stopping at the first.
incomplete_years = {}

for year in range(1960, 2022):
    print("Year: " + str(year))

    # Rows whose filename contains the literal token s{year}; the filter is
    # computed once and matched as plain text rather than as a regex
    is_start_year = new_dir_df['filename'].str.contains("s" + str(year), regex=False)
    num_files = len(new_dir_df[is_start_year])
    print("Length of dataframe: " + str(num_files))

    if num_files != expected_files_per_year:
        incomplete_years[year] = num_files

# Fail with the full list of offending years and their counts
assert not incomplete_years, (
    "Expected " + str(expected_files_per_year) + " files per start year, "
    "but found {year: count}: " + str(incomplete_years)
)

Year: 1960
Length of dataframe: 10
Year: 1961
Length of dataframe: 10
Year: 1962
Length of dataframe: 10
Year: 1963
Length of dataframe: 10
Year: 1964
Length of dataframe: 10
Year: 1965
Length of dataframe: 10
Year: 1966
Length of dataframe: 10
Year: 1967
Length of dataframe: 10
Year: 1968
Length of dataframe: 10
Year: 1969
Length of dataframe: 10
Year: 1970
Length of dataframe: 10
Year: 1971
Length of dataframe: 10
Year: 1972
Length of dataframe: 10
Year: 1973
Length of dataframe: 10
Year: 1974
Length of dataframe: 10
Year: 1975
Length of dataframe: 10
Year: 1976
Length of dataframe: 10
Year: 1977
Length of dataframe: 10
Year: 1978
Length of dataframe: 10
Year: 1979
Length of dataframe: 10
Year: 1980
Length of dataframe: 10
Year: 1981
Length of dataframe: 10
Year: 1982
Length of dataframe: 10
Year: 1983
Length of dataframe: 10
Year: 1984
Length of dataframe: 10
Year: 1985
Length of dataframe: 10
Year: 1986
Length of dataframe: 10
Year: 1987
Length of dataframe: 10
Year: 1988
Length of