API download (use the Final product)

In [1]:
import os
import time
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Destination folder for the downloaded files (created if it does not exist yet)
download_path = r"C:\Users\userAdmin\Desktop\Pipeline\Data files\IMERG temporary\2022"
os.makedirs(download_path, exist_ok=True)

# Tab-separated text file; its first column is read as the list of download URLs.
# NOTE: the first 2 rows of this file must be removed by hand before running.
urls_file = r"c:\Users\userAdmin\Downloads\subset_GPM_3IMERGHH_07_20240621_062121_.txt"
urls = []  # stays empty when the file cannot be read
try:
    df = pd.read_csv(urls_file, sep="\t", header=None, on_bad_lines='warn')
    urls = df[0].tolist()
except Exception as e:
    print(f"Error reading URLs file: {e}")

# Function to get filenames from URLs
def get_filename_from_url(url):
    """Derive the local ``YYYYMMDD-HHMM.nc`` filename from a download URL.

    The URL is searched for ``IMERG.`` followed by an 8-digit date, a dash,
    one marker character (skipped) and a 4-digit time, e.g.
    ``...IMERG.20220103-S010000-...`` yields ``20220103-0100.nc``.

    Args:
        url: Download URL as a string.

    Returns:
        The derived filename, or ``"unknown_date_time.nc"`` when no such
        date/time stamp is present in the URL.
    """
    import re  # local import keeps this block self-contained

    # Validate the stamp instead of slicing fixed offsets after the first
    # "IMERG.", which would silently build a garbage filename from any other
    # text (for example "IMERG.pdf" or an earlier "IMERG." in the path).
    stamp = re.search(r"IMERG\.(\d{8}-).(\d{4})", url)
    if stamp is None:
        return "unknown_date_time.nc"
    return stamp.group(1) + stamp.group(2) + ".nc"

# Build the URL -> local filename lookup used by the download helpers
url_to_filename = dict(zip(urls, map(get_filename_from_url, urls)))

# Function to download a file
def download_file(session, url, filename, timeout=(10, 300)):
    """Download ``url`` into ``download_path/filename`` and report the outcome.

    Args:
        session: ``requests.Session`` used to issue the GET request.
        url: Address of the file to fetch.
        filename: Name of the destination file inside ``download_path``.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``session.get`` so a stalled connection cannot block a worker
            forever.

    Returns:
        A human-readable status string. Request and save errors are reported
        through this string rather than raised.
    """
    filepath = os.path.join(download_path, filename)
    # Write to a side file first: the final name only appears once the whole
    # payload is on disk, so a failed save is never mistaken for a finished
    # download by a later directory listing.
    partial_path = filepath + ".part"
    max_attempts = 5
    backoff_time = 1  # start with 1 second

    for attempt in range(1, max_attempts + 1):
        try:
            res = session.get(url, timeout=timeout)
            res.raise_for_status()
            payload = res.content
            with open(partial_path, 'wb') as f:
                f.write(payload)
            os.replace(partial_path, filepath)
            return f"{filename} downloaded successfully."
        except requests.exceptions.RequestException as e:
            if attempt == max_attempts:
                return f"Failed to download {filename} after {attempt} attempts: {e}"
            time.sleep(backoff_time)
            backoff_time *= 2  # exponential backoff
        except Exception as e:
            # Best-effort cleanup of the incomplete side file; the original
            # error is what gets reported.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            return f"Error saving {filename}: {e}"

# Set up the session with retries
session = requests.Session()
# Transient failures (rate limiting and 5xx responses) are retried by the
# adapter itself with an increasing delay between tries.
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
# The session is shared by up to 100 worker threads; the default pool size of
# 10 would make the pool discard connections instead of reusing them.
adapter = HTTPAdapter(max_retries=retry, pool_maxsize=100)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Download the files initially
def download_files(url_to_filename, max_workers=100):
    """Download every URL in the mapping concurrently and print each outcome.

    Args:
        url_to_filename: dict mapping download URL -> local filename.
        max_workers: Size of the thread pool. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        None. Each task's status message (or error) is printed as it finishes.
        Uses the module-level ``session`` and ``download_file``.
    """
    if not url_to_filename:
        return  # nothing to download, so skip creating the pool

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(download_file, session, url, filename): url
            for url, filename in url_to_filename.items()
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                print(future.result())
            except Exception as e:
                print(f"Error downloading {url}: {e}")

# Run the first download pass over every URL/filename pair in the mapping
download_files(url_to_filename)

# Check for missing files and redownload if necessary
def check_and_redownload(url_to_filename):
    """Retry downloads whose target file is absent, then report what is left.

    Args:
        url_to_filename: dict mapping download URL -> expected local filename.

    Returns:
        None. Prints either the filenames that are still missing after one
        retry pass or a success message. Reads the module-level
        ``download_path`` and calls ``download_files`` for the retry.
    """
    def _missing_filenames():
        # A set makes each membership test O(1); comparing against the raw
        # directory listing is quadratic for thousands of files.
        present = set(os.listdir(download_path))
        return [name for name in url_to_filename.values() if name not in present]

    missing_files = _missing_filenames()

    # If there are missing files, select their URLs and redownload them
    if missing_files:
        print("Retrying missing files...")
        wanted = set(missing_files)
        missing_url_to_filename = {
            url: filename
            for url, filename in url_to_filename.items()
            if filename in wanted
        }
        download_files(missing_url_to_filename)

    # Final check to ensure all files are downloaded
    still_missing = _missing_filenames()

    if still_missing:
        print("Some files are still missing:")
        for filename in still_missing:
            print(filename)
    else:
        print("All files downloaded successfully.")

# Verify the download folder against the mapping and retry anything absent
check_and_redownload(url_to_filename)


20220103-0100.nc downloaded successfully.
20220103-0000.nc downloaded successfully.
20220103-0130.nc downloaded successfully.
20220103-0030.nc downloaded successfully.
20220102-2300.nc downloaded successfully.
20220102-0400.nc downloaded successfully.
20220102-1800.nc downloaded successfully.
20220102-2330.nc downloaded successfully.
20220102-2100.nc downloaded successfully.
20220102-2130.nc downloaded successfully.
20220103-0330.nc downloaded successfully.
20220103-0400.nc downloaded successfully.
20220103-0300.nc downloaded successfully.
20220103-0230.nc downloaded successfully.
20220103-0200.nc downloaded successfully.
20220103-0430.nc downloaded successfully.
20220103-0500.nc downloaded successfully.
20220103-0530.nc downloaded successfully.
20220103-0600.nc downloaded successfully.
20220103-0630.nc downloaded successfully.
20220103-0700.nc downloaded successfully.
20220103-0730.nc downloaded successfully.
20220103-0930.nc downloaded successfully.
20220103-1000.nc downloaded succes

Merge files within a specific time period

In [2]:
import os
import pandas as pd
import xarray as xr

def find_files_in_range(folder_path, start_date, end_date):
    """List the ``.nc`` files in a folder whose name-encoded time is in range.

    File names are expected to look like ``YYYYMMDD-HHMM.nc``. The full
    date *and* time are compared, so bounds that are not at midnight select
    the right files. Names whose stem is not in that exact form fall back to
    a date-only parse of the text before the first ``-``.

    Args:
        folder_path: Directory to scan (not recursive).
        start_date: Inclusive lower bound (pandas Timestamp).
        end_date: Inclusive upper bound (pandas Timestamp).

    Returns:
        Sorted list of full paths of the matching files. Files whose name
        cannot be parsed are reported with a message and skipped.
    """
    files_in_range = []

    for file in os.listdir(folder_path):
        if not file.endswith(".nc"):
            continue
        stem = os.path.splitext(file)[0]
        try:
            try:
                # Preferred: date and time, e.g. "20220101-0130"
                file_date = pd.to_datetime(stem, format='%Y%m%d-%H%M')
            except ValueError:
                # Fallback: date only, taken from the part before the first dash
                file_date = pd.to_datetime(stem.split("-")[0], format='%Y%m%d')
        except ValueError:
            print(f"Skipping file with invalid date format: {file}")
            continue
        if start_date <= file_date <= end_date:
            files_in_range.append(os.path.join(folder_path, file))

    print(f"Found {len(files_in_range)} files in the specified date range.")
    return sorted(files_in_range)

def combine_nc_files(files):
    """Open the given NetCDF files and concatenate them along ``time``.

    Args:
        files: Iterable of file paths. Files that fail to open are reported
            with a message and skipped.

    Returns:
        The concatenated ``xarray.Dataset``, loaded into memory.

    Raises:
        ValueError: If none of the files could be opened.
    """
    datasets = []
    try:
        for file in files:
            try:
                print(f"Attempting to open file: {file}")
                datasets.append(xr.open_dataset(file, engine='netcdf4'))
            except Exception as e:
                print(f"Error opening {file}: {e}")
        if not datasets:
            raise ValueError("No valid datasets to combine.")
        combined_dataset = xr.concat(datasets, dim='time')
        # Materialize any variable that is still lazy so the result does not
        # depend on the source files once they are closed below.
        combined_dataset.load()
        return combined_dataset
    finally:
        # Release the per-file handles whether or not combining succeeded.
        for ds in datasets:
            ds.close()

def main(
    folder_path=r'c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files',
    start_date='2022-01-01 00:00',
    end_date='2022-12-31 23:59',
    output_file=r'C:\Users\userAdmin\Desktop\Pipeline\Data files\IMERG_2022.nc',
):
    """Merge the ``.nc`` files of one time window into a single NetCDF file.

    All arguments default to the values that were previously hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        folder_path: Directory holding the per-timestep ``.nc`` files.
        start_date: Inclusive start of the window (anything
            ``pd.to_datetime`` accepts).
        end_date: Inclusive end of the window (same accepted forms).
        output_file: Path of the combined NetCDF file to write.

    Returns:
        None. Progress and errors are printed.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    files_in_range = find_files_in_range(folder_path, start_date, end_date)
    if not files_in_range:
        print("No files found in the specified date range.")
        return

    try:
        combined_dataset = combine_nc_files(files_in_range)
        combined_dataset.to_netcdf(output_file)
        print(f"Combined dataset saved to {output_file}")
    except ValueError as e:
        # Raised by combine_nc_files when no file could be opened
        print(e)

# Run the merge only when this code executes as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()



Found 17520 files in the specified date range.
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0000.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0030.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0100.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0130.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0200.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0230.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0300.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data files\20220101-0330.nc
Attempting to open file: c:\Users\userAdmin\Desktop\Rain data files\IMERG 10yrs\data file