# Download Subdirectories From FTP II

#### Updated: Dec 8, 2022

#  

Run this notebook to download subdirectories continuously while previously downloaded subdirectories are being processed.

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
import ftplib
from random import randrange

Set parameters:

In [2]:
# Run configuration
process_on_VDI = True  # selects which log directory is used
user = 'Rav'  # written to the 'User' column of download_log.csv

# Loop pacing
stop_loop_threshold = 2640  # counter value at which the download loop stops
wait = 5  # minutes to sleep while '02_downloaded' is not empty

Set directories:

In [3]:
# Local working root (files on C:/ are not persistent on VDI)
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/'

# Staging folders: files land in '01_downloading/' and are handed over via '02_downloaded/'
downloadingDir = rootDir_local + '01_downloading/'
downloadedDir = rootDir_local + '02_downloaded/'

# Log location depends on where processing runs
logDir = (
    '//scientific/L-MP-Data/Massive files/Python/rnaidoo/Alouette_I/'  # DO NOT CHANGE
    if process_on_VDI
    else rootDir_local + '05_result/'
)

#  

#### Functions:

In [4]:
def move_images(old_dir, new_dir, roll, subdir, copy_to_other_drive=False):
    """Transfer every file of one roll/subdirectory from old_dir to new_dir.

    Paths are built by string concatenation, so ``old_dir`` and ``new_dir``
    must end with a path separator.

    Parameters:
        old_dir: root folder that currently holds ``roll/subdir/``.
        new_dir: root folder that receives ``roll/subdir/``.
        roll: name of the roll folder.
        subdir: name of the subdirectory folder inside the roll.
        copy_to_other_drive: if True, files are copied with
            ``shutil.copyfile``; otherwise they are moved with ``os.replace``.

    After the transfer the source ``roll/subdir/`` folder is deleted, and the
    source roll folder is deleted as well if it is left empty.
    """
    oldDir = old_dir + roll + '/' + subdir + '/'
    newDir = new_dir + roll + '/' + subdir + '/'
    os.makedirs(newDir, exist_ok=True)

    for file in os.listdir(oldDir):
        if copy_to_other_drive:
            shutil.copyfile(oldDir + file, newDir + file)
        else:
            # os.replace overwrites an existing destination on every platform,
            # whereas os.rename raises on Windows if the file is already there
            # (e.g. left over from an interrupted earlier move).
            os.replace(oldDir + file, newDir + file)

    # Remove the source subdirectory, then the roll folder if nothing else is in it
    shutil.rmtree(oldDir)
    rollDir = old_dir + roll + '/'
    if not os.listdir(rollDir):
        os.rmdir(rollDir)

In [5]:
def draw_random_subdir(roll, subdir_list, logDir):
    """Pick a random subdirectory of a roll and check it against the download log.

    Parameters:
        roll: roll name to look up in the log's 'Roll' column.
        subdir_list: non-empty sequence of candidate subdirectory names.
        logDir: folder (ending with a path separator) that may contain
            'download_log.csv'.

    Returns:
        The drawn subdirectory name, or '' if the log already has a row for
        this roll/subdirectory pair.
    """
    subdirectory = subdir_list[randrange(len(subdir_list))]

    log_path = logDir + 'download_log.csv'
    if not os.path.exists(log_path):
        return subdirectory

    #Check randomly-selected roll and subdirectory against the 'download_log'
    # Read the key columns as text: with dtype inference a column holding only
    # numeric-looking names (e.g. '352') becomes integer and never equals the
    # string being searched for.
    df_log = pd.read_csv(log_path, dtype={'Roll': str, 'Subdirectory': str})
    already_logged = ((df_log['Roll'] == roll) & (df_log['Subdirectory'] == subdirectory)).any()
    if already_logged:
        print(roll + '/' + subdirectory + ' already downloaded!')
        return ''
    return subdirectory

In [6]:
def draw_random_subdir2(subdir_ids_list, logDir):
    """Pick a random '<roll>_<subdirectory>' id and check it against the download log.

    Parameters:
        subdir_ids_list: non-empty sequence of ids of the form
            '<roll>_<subdirectory>'.
        logDir: folder (ending with a path separator) that may contain
            'download_log.csv'.

    Returns:
        A ``(roll, subdirectory)`` tuple, or '' if the log already has a row
        for the drawn pair.
        NOTE(review): the two return shapes differ, so callers must test the
        result before unpacking it.
    """
    subdir_id = subdir_ids_list[randrange(len(subdir_ids_list))]
    # Split on the first underscore only so a subdirectory name containing '_'
    # is kept whole (assumes roll names contain no underscore - TODO confirm).
    subdir_id_parts = subdir_id.split('_', 1)
    roll = subdir_id_parts[0]
    subdirectory = subdir_id_parts[1]

    log_path = logDir + 'download_log.csv'
    if not os.path.exists(log_path):
        return roll, subdirectory

    #Check randomly-selected roll and subdirectory against the 'download_log'
    # Read the key columns as text: with dtype inference a column holding only
    # numeric-looking names (e.g. '352') becomes integer and never equals the
    # string being searched for.
    df_log = pd.read_csv(log_path, dtype={'Roll': str, 'Subdirectory': str})
    already_logged = ((df_log['Roll'] == roll) & (df_log['Subdirectory'] == subdirectory)).any()
    if already_logged:
        print(roll + '/' + subdirectory + ' already downloaded!')
        return ''
    return roll, subdirectory
    

#  

#### Check if subdirectory needs to be downloaded, then download random subdirectory:

In [None]:
# Loop state: 'stop_condition' ends the loop, the counter caps the number of
# download attempts so the loop cannot run forever.
stop_condition = False
stop_condition_counter = 0
# Subdirectories left after the most recent download. Stays None until a
# download has happened, so a first pass that only waits has nothing undefined
# to look at when the stop conditions are checked.
n_remaining = None

# FTP connection settings
HOSTNAME = "donnees-data.asc-csa.gc.ca"
USERNAME = "Anonymous"
PASSWORD = ""
ftp_rootpath = '/users/OpenData_DonneesOuvertes/pub/AlouetteData/Alouette Data/'

log_path = logDir + 'download_log.csv'

# The hand-off folder is listed on every pass; create it so the first listing cannot fail
os.makedirs(downloadedDir, exist_ok=True)

while not stop_condition:
    #Download a random subdirectory if '02_downloaded' is empty
    if not os.listdir(downloadedDir):
        #Work out which subdirectories are not in the download log yet
        df_inventory = pd.read_csv(logDir + 'image_inventory.csv')
        subdir_ids_tot = df_inventory['subdir_id'].unique()
        if os.path.exists(log_path):
            df_log = pd.read_csv(log_path)
            subdir_ids_dl = df_log['subdir_id'].unique()
        else:
            subdir_ids_dl = []
        subdir_ids_rem = list(set(subdir_ids_tot) - set(subdir_ids_dl))

        #Nothing left to draw from
        if not subdir_ids_rem:
            print('Stop!')
            break

        #Randomly draw roll and subdirectory (using draw_random_subdir2())
        drawn = draw_random_subdir2(subdir_ids_list=subdir_ids_rem, logDir=logDir)

        # draw_random_subdir2() returns '' when the drawn pair is already logged;
        # in that case skip the download and draw again on the next pass.
        if drawn:
            roll, subdirectory = drawn

            #Set directories
            saveDir = downloadingDir + roll + '/' + subdirectory + '/'
            os.makedirs(saveDir, exist_ok=True)

            # Connect to FTP Server (the 'with' block closes the connection afterwards)
            with ftplib.FTP(HOSTNAME, USERNAME, PASSWORD) as ftp:
                print('Connected to ftp server: ' + HOSTNAME)
                ftp.cwd(ftp_rootpath + '/' + roll + '/' + subdirectory + '/')

                start = time.time()
                remote_files = ftp.nlst()  # list once, reuse for the count and the download
                n_dl = len(remote_files)
                print('')
                print('Downloading ' + roll + '/' + subdirectory + '/ subdirectory ('  + str(n_dl) + ' images, ' + str(len(subdir_ids_rem)-1) + ' subdirectories to go)')
                for file in remote_files:
                    with open(saveDir + file, "wb") as local_file:
                        ftp.retrbinary("RETR " + file, local_file.write)
                end = time.time()
            t = end - start
            print('Download time for subdirectory: ' + str(round(t/60, 1)) + ' min')
            print('')

            #Record subdirectory name in download_log
            df_result = pd.DataFrame({
                'Roll': roll,
                'Subdirectory': subdirectory,
                'Images_downloaded': n_dl,
                'Download_time': t,
                'Download_timestamp': datetime.fromtimestamp(end),
                'User': user,
                'subdir_id': roll + '_' + subdirectory
            }, index=[0])
            if os.path.exists(log_path):
                df_log = pd.read_csv(log_path)
                df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
                df_update.to_csv(log_path, index=False)
            else:
                df_result.to_csv(log_path, index=False)

            #Backup 'download_log' (10% of the time)
            if randrange(10) == 7:
                df_log = pd.read_csv(log_path)
                datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")
                os.makedirs(logDir + 'backups/', exist_ok=True)
                df_log.to_csv(logDir + 'backups/' + 'download_log-' + datetime_str + '.csv', index=False)

            #Move fully downloaded subdirectory to '02_downloaded' folder
            move_images(old_dir=downloadingDir, new_dir=downloadedDir, roll=roll, subdir=subdirectory)

            n_remaining = len(subdir_ids_rem) - 1

        # Count every attempt (including skipped draws) so the loop stays bounded
        stop_condition_counter += 1

    else:
        #Wait
        print('Wait ' + str(wait) + ' min')
        time.sleep(wait*60)


    #Check stop conditions
    if n_remaining == 0:
        print('Stop!')
        stop_condition = True
    if stop_condition_counter >= stop_loop_threshold:
        print('Stop!')
        stop_condition = True
        

Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207975/1130-B/ subdirectory (391 images, 2638 subdirectories to go)
Download time for subdirectory: 1.7 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207949/2119-5B/ subdirectory (317 images, 2637 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207978F/352/ subdirectory (424 images, 2636 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207821/3319-15A/ subdirectory (302 images, 2635 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207946/1839-1B/ subdirectory (333 images, 2634 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp ser

Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207821/3360-A38/ subdirectory (285 images, 2595 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207821/3365-38A/ subdirectory (317 images, 2594 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207975/1087-B/ subdirectory (358 images, 2593 subdirectories to go)
Download time for subdirectory: 1.6 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207946/1810-5B/ subdirectory (341 images, 2592 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207948/1687-5A/ subdirectory (358 images, 2591 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 m

Download time for subdirectory: 1.6 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207966/1185-1B/ subdirectory (388 images, 2552 subdirectories to go)
Download time for subdirectory: 1.7 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207960/2536-8A/ subdirectory (266 images, 2551 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207907F/493/ subdirectory (432 images, 2550 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207938/932-A/ subdirectory (375 images, 2549 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207929F/486/ subdirectory (426 images, 2548 sub

Download time for subdirectory: 1.1 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207975/1122-12B/ subdirectory (384 images, 2510 subdirectories to go)
Download time for subdirectory: 1.6 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207832/3623-43A/ subdirectory (316 images, 2509 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207938/933-B/ subdirectory (733 images, 2508 subdirectories to go)
Download time for subdirectory: 2.8 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207955/2848-50B/ subdirectory (329 images, 2507 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207967/1435-13B/ subdirectory (338 images, 2506 subd

Download time for subdirectory: 1.7 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207841/3092-13B/ subdirectory (340 images, 2468 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207957/2639-1A/ subdirectory (306 images, 2467 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207816/3409-43A/ subdirectory (312 images, 2466 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207938/927-A/ subdirectory (394 images, 2465 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207815/3523-50A/ subdirectory (298 images, 2464 subdirectories to go)
Download time for subdirectory: 1.2 mi

Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207948/1724-1A/ subdirectory (293 images, 2426 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207930F/684/ subdirectory (410 images, 2425 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207939/869-B/ subdirectory (289 images, 2424 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207964/1578-1B/ subdirectory (334 images, 2423 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207958/2819-7A/ subdirectory (357 images, 2422 subdirectories to go)
Download time for subdirectory

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207978F/341/ subdirectory (160 images, 2384 subdirectories to go)
Download time for subdirectory: 0.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207970/1337-5A/ subdirectory (346 images, 2383 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207908F/601/ subdirectory (361 images, 2382 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207954/2214-1A/ subdirectory (329 images, 2381 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207938/964-A/ subdirectory (297 images, 2380 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: 

Download time for subdirectory: 1.0 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207949/2165-3A/ subdirectory (329 images, 2341 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207948/1708-5A/ subdirectory (329 images, 2340 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207938/953-B/ subdirectory (377 images, 2339 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207955/2883-43B/ subdirectory (291 images, 2338 subdirectories to go)
Download time for subdirectory: 1.0 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207816/3392-13A/ subdirectory (325 images, 2337 subdirectories to go)
Downlo

Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207967/1420-8A/ subdirectory (326 images, 2299 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207909F/706/ subdirectory (403 images, 2298 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207947/1876-9B/ subdirectory (323 images, 2297 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207951/2410-4B/ subdirectory (349 images, 2296 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207824/3140-14A/ subdirectory (251 images, 2295 subdirectories to go)
Download time for subdirectory

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207943/2069-3A/ subdirectory (365 images, 2257 subdirectories to go)
Download time for subdirectory: 1.5 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207951/2427-12A/ subdirectory (334 images, 2256 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207844/2936-43B/ subdirectory (282 images, 2255 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207943/2049-1B/ subdirectory (148 images, 2254 subdirectories to go)
Download time for subdirectory: 0.6 min

Wait 5 min
Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207980/1003-A/ subdirectory (406 images, 2253 subdirectories to go)
Download time for subdirectory: 1.6

Wait 5 min
Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207942/1985-1A/ subdirectory (305 images, 2215 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207821/3346-38A/ subdirectory (280 images, 2214 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207978F/343/ subdirectory (430 images, 2213 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207942/1968-5B/ subdirectory (285 images, 2212 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207948/1743-9/ subdirectory (304 images, 2211 subdirectories to go)
Download time for subdirectory: 1.3 min

Wait 5 min
Connected to ftp ser

Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207842/3298-50A/ subdirectory (283 images, 2173 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207975/1101-B/ subdirectory (344 images, 2172 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207907F/516/ subdirectory (427 images, 2171 subdirectories to go)
Download time for subdirectory: 1.2 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207954/2178-1A/ subdirectory (349 images, 2170 subdirectories to go)
Download time for subdirectory: 1.4 min

Wait 5 min
Connected to ftp server: donnees-data.asc-csa.gc.ca

Downloading R014207909F/715/ subdirectory (379 images, 2169 subdirectories to go)
Download time for subdirectory: 1.1 min

Wait 5 min
Wait 5 min
Connected to ftp server: donnees-d