# Download Subdirectories From FTP

#### Updated: Dec 6, 2022

#  

Determine how to download subdirectories from FTP. Record download in a 'download_log'. Move downloaded subdirectories from '01_downloading' folder to '02_downloaded' folder. 

In [1]:
import pandas as pd
import os
import shutil
import time
from datetime import datetime
import ftplib
from random import randrange

In [2]:
# Switch that selects where the download log is kept (see logDir below).
process_on_VDI = True

# Local working folders (files on C:/ are not persistent on VDI).
rootDir_local = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/'
downloadingDir = f'{rootDir_local}01_downloading/'
downloadedDir = f'{rootDir_local}02_downloaded/'

# Log folder: the '//scientific/...' path when processing on VDI (DO NOT CHANGE
# that path), otherwise the local '05_result/' folder.
logDir = (
    '//scientific/L-MP-Data/Massive files/Python/rnaidoo/Alouette_I/'
    if process_on_VDI
    else f'{rootDir_local}05_result/'
)

In [3]:
# Host and login used for the FTP session.
HOSTNAME = "donnees-data.asc-csa.gc.ca"
USERNAME = "Anonymous"
PASSWORD = ""

# Remote folder under which the roll folders are listed.
ftp_rootpath = '/users/OpenData_DonneesOuvertes/pub/AlouetteData/Alouette Data/'

# Connect FTP Server (supplying user/passwd makes the constructor log in too)
ftp = ftplib.FTP(host=HOSTNAME, user=USERNAME, passwd=PASSWORD)
print(f'Connected to ftp server: {HOSTNAME}')

Connected to ftp server: donnees-data.asc-csa.gc.ca


#  

#### Functions:

In [4]:
def move_images(old_dir, new_dir, roll, subdir):
    """Move one downloaded subdirectory from the staging tree to the final tree.

    Every entry of ``old_dir/roll/subdir/`` is moved into
    ``new_dir/roll/subdir/`` (created if missing). Afterwards the emptied
    source subdirectory is removed, and the source roll folder is removed
    only if nothing else is left inside it.

    Parameters:
        old_dir: source root path, expected to end with '/'.
        new_dir: destination root path, expected to end with '/'.
        roll: name of the roll folder.
        subdir: name of the subdirectory inside the roll folder.
    """
    src_roll = old_dir + roll + '/'
    src = src_roll + subdir + '/'
    dst = new_dir + roll + '/' + subdir + '/'
    os.makedirs(dst, exist_ok=True)

    # shutil.move falls back to copy + delete when source and destination are
    # on different drives/filesystems, where a plain os.rename would fail.
    for name in os.listdir(src):
        shutil.move(src + name, dst + name)

    # Remove only the subdirectory that was just moved ...
    shutil.rmtree(src)

    # ... and drop the roll folder only when it is empty, so sibling
    # subdirectories of the same roll that are still being downloaded survive.
    # os.rmdir refuses to delete a non-empty directory, which is exactly the
    # case we want to leave untouched.
    try:
        os.rmdir(src_roll)
    except OSError:
        pass

In [5]:
def draw_random_subdir(roll, subdir_list, logDir):
    """Randomly pick a subdirectory of ``roll`` that is not in the download log.

    The log is ``logDir + 'download_log.csv'``. Subdirectories already logged
    for this roll are excluded before drawing, so a single call succeeds
    whenever at least one subdirectory is left.

    Parameters:
        roll: name of the roll the subdirectories belong to.
        subdir_list: candidate subdirectory names for that roll.
        logDir: folder (ending with '/') that holds 'download_log.csv'.

    Returns:
        The chosen subdirectory name, or '' when every candidate is already
        recorded in the log for this roll.

    Raises:
        ValueError: if ``subdir_list`` is empty.
    """
    if len(subdir_list) == 0:
        raise ValueError('subdir_list is empty for roll ' + str(roll))

    candidates = list(subdir_list)

    # Check the candidates against the 'download_log'
    log_path = logDir + 'download_log.csv'
    if os.path.exists(log_path):
        # Read the key columns as text: with dtype inference a name such as
        # '0123' would come back as the integer 123 and never match.
        df_log = pd.read_csv(log_path, dtype={'Roll': str, 'Subdirectory': str})
        done = set(df_log.loc[df_log['Roll'] == roll, 'Subdirectory'])
        candidates = [name for name in candidates if name not in done]

    if not candidates:
        print(roll + ': all subdirectories already downloaded!')
        return ''

    return candidates[randrange(len(candidates))]

#  

#### Download a random subdirectory by FTP:

In [6]:
#List the rolls available on the server
ftp.cwd(ftp_rootpath)
roll_list = ftp.nlst()
#roll_list = ['R014207844']

#Upper bound on the number of draws, so that the loop cannot spin forever
#once every subdirectory of the drawn roll(s) is already in the 'download_log'
max_draw_attempts = 1000

#Randomly draw roll and subdirectory; a new roll is drawn on every attempt
subdirectory = ''
for attempt in range(max_draw_attempts):
    roll = roll_list[randrange(len(roll_list))]

    ftp.cwd(ftp_rootpath)
    ftp.cwd(roll)
    subdir_list = ftp.nlst()
    #subdir_list = ['2920-43B', 'Other']

    #A roll without subdirectories has nothing to draw from
    if not subdir_list:
        continue

    subdirectory = draw_random_subdir(roll=roll, subdir_list=subdir_list, logDir=logDir)
    if subdirectory != '':
        break
else:
    raise RuntimeError('No subdirectory left to download after '
                       + str(max_draw_attempts) + ' random draws')

#roll = 'R014207815'
#subdirectory = '3488-15A'

In [7]:
# Local folder that receives the files of the drawn roll/subdirectory.
saveDir = f'{downloadingDir}{roll}/{subdirectory}/'
os.makedirs(saveDir, exist_ok=True)

# Point the FTP session at the matching remote subdirectory.
remote_dir = f'{ftp_rootpath}/{roll}/{subdirectory}/'
ftp.cwd(remote_dir)

'250 Directory successfully changed.'

In [8]:
# Download every file of the current remote directory into saveDir and time it.
# 'n_dl', 't' and 'end' are reused further down when the log entry is built.
start = time.time()

# List the directory once: the same listing is used for the count and for the
# loop, so the logged number always matches what was actually requested.
file_list = ftp.nlst()
n_dl = len(file_list)
print('Downloading ' + roll + '/' + subdirectory + '/ subdirectory ('  + str(n_dl) + ' images)')
for file in file_list:
    # 'with' closes the local file even if the transfer raises part-way.
    with open(saveDir + file, "wb") as local_file:
        ftp.retrbinary("RETR " + file, local_file.write)
    #print('Downloaded: ' + file)
end = time.time()
t = end - start
print('Download time for subdirectory: ' + str(round(t/60, 1)) + ' min')

Downloading R014207821/3342-A38/ subdirectory (250 images)
Download time for subdirectory: 1.1 min


Download to C:/ (CSA laptop, on VPN) --> ~ 23 min <br>
Download to C:/ (CSA laptop, off VPN) --> 4.5 min <br>
Download to C:/ (VDI HP) --> 1.1 min <-- downloading locally seems to be faster <br>
Download to U:/ (VDI HP) --> 1.3 min <br>
<br>
If one subdirectory takes ~ 1.1 min to download, it would take ~ 48.4 h to download all images... <br>
If one subdirectory is ~ 336 MB, the total size of all of the images is ~ 889 GB (0.9 TB)...

Record subdirectory name in download_log:

In [9]:
# One-row summary of the download that just finished.
download_record = {
    'Roll': roll,
    'Subdirectory': subdirectory,
    'Images_downloaded': n_dl,
    'Download_time': t,  # elapsed seconds (end - start)
    'Download_timestamp': datetime.fromtimestamp(end),
}
df_result = pd.DataFrame(download_record, index=[0])
df_result

Unnamed: 0,Roll,Subdirectory,Images_downloaded,Download_time,Download_timestamp
0,R014207821,3342-A38,250,63.627059,2022-12-07 14:49:15.418314


In [10]:
# Append the new row to the 'download_log' instead of reading, concatenating
# and rewriting the whole file: existing rows are left byte-for-byte untouched
# (no dtype re-inference on a round trip) and the write touches only one line.
# NOTE: appending relies on the log having the same column order as df_result.
log_path = logDir + 'download_log.csv'

# The header is written only when the log is being started (missing or empty).
write_header = (not os.path.exists(log_path)) or os.path.getsize(log_path) == 0
df_result.to_csv(log_path, mode='a', header=write_header, index=False)

Back up 'download_log' (10% of the time):

In [11]:
# randrange(10) yields 0..9, so this branch runs for one value out of ten.
take_backup = randrange(10) == 7
if take_backup:
    df_log = pd.read_csv(logDir + 'download_log.csv')
    datetime_str = datetime.now().strftime("%Y%m%d_%Hh%M")

    # Time-stamped copy of the log under '<logDir>backups/'.
    backup_dir = logDir + 'backups/'
    os.makedirs(backup_dir, exist_ok=True)
    backup_path = f'{backup_dir}download_log-{datetime_str}.csv'
    df_log.to_csv(backup_path, index=False)

Move fully downloaded subdirectory to 02_downloaded folder:

In [12]:
# Hand the finished roll/subdirectory over from the staging folder to the
# downloaded folder.
move_images(
    roll=roll,
    subdir=subdirectory,
    old_dir=downloadingDir,
    new_dir=downloadedDir,
)