# Download Subdirectories From FTP

#### Updated: Dec 3, 2022

#  

Determine how to download subdirectories from FTP. Record download in a 'download_log'. Move downloaded subdirectories from '01_intake' folder to '02_processing' folder. 

In [1]:
import pandas as pd
import os
import time
from datetime import datetime
import ftplib

In [2]:
# Local folder layout for the project. Every path ends with '/' so that
# names can be appended directly.
rootDir = 'C:/Users/rnaidoo/Documents/Projects_data/Alouette_I/'
intakeDir = f'{rootDir}01_intake/'          # freshly downloaded files
processingDir = f'{rootDir}02_processing/'  # files moved here once a download completes
resultDir = f'{rootDir}04_result/'          # download_log.csv is written here

In [5]:
# Connection settings for the FTP server (empty password).
HOSTNAME = "donnees-data.asc-csa.gc.ca"
USERNAME = "Anonymous"
PASSWORD = ""

# Remote directory under which the roll folders are listed.
ftp_rootpath = '/users/OpenData_DonneesOuvertes/pub/AlouetteData/Alouette Data/'

# Open the session; passing host and user makes the constructor connect and log in.
ftp = ftplib.FTP(host=HOSTNAME, user=USERNAME, passwd=PASSWORD)
print(f'Connected to ftp server: {HOSTNAME}')

Connected to ftp server: donnees-data.asc-csa.gc.ca


#  

#### Download a subdirectory by FTP:

In [None]:
# Roll and subdirectory to download in the cells below.
roll, subdirectory = 'R014207815', '3488-15A'

In [None]:
# Build the local and remote locations for the selected roll/subdirectory.
# intakeDir and ftp_rootpath already end with '/', so any trailing slash is
# stripped before joining; otherwise the paths contain a doubled separator.
saveDir = intakeDir.rstrip('/') + '/' + roll + '/' + subdirectory + '/'
os.makedirs(saveDir, exist_ok=True)  # create the local target folder if missing
# Make the remote subdirectory the session's working directory so that
# later listings and RETR commands can use bare file names.
ftp.cwd(ftp_rootpath.rstrip('/') + '/' + roll + '/' + subdirectory + '/')

In [None]:
# Download every entry of the current FTP working directory into saveDir
# and report how long the whole transfer took.
start = time.time()
# List the remote directory once: the reported count and the files actually
# fetched then come from the same listing, and one round trip is saved.
remote_files = ftp.nlst()
n_dl = len(remote_files)
print('Downloading ' + str(n_dl) + ' images...')
for file in remote_files:
    # The context manager closes the local file even if the transfer raises.
    with open(saveDir + file, "wb") as local_file:
        ftp.retrbinary("RETR " + file, local_file.write)
end = time.time()
t = end - start  # elapsed wall-clock time in seconds
print('Download time for subdirectory: ' + str(round(t/60, 1)) + ' min')

Record subdirectory name in download_log:

In [None]:
# One-row summary of the download that just finished; index=[0] is required
# because every column value is a scalar.
df_result = pd.DataFrame(
    dict(
        Roll=roll,
        Subdirectory=subdirectory,
        Images_downloaded=n_dl,
        Download_Time=t,  # seconds (difference of two time.time() readings)
        Download_timestamp=datetime.fromtimestamp(end),  # local time the download ended
    ),
    index=[0],
)
df_result

In [None]:
# Append the summary row in df_result to the running CSV download log.
log_path = resultDir + 'download_log.csv'
# to_csv does not create missing folders, so make sure the result folder
# exists (the intake and processing folders are created the same way).
os.makedirs(resultDir, exist_ok=True)
if os.path.exists(log_path):
    # Existing log: read it back and add the new row at the end.
    df_log = pd.read_csv(log_path)
    df_update = pd.concat([df_log, df_result], axis=0, ignore_index=True)
else:
    # First download: the log consists of just this row.
    df_update = df_result
df_update.to_csv(log_path, index=False)

Move fully downloaded subdirectory to 02_processing folder:

In [None]:
# Move every file of the downloaded subdirectory from saveDir into the
# matching roll/subdirectory folder under processingDir.
# The destination does not depend on the file, so it is built and created
# once before the loop; the trailing '/' of processingDir is stripped first
# so the joined path has no doubled separator.
newDir = processingDir.rstrip('/') + '/' + roll + '/' + subdirectory + '/'
os.makedirs(newDir, exist_ok=True)
for file in os.listdir(saveDir):
    # NOTE(review): on Windows os.rename raises if the destination file
    # already exists, so re-running this cell after a partial move will stop
    # at the first duplicate.
    os.rename(saveDir + file, newDir + file)

#  

#### Determine how many subdirectories there are:

Estimate how long it would take to download all subdirectories, and estimate their total size.

In [11]:
# Return to the dataset root on the FTP server, then list the names found
# there (presumably one entry per roll).
ftp.cwd(ftp_rootpath)
rolls_list = ftp.nlst()

In [8]:
# Count the entries listed under each roll and accumulate the grand total
# in n_subdir.
n_subdir = 0
# The loop variable is deliberately not called `roll`: that name holds the
# roll selected for download earlier in the notebook, and reusing it here
# would silently overwrite that selection.
for roll_name in rolls_list:
    ftp.cwd(roll_name)  # relative to ftp_rootpath, the current directory
    try:
        subdir_list = ftp.nlst()
    finally:
        # Always return to the root so the next relative cwd still resolves,
        # even if the listing failed.
        ftp.cwd(ftp_rootpath)
    n_subdir_ = len(subdir_list)
    print('Roll ' + roll_name + ' has ' + str(n_subdir_) + ' subdirectories.')
    n_subdir += n_subdir_

Roll R014207815 has 60 subdirectories.
Roll R014207816 has 60 subdirectories.
Roll R014207821 has 60 subdirectories.
Roll R014207823 has 60 subdirectories.
Roll R014207824 has 60 subdirectories.
Roll R014207832 has 60 subdirectories.
Roll R014207840 has 60 subdirectories.
Roll R014207841 has 60 subdirectories.
Roll R014207842 has 59 subdirectories.
Roll R014207844 has 60 subdirectories.
Roll R014207907F has 61 subdirectories.
Roll R014207908F has 60 subdirectories.
Roll R014207909F has 55 subdirectories.
Roll R014207929F has 57 subdirectories.
Roll R014207930F has 73 subdirectories.
Roll R014207938 has 59 subdirectories.
Roll R014207939 has 59 subdirectories.
Roll R014207940F has 58 subdirectories.
Roll R014207942 has 59 subdirectories.
Roll R014207943 has 60 subdirectories.
Roll R014207946 has 60 subdirectories.
Roll R014207947 has 60 subdirectories.
Roll R014207948 has 60 subdirectories.
Roll R014207949 has 60 subdirectories.
Roll R014207951 has 61 subdirectories.
Roll R014207953 has

In [10]:
# Report the grand total accumulated over all rolls.
print(f'There are {n_subdir} subdirectories.')

There are 2638 subdirectories.


If one subdirectory takes ~ 23 min to download, it would take ~ 42 days to download all images...

If one subdirectory is ~ 336 MB, the total size of all of the images is ~ 889 GB (0.9 TB)...