# Overview

1. Before you download data from CoRR, make sure that you have a NITRC account (https://www.nitrc.org) and that you are registered with the 1000 Functional Connectomes Project (https://www.nitrc.org/projects/fcon_1000/). 

2. This notebook should work regardless of whether you are downloading the data to your local machine or to a server.

3. Run through the notebook. You will need to input your NITRC username, password and the directory you want to download the CoRR data into at the requested locations.

4. The `wget` step does not always work. It will work the first time you run it, but if you interrupt the kernel while it is running and then try to run it again, it will fail to connect to the server. I am still trying to figure out a way around this issue.

# Setup

In [None]:
# import packages
import os
from bs4 import BeautifulSoup
import requests 
from lxml import html 
import pandas as pd
from requests import Session
import urllib3
import urllib.request
from pathlib import Path

In [None]:
# constants
# scheme prefix for the plain-http pages, and the NITRC site root that relative
# download hrefs are appended to
web_start, nitrc_start = "http://", "https://www.nitrc.org"
# scheme-less address of the CoRR samples page
url = "fcon_1000.projects.nitrc.org/indi/CoRR/html/samples.html"

# Data URLs

In [None]:
# Accumulators filled in by the cells that follow:
#   data_links     - hrefs of the individual dataset pages making up CoRR
#   download_links - download links collected from every page in data_links
data_links, download_links = [], []

In [None]:
user_name = ''  # insert your NITRC user name here
password = ''  # insert your NITRC password here

# form fields submitted to the NITRC login page
auth = {"form_pw": password, "form_loginname": user_name}

# one session for the whole notebook: it keeps the cookies set at login and
# sends them with every later request made through `s`
s = Session()

# log in first so the session holds the login cookie
login_response = s.post("https://www.nitrc.org/account/login.php", data=auth)
# stop here on an HTTP error (4xx/5xx) instead of silently carrying on with a
# session that never logged in
# NOTE(review): a rejected user name/password may still come back as HTTP 200 -
# confirm how NITRC reports a failed login before relying on this check alone
login_response.raise_for_status()
# keep the response as the last expression so the cell still displays it
login_response

In [None]:
# fetch the CoRR samples page and parse it
r = requests.get(web_start + url)
soup = BeautifulSoup(r.text, 'lxml')

# hrefs that share the dataset links' class but are not collected
non_dataset_hrefs = ('qc.html', 'download.html', 'data_citation.html', '#datasets')

# collect the href of every anchor whose class is exactly
# ['reference', 'internal'], leaving out the hrefs listed above
data_links.extend(
    anchor.get('href')
    for anchor in soup.find_all('a')
    if anchor.get('class') == ['reference', 'internal']
    and anchor.get('href') not in non_dataset_hrefs
)

# Data

In [None]:
# directory part of the samples-page address; the dataset hrefs collected in
# data_links are joined onto this prefix when the dataset pages are requested
samples_dir = Path(url).parent
url_prefix = samples_dir.as_posix()

In [None]:
# go through each link in data_links and extract the links to their respective data downloads
for link in data_links:
    # request the dataset page through the logged-in session
    dataset_page = s.get(web_start + url_prefix + '/' + link)
    soup = BeautifulSoup(dataset_page.text, 'lxml')

    # check the links of each dataset website
    for anchor in soup.find_all('a'):
        # an anchor without an href attribute yields None; fall back to '' so the
        # substring tests below cannot raise a TypeError
        href = anchor.get('href') or ''

        # only hrefs containing both 'php' and 'downloadlink' are followed
        if 'php' not in href or 'downloadlink' not in href:
            continue

        # hit the 'I agree' button on the NITRC website
        agreement_page = s.post(href + '/?i_agree=1', auth)
        temp_soup = BeautifulSoup(agreement_page.content, 'lxml')

        # get the download link from the redirected website
        for d_link in temp_soup.find_all('a'):
            d_href = d_link.get('href') or ''
            if '/frs/downloadlink' in d_href:
                # the matched href is prefixed with the NITRC site root
                download_url = nitrc_start + d_href
                print('Adding:', link, '\n', download_url)
                download_links.append(download_url)

In [None]:
# requests has no connection adapter for ftp addresses, but the http links in
# download_links redirect to an ftp server. Requesting each link therefore
# raises an InvalidSchema error whose message carries the ftp link, so the
# messages are collected here and parsed in a later cell.
error_files = []
for download_url in download_links:
    try:
        s.get(download_url)
    except requests.exceptions.RequestException as exc:
        # any RequestException is caught, so errors other than InvalidSchema
        # may end up in error_files as well
        message = str(exc)
        print("Error:", message)
        error_files.append(message)

In [None]:
# text that precedes the quoted link in the error messages we want to parse
adapter_marker = "No connection adapters were found for "

# create a dataframe with the error messages
df_files = pd.DataFrame(error_files, columns=['Error_Message'])

# extract the quoted ftp link from each error message. A message that does not
# contain the marker, or that contains 'Exceeded', gets the placeholder 'None';
# checking for the marker first keeps the split from raising IndexError on
# other kinds of request errors
df_files = df_files.assign(ftp_files=[
    msg.split(adapter_marker)[1]
    if adapter_marker in msg and 'Exceeded' not in msg
    else 'None'
    for msg in map(str, error_files)
])

# remove the quotes from the ftp link by dropping its first and last character;
# this turns the 'None' placeholder into 'on'
df_files = df_files.assign(ftp_paths=[quoted[1:-1] for quoted in df_files.ftp_files])

# drop the placeholder rows, i.e. every message no ftp link was extracted from
df_files = df_files[df_files.ftp_paths != 'on']

In [None]:
# preview the first five rows to confirm the extracted ftp paths look right
df_files.head(n=5)

In [None]:
file_directory = '' # insert directory where you want to store your CoRR files

# go through each ftp file in the dataframe and download it
for file in df_files.ftp_paths.values:
    if file != 'on':
        %cd {file_directory}
        !wget -t inf -T 600 -c -v --passive-ftp --progress=bar -i {file}