# NOAA Retrieve Files via FTP
#### Description:
This notebook retrieves NOAA GHCN-Daily files through a public (anonymous) FTP connection. Metadata files are saved in the current working directory; station `.dly` files are saved in the subfolders named in the config section.

Source: https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/

#### Files Retrieved:
- ghcnd-stations.txt
- ghcnd-states.txt
- ghcnd-countries.txt
- ghcnd-inventory.txt

#### Created by:
Nate Muth <br>
nmuth87@gmail.com

#### Created on:
7/29/2018

#### Changelog:
7/29/2018 - Initial Create Date<br>
8/5/2018 - Added lists of files to download in each directory in the config section<br>

In [1]:
# CONFIG

# metadata files sit in the root of the daily FTP folder and are
# downloaded before the notebook switches to the all/ directory
metadataFiles = [
    'ghcnd-stations.txt',   # Retrieve stations file
    'ghcnd-states.txt',     # Retrieve states file
    'ghcnd-countries.txt',  # Retrieve countries file
    'ghcnd-inventory.txt',  # Retrieve inventory file
]

# local folder prefix (trailing slash included) for the first set of station files
dailyFiles_folder1 = 'stations_raw_data/'

# station files downloaded into dailyFiles_folder1
dailyFiles_list1 = [
    'USC00250740.dly',  # BELLEVUE, NE, USA
    'USW00027502.dly',  # BARROW POST ROGERS AP, AK, USA
    'AGM00060670.dly',  # TISKA, ALGERIA (Sahara Desert)
    'MX000766800.dly',  # MEXICO CITY, MEXICO
    'JA000047662.dly',  # TOKYO, JAPAN
]

# local folder prefix (trailing slash included) for the second set of station files
dailyFiles_folder2 = 'midwest20/'

# station files downloaded into dailyFiles_folder2
dailyFiles_list2 = [
    'US1ILCK0036.dly',  # Chicago, IL
    'USC00331778.dly',  # Columbus, OH
    'USC00128271.dly',  # Indianapolis, IN
    'USC00475469.dly',  # Milwaukee, WI
    'USW00014822.dly',  # Detroit, MI
    'USW00093821.dly',  # Louisville, KY
    'USC00234379.dly',  # Kansas City, MO
    'USW00014942.dly',  # Omaha, NE
    'USW00014922.dly',  # Minneapolis, MN
    'USW00003974.dly',  # Wichita, KS
    'USW00004853.dly',  # Cleveland, OH
    'USW00093820.dly',  # Lexington, KY
    'USC00237452.dly',  # St. Louis, MO
    'USC00218450.dly',  # St. Paul, MN
    'USW00003871.dly',  # Cincinnati, OH
    'USC00254810.dly',  # Lincoln, NE
    'USC00338351.dly',  # Toledo, OH
    'US1INAL0037.dly',  # Fort Wayne, IN
    'USW00014837.dly',  # Madison, WI
    'USW00014933.dly',  # Des Moines, IA
]

In [2]:
from ftplib import FTP
import pandas as pd
import os

In [3]:
# open the control connection to the NOAA FTP server
ftp = FTP()
ftp.connect('ftp.ncdc.noaa.gov')

# log in without credentials (ftplib falls back to an anonymous login)
ftp.login()

# move into the GHCN daily folder; kept as the last expression so the
# notebook shows the server's reply
ftp.cwd('pub/data/ghcn/daily')

'250 CWD command successful'

In [4]:
def retrieveFile(fileName, folder=''):
    """
    Download one file from the current directory of the FTP connection.

    Uses the notebook-level ``ftp`` object, which must already be connected,
    logged in and positioned (``ftp.cwd``) in the remote directory that
    holds ``fileName``.

    Params
    ------
    fileName : name of file on FTP site to download; also used as the
               local file name
    folder   : optional local path prefix that is prepended to fileName
               as-is, so include the trailing slash (e.g. 'midwest20/').
               The default '' saves into the current working directory.
               A missing destination folder is created.

    Example FTP Config
    ------------------
    ftp = FTP('ftp.ncdc.noaa.gov')
    ftp.login()
    ftp.cwd('pub/data/ghcn/daily')
    """
    localPath = folder + fileName

    # make sure the destination folder exists; otherwise open() below fails
    # with FileNotFoundError the first time a configured subfolder is used
    parentDir = os.path.dirname(localPath)
    if parentDir:
        os.makedirs(parentDir, exist_ok=True)

    # the with-block closes (and flushes) the local file even when the
    # transfer raises, instead of leaving the handle open
    with open(localPath, 'wb') as localFile:
        # actual ftp retrieve, written to disk in 8 KiB blocks
        ftp.retrbinary('RETR {}'.format(fileName), localFile.write, 8 * 1024)
    
# download every metadata file named in the CONFIG cell; no folder argument
# is passed, so retrieveFile falls back to its default folder=''
for f in metadataFiles:
    retrieveFile(f)

### Switch to all/ directory and retrieve dailyFiles lists

In [7]:
# move the FTP session into the all/ directory
ftp.cwd('all/')

# pair each configured local folder with the station files that belong in it
dailyDownloads = [
    (dailyFiles_folder1, dailyFiles_list1),  # list1
    (dailyFiles_folder2, dailyFiles_list2),  # list2
]

# fetch list1 first, then list2, one file at a time
for targetFolder, stationFiles in dailyDownloads:
    for f in stationFiles:
        retrieveFile(f, targetFolder)

### Load the directory's file list into a pandas DataFrame - informational

In [6]:
# write lines of the all/ directory listing to the ftpLines list;
# retrlines calls ftpLines.append once for each line of the LIST response
ftpLines = []
ftp.retrlines('LIST', ftpLines.append)

'226 Transfer complete'

In [7]:
# trim each listing line down to just the file name by keeping its last
# 15 characters (the station files shown below are an 11-character station ID
# followed by '.dly')
# every line is included: looping over range(0, len(ftpLines)-1) would stop one
# short and silently drop the last file in the listing
fileList = [line[-15:] for line in ftpLines]

# then make a DataFrame out of the list
fileList_DF = pd.DataFrame(fileList, columns=['StationFile'])

In [8]:
# preview the first rows of the file list
fileList_DF.head()

Unnamed: 0,StationFile
0,ACW00011604.dly
1,ACW00011647.dly
2,AE000041196.dly
3,AEM00041194.dly
4,AEM00041217.dly


In [9]:
# pull StationID out of StationFile by trimming the file extension
def stationID(row):
    """Return the row's 'StationFile' value with its last four characters
    (the '.dly' extension) removed."""
    stationFile = row['StationFile']
    extensionLength = len('.dly')
    return stationFile[:-extensionLength]

# add a StationID column by calling stationID on every row (axis=1 passes rows)
fileList_DF['StationID'] = fileList_DF.apply(stationID, axis=1)

# then make StationID the index - this will make joining this data to stations.txt data easier
# inplace=True changes fileList_DF itself rather than returning a new DataFrame
fileList_DF.set_index('StationID', inplace=True)

In [10]:
# preview the first rows again, now indexed by StationID
fileList_DF.head()

Unnamed: 0_level_0,StationFile
StationID,Unnamed: 1_level_1
ACW00011604,ACW00011604.dly
ACW00011647,ACW00011647.dly
AE000041196,AE000041196.dly
AEM00041194,AEM00041194.dly
AEM00041217,AEM00041217.dly
