In [4]:
from ftplib import FTP
import pandas as pd
import os

In [11]:
# open a session on the NOAA FTP server (login() is called without
# credentials) and move into the GHCN daily data directory
ftp = FTP(host='ftp.ncdc.noaa.gov')
ftp.login()
ftp.cwd(dirname='pub/data/ghcn/daily')

'250 CWD command successful'

In [12]:
def retrieveFile(fileName):
    """
    Download a file from the configured FTP site into the working directory.

    The transfer uses the module-level ``ftp`` connection, which must already
    be logged in and positioned in the remote directory that holds
    ``fileName``. The local copy is written in binary mode under the same
    name, replacing any existing file of that name.

    Params
    ------
    fileName : name of file on FTP site to download

    Returns
    -------
    None

    Example FTP Config
    ------------------
    ftp = FTP('ftp.ncdc.noaa.gov')
    ftp.login()
    ftp.cwd('pub/data/ghcn/daily')
    """

    # the with-block closes (and therefore flushes) the local file even when
    # the transfer raises, so no handle is leaked and the data is completely
    # on disk before any later cell reads it
    with open(fileName, 'wb') as localFile:
        # actual ftp retrieve, handed to localFile.write in 8 KiB blocks
        ftp.retrbinary('RETR {}'.format(fileName), localFile.write, 8*1024)

In [13]:
# Retrieve the stations, states and countries files, in that order
for metadataFile in ('ghcnd-stations.txt', 'ghcnd-states.txt', 'ghcnd-countries.txt'):
    retrieveFile(metadataFile)

In [15]:
# step into the all/ directory (relative to the current remote directory)
ftp.cwd(dirname='all/')

# collect every line of the directory listing in the ftpLines list
ftpLines = list()
ftp.retrlines('LIST', callback=ftpLines.append)

'226 Transfer complete'

In [19]:
# Trim each listing line down to just the file name. The file names shown in
# this notebook are an 11-character station ID followed by '.dly', so the name
# is taken as the last 15 characters of the line.
# Every line is processed: looping over range(len(ftpLines) - 1) would stop one
# entry short and silently drop the final file in the listing.
fileList = [line[-15:] for line in ftpLines]

# then make a DataFrame out of the list
fileList_DF = pd.DataFrame(fileList, columns=['StationFile'])

In [20]:
# preview the first rows of the station file listing
fileList_DF.head()

Unnamed: 0,StationFile
0,ACW00011604.dly
1,ACW00011647.dly
2,AE000041196.dly
3,AEM00041194.dly
4,AEM00041217.dly


In [28]:
# derive the StationID from StationFile by cutting off the file extension
def stationID(row):
    """Return the row's 'StationFile' value without its last four characters (e.g. '.dly')."""
    station_file = row['StationFile']
    return station_file[:-4]

# build the StationID column by running stationID over each row
fileList_DF['StationID'] = fileList_DF.apply(stationID, axis='columns')

# promote StationID to the index - joining this frame to the stations.txt data is easier that way
fileList_DF.set_index(keys='StationID', inplace=True)

In [29]:
# preview the first rows again, now that StationID has been set as the index
fileList_DF.head()

Unnamed: 0_level_0,StationFile
StationID,Unnamed: 1_level_1
ACW00011604,ACW00011604.dly
ACW00011647,ACW00011647.dly
AE000041196,AE000041196.dly
AEM00041194,AEM00041194.dly
AEM00041217,AEM00041217.dly


In [32]:
# load the fixed-width stations file into a DataFrame (the file has no header
# row, so column names are supplied here) and use StationID as the index
stationsDF = pd.read_fwf(
    'ghcnd-stations.txt',
    header=None,
    delimiter=' ',
    widths=[12, 9, 10, 7, 3, 31, 4, 4, 6],
    names=[
        'StationID', 'Latitude', 'Longitude', 'Elevation', 'State',
        'Name', 'GSN Flag', 'HCN/CRN Flag', 'WMO ID',
    ],
).set_index('StationID')

In [33]:
# preview the first rows of the station metadata, indexed by StationID
stationsDF.head()

Unnamed: 0_level_0,Latitude,Longitude,Elevation,State,Name,GSN Flag,HCN/CRN Flag,WMO ID
StationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
AEM00041194,25.255,55.364,10.4,,DUBAI INTL,,,41194.0
AEM00041217,24.433,54.651,26.8,,ABU DHABI INTL,,,41217.0
