In [224]:
import os
import pandas as pd
import numpy as np

In [225]:
"""
- opens isd-history.csv
- creates new "ID" field = "USAF"+"WBAN"
- Deletes records w/ both lat and lon == 0 or lat and lon == null
- Starts with 28,646 records and winnows the dataset down to 23,686 records
- Writes to file "ID_Info.csv", which is basically "isd-history" with the records mentioned above
  removed
- Writes to file "IDs_23686.csv", which is simply a list of the remaining IDs.

""" 


'\n- opens isd-history.csv\n- creates new "ID" field = "USAF"+"WBAN"\n- Deletes records w/ both lat and lon == 0 or lat and lon == null\n- Starts with 28,646 records and winnows the dataset down to 23,686 recores\n- Writes to file "ID-Info.csv", which is basically "isd-history" with the records mentioned above\n  removed\n- Writes to file "IDs_23686.csv", which is simply a list of the remaining IDs.\n\n'

In [226]:
#From the ISD docs:

#Positive latitudes are north of the equator, negative latitudes are south of the equator. 
#Positive longitudes are east of Prime Meridian, negative longitudes are west of the Prime Meridian. 
#Latitude and longitude are usually expressed in that sequence, latitude before longitude.

In [227]:
# Show the working directory; the relative "data/..." paths used below resolve against it.
os.getcwd()

'/gpfs1/home/a/a/aametcal/isd-lite/scripts/dataCleaning'

In [228]:
# List the working directory; the cells below read from and write to its "data" subfolder.
os.listdir()

['removeNoLatLon.ipynb',
 '.ipynb_checkpoints',
 '.DS_Store',
 'test',
 'data',
 'ID_Lat-Lon.ipynb']

In [229]:
# Load the station history table. USAF, WBAN and STATION NAME are read as strings so
# that station identifiers keep their leading zeros (e.g. "007018").
dat = pd.read_csv(
    "data/isd-history.csv",
    dtype=dict.fromkeys(("USAF", "WBAN", "STATION NAME"), str),
)

In [230]:
# Keep only the entries whose LAT and LON are both non-null
# (a row is dropped when either coordinate is missing).
dat = dat.dropna(
    axis=0,
    how='any',
    subset=['LAT', 'LON'],
)

In [231]:
#we need to create an ID of the form USAF-WBAN

In [232]:
# Add an empty-string placeholder column named ID; a later cell overwrites it with
# the real "<USAF>-<WBAN>" identifiers.
dat = dat.assign(ID="")

In [233]:
def makeID(dat):
    """Build the station identifier "<USAF>-<WBAN>" for a single record.

    `dat` is one record that supports lookup by key (for example a row handed over
    by ``DataFrame.apply(..., axis=1)``). Both fields are converted with ``str``
    before being joined with a hyphen.
    """
    return "-".join(str(dat[field]) for field in ("USAF", "WBAN"))

In [234]:
# Fill the ID column row by row with the "<USAF>-<WBAN>" string built by makeID.
dat["ID"] = dat.apply(makeID, axis="columns")

In [235]:
# (rows, columns) after dropping the null-coordinate rows and adding the ID column.
dat.shape

(28646, 12)

In [236]:
# Preview the first two rows to eyeball the new ID column.
dat.head(2)

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END,ID
2,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730,007018-99999
4,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170506,007026-99999


In [237]:
# Number of distinct IDs; it should equal the row count if every ID is unique.
np.unique(dat["ID"]).size

28646

In [238]:
#Now we need to make sure we get rid of the lat and longitudes that are equal to zero

In [239]:
# Get rid of the placeholder rows whose "lat-lon" is 0.0-0.0, i.e. latitude AND
# longitude are both zero. Filtering with (LAT != 0) & (LON != 0) would also throw
# away real stations that sit exactly on the equator or on the prime meridian,
# where only one of the two coordinates is zero.
dat = dat[~((dat["LAT"] == 0) & (dat["LON"] == 0))]


In [240]:
# Preview the first two rows that remain after the zero-coordinate filter.
dat.head(2)

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END,ID
18,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323,008268-99999
40,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,19310101,20170522,010010-99999


In [241]:
# Goal: move the ID field to the front. Start from the current column order.
cols = list(dat.columns)

In [242]:
# Put "ID" first and keep every other column in its existing order. Selecting by name
# (instead of rotating the last element to the front) keeps this cell idempotent:
# running it again leaves the order unchanged rather than moving another column up.
cols = ["ID"] + [name for name in cols if name != "ID"]
cols

['ID',
 'USAF',
 'WBAN',
 'STATION NAME',
 'CTRY',
 'STATE',
 'ICAO',
 'LAT',
 'LON',
 'ELEV(M)',
 'BEGIN',
 'END']

In [243]:
# Apply the new column order (ID first), then preview the first two rows.
dat = dat.loc[:, cols]
dat.head(n=2)

Unnamed: 0,ID,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
18,008268-99999,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
40,010010-99999,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,19310101,20170522


In [244]:
# Slightly longer preview (five rows) of the reordered table.
dat.head(5)

Unnamed: 0,ID,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
18,008268-99999,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
40,010010-99999,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,19310101,20170522
42,010014-99999,10014,99999,SORSTOKKEN,NO,,ENSO,59.792,5.341,48.8,19861120,20170522
43,010015-99999,10015,99999,BRINGELAND,NO,,,61.383,5.867,327.0,19870117,20111020
44,010016-99999,10016,99999,RORVIK/RYUM,NO,,,64.85,11.233,14.0,19870116,19910806


In [245]:
# Save the cleaned station table; the DataFrame index is not written to the file.
dat.to_csv(path_or_buf="data/ID_Info.csv", index=False)

In [246]:
# List the working directory again.
# NOTE(review): the CSV above is written under "data/", so this listing of the working
# directory cannot show it; os.listdir("data") may have been intended — confirm.
os.listdir()

['removeNoLatLon.ipynb',
 '.ipynb_checkpoints',
 '.DS_Store',
 'test',
 'data',
 'ID_Lat-Lon.ipynb']

In [247]:

# Spot check: look up a single station by its ID.
dat[dat['ID'].eq('999999-94644')]

Unnamed: 0,ID,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
29923,999999-94644,999999,94644,OLD TOWN 2 W,US,ME,,44.928,-68.701,38.7,20020913,20170523


In [248]:
# We need a list of the unique IDs that came out of this first part of data cleaning.

# First of all: how many stations are left?
dat.shape[0]

28228

In [249]:
# Pull the remaining IDs out as a NumPy array, then show its type and length.
ids = dat.loc[:, 'ID'].values
type(ids), ids.shape[0]

(numpy.ndarray, 28228)

In [250]:
# Write the IDs to file, one per line. The file name embeds the number of IDs and is
# derived from len(ids), so the name cannot drift from the data when the filtering
# above changes (for the 28228 IDs of this run it is still "data/IDs_28228.csv").
np.savetxt('data/IDs_%d.csv' % len(ids), ids, fmt='%s', delimiter=',')