In [31]:
import os
import pandas as pd
import numpy as np

In [32]:
"""
- opens isd-history.csv
- creates new "ID" field = "USAF"+"WBAN"
- Deletes records w/ both lat and lon == 0 or lat and lon == null
- Starts with 28,646 records and winnows the dataset down to 23,686 recores
- Writes to file "ID-Info.csv", which is basically "isd-history" with the records mentioned above
  removed
- Writes to file "IDs_23686.csv", which is simply a list of the remaining IDs.

""" 


'\n- opens isd-history.csv\n- creates new "ID" field = "USAF"+"WBAN"\n- Deletes records w/ both lat and lon == 0 or lat and lon == null\n- Starts with 28,646 records and winnows the dataset down to 23,686 recores\n- Writes to file "ID-Info.csv", which is basically "isd-history" with the records mentioned above\n  removed\n- Writes to file "IDs_23686.csv", which is simply a list of the remaining IDs.\n\n'

In [33]:
#From the ISD docs:

#Positive latitudes are north of the equator, negative latitudes are south of the equator. 
#Positive longitudes are east of Prime Meridian, negative longitudes are west of the Prime Meridian. 
#Latitude and longitude are usually expressed in that sequence, latitude before longitude.

In [34]:
os.getcwd()

'/gpfs1/home/a/a/aametcal/isd-lite/scripts/dataCleaning'

In [35]:
os.listdir()

['removeNoLatLon.ipynb', '.ipynb_checkpoints', '.DS_Store', 'ID_Lat-Lon.ipynb']

In [36]:
dat = pd.read_csv("../../data_light/isd-history.csv",dtype={'USAF':str, 'WBAN':str, 'STATION NAME':str})

In [37]:
len(dat) #orginal dataset length = 29,963

29963

In [38]:
dat = dat.dropna(subset=['LAT','LON']) #get only the entries that have non-null entries for lat/lon

In [39]:
len(dat) #dropping the entries that have null values results in 28,648 entries

28646

In [40]:
#we need to create a ID that is USAF-WBAN

In [41]:
dat['ID'] = ""

In [42]:
def makeID(dat):
    return str(dat["USAF"])+"-"+ str(dat["WBAN"])

In [43]:
dat["ID"] = dat.apply(makeID,axis =1)

In [44]:
dat.shape

(28646, 12)

In [45]:
dat.head(2)

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END,ID
2,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730,007018-99999
4,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170506,007026-99999


In [46]:
len(np.unique(dat["ID"])) #check to make sure we have unique ID

28646

In [47]:
#Now we need to make sure we get rid of the lat and longitudes that are equal to zero

In [48]:
#we need to get rid of the rows with "lat-lon" == 0.0-0.0
dat = dat[(dat.LAT !=0)&(dat.LON !=0)]


In [49]:
len(dat) #getting rid of the zero values for lat/lon results in 28,228 values

28228

In [50]:
dat.head(2)

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END,ID
18,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323,008268-99999
40,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,19310101,20170522,010010-99999


In [51]:
#I want to move the ID field to the front
cols = dat.columns.tolist()

In [52]:
cols = cols[-1:]+cols[:-1]
cols

['ID',
 'USAF',
 'WBAN',
 'STATION NAME',
 'CTRY',
 'STATE',
 'ICAO',
 'LAT',
 'LON',
 'ELEV(M)',
 'BEGIN',
 'END']

In [53]:
dat = dat[cols]
dat.head(2)

Unnamed: 0,ID,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
18,008268-99999,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
40,010010-99999,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,19310101,20170522


In [54]:
dat.to_csv("../../data_light/ID_Info.csv",index=False)

In [55]:
#we need a list of the unique id's that came out of this first part of data cleaning

#first of all, how many stations are left?
len(dat)

28228

In [56]:
#okay, now let's get a list of the id's
ids = dat['ID'].values
type(ids), len(ids)

(numpy.ndarray, 28228)

In [57]:
#now let's write it to file
np.savetxt('../../data_light/IDs_28228.csv',ids,fmt='%s',delimiter=',')