In [None]:
# Clear the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [1]:
# Mount Google Drive at /content/drive; the file paths used by the cells
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime as dt
from datetime import timedelta
import os
import scipy as scipy
import statsmodels.api as sm
from google.colab import data_table
data_table.enable_dataframe_formatter()
import csv
import geopandas as gpd
from geopy.distance import geodesic


# --- Remote GHCN-Daily resources and Google Drive file locations ---
stationIDLINK = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
metadataLINK = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt'
dataLINK = 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access'
coordFILE = '/content/drive/My Drive/Colab Notebooks/CLIGEN/US_CLIGEN_Coords.csv'
keepFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Keepers.csv'

# --- Station selection thresholds ---
max_dist = 100000.0 #meters
percent_complete = 50.0 #%

# One datetime per calendar day from start_date through end_date (inclusive).
start_date = dt(1974, 1, 1, 0, 0)
end_date = dt(2013, 12, 31, 0, 0)
n_days = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=offset) for offset in range(n_days)]

# Read the CLIGEN coordinate file: skip the header row, then take the first
# three comma-separated fields of every line as station ID, longitude and
# latitude. The values are kept as the strings read from the file.
with open(coordFILE) as coord_f:
  next(coord_f)
  cli_records = [raw.strip('\n').split(',') for raw in coord_f]

stations = [rec[0] for rec in cli_records]
longitudes = [rec[1] for rec in cli_records]
latitudes = [rec[2] for rec in cli_records]

cli_df = pd.DataFrame(
  data=zip(stations, longitudes, latitudes),
  columns=['stationID', 'long', 'lat'],
)

# Download the GHCN-Daily station list and keep the stations whose ID starts
# with 'US'. Each line is split on whitespace; the first three tokens are
# used as station ID, latitude and longitude.
url = stationIDLINK
req = requests.get(url, timeout=120)
# Fail loudly on an HTTP error instead of parsing an error page as data.
req.raise_for_status()
text = req.text

stations, longitudes, latitudes = [], [], []
for line in text.splitlines():
  row = line.split()
  if len(row) < 3:
    # Blank or truncated line: nothing usable to parse.
    continue
  stationID = row[0]
  if stationID[:2] == 'US':
    stations.append(stationID)
    latitudes.append(row[1])
    longitudes.append(row[2])

gnd_df = pd.DataFrame(data=zip(stations, longitudes, latitudes), columns=['stationID', 'long', 'lat'])

# Match every CLIGEN station to its nearest GHCN station.
#
# The 'long'/'lat' columns hold geographic degrees, so the points are declared
# as EPSG:4326 and then reprojected to EPSG:3857 before the nearest-neighbour
# search. Previously the degrees were labelled as EPSG:3857 and the result of
# to_crs() was discarded, so the search ran on raw degrees.
cli_gdf = gpd.GeoDataFrame(
  cli_df,
  geometry=gpd.points_from_xy(cli_df['long'].astype(float), cli_df['lat'].astype(float)),
  crs='EPSG:4326',
).to_crs('EPSG:3857')

gnd_gdf = gpd.GeoDataFrame(
  gnd_df,
  geometry=gpd.points_from_xy(gnd_df['long'].astype(float), gnd_df['lat'].astype(float)),
  crs='EPSG:4326',
).to_crs('EPSG:3857')

# 'dist' is the planar distance in EPSG:3857 map units; it is only used to
# pick the nearest neighbour. The distance threshold below is applied to the
# geodesic distance computed from the original lat/lon values.
join_gdf = gpd.sjoin_nearest(cli_gdf, gnd_gdf, how='left', distance_col='dist')

join_df = pd.DataFrame(join_gdf.drop(columns=['geometry', 'index_right']))
join_df['dist_m'] = join_df.apply(
  lambda x: geodesic((x['lat_left'], x['long_left']), (x['lat_right'], x['long_right'])).meters,
  axis=1,
)

# Step one: keep only the pairs closer than max_dist (meters).
keepers_one_df = join_df[join_df['dist_m'] < max_dist].reset_index()

keepers_step_one = keepers_one_df['stationID_right'].values

# Step two: use the GHCN-Daily inventory to keep the step-one stations whose
# PRCP record starts no later than start_date's year and ends no earlier than
# end_date's year. Each inventory line is split on whitespace; tokens 4 and 5
# are compared as the first and last year of the element's record.
url = metadataLINK
req = requests.get(url, timeout=120)
# Fail loudly on an HTTP error instead of parsing an error page as data.
req.raise_for_status()
text = req.text

# Set membership is O(1); testing against the numpy array scanned every
# station ID once per inventory line.
step_one_ids = set(keepers_step_one)
first_year, last_year = start_date.year, end_date.year

keepers_step_two = []
for line in text.splitlines():
  row = line.split()
  if 'PRCP' in row and row[0] in step_one_ids:
    if int(row[4]) <= first_year and int(row[5]) >= last_year:
      keepers_step_two.append(row[0])

# Step three: download each candidate station's daily CSV and reject it when
#   * the file does not exist (404),
#   * the request times out (completeness cannot be verified),
#   * the file has no PRCP/DATE column, or
#   * fewer than percent_complete % of the days in `dates` have a usable
#     PRCP value.
# The surviving rows of keepers_one_df are written to keepFILE.
bad = []
n_days = len(dates)
for keeper in keepers_step_two:

  url = dataLINK + '/' + keeper + '.csv'
  try:
    req = requests.get(url, timeout=120)
  except requests.exceptions.Timeout:
    # Previously a timed-out station was silently kept without being checked.
    print(keeper + ': request timed out, station rejected')
    bad.append(keeper)
    continue

  text = req.text
  if req.status_code == 404 or '404 Not Found' in text:
    bad.append(keeper)
    continue
  # Any other HTTP error is reported as such rather than as a parsing error.
  req.raise_for_status()

  # csv.reader handles the quoted fields (station names and attribute
  # columns may contain commas), so no manual comma stripping is needed.
  reader = csv.reader(text.splitlines())
  hdrs = next(reader, [])
  if 'PRCP' not in hdrs or 'DATE' not in hdrs:
    bad.append(keeper)
    continue
  prcp_i = hdrs.index('PRCP')
  date_i = hdrs.index('DATE')

  ct = 0
  for row in reader:
    if not row:
      continue
    date = dt.strptime(row[date_i], '%Y-%m-%d')
    prcp = row[prcp_i].strip()
    if start_date.year <= date.year <= end_date.year:
      if prcp != '' and not any(s in prcp for s in ['P', 'T', 'H', '9999']):
        # Conversion is kept as a validity check: a non-numeric value raises.
        prcp = float(prcp)
        ct += 1

  complete_pct = float(ct)/float(n_days)*100.
  if complete_pct < percent_complete:
    print(str(complete_pct))
    bad.append(keeper)


bad_ids = set(bad)
keepers_step_three = [k for k in keepers_step_two if k not in bad_ids]

keepers_df = keepers_one_df.loc[keepers_one_df['stationID_right'].isin(keepers_step_three)].reset_index()

keepers_df.to_csv(keepFILE)



28.507871321013006
27.077344284736483
0.0
39.31553730321697
0.0
34.544832306639286


In [None]:
from datetime import datetime as dt
from datetime import timedelta
import csv
import os

import pandas as pd
import requests

# For every station listed in keepFILE, download its GHCN-Daily CSV and write
# one CSV per station to outFOLDER with columns year, month, day and the
# station's PRCP series covering start_date..end_date.
keepFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Keepers.csv'
dataLINK = 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access'
# NOTE(review): the next cell reads from '.../GHCN_Dataframes_for_Keepers_95'
# (with an "s") - confirm which folder name is intended.
outFOLDER = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Output/GHCN_Dataframe_for_Keepers_95'
keepers_df = pd.read_csv(keepFILE)
keepers = keepers_df['stationID_right'].values

# One datetime per calendar day from start_date through end_date (inclusive).
start_date = dt(1974, 1, 1, 0, 0)
end_date = dt(2013, 12, 31, 0, 0)
dates = [start_date + timedelta(days=n) for n in range((end_date - start_date).days + 1)]

os.makedirs(outFOLDER, exist_ok=True)

# Iterating the array directly also copes with an empty keeper list, which
# the former index-based while-loop did not.
for keeper in keepers:

  print(keeper)
  outFILE = os.path.join(outFOLDER, keeper + '.csv')
  url = dataLINK + '/' + keeper + '.csv'
  req = requests.get(url, timeout=120)
  text = req.text

  # Usable PRCP values keyed by date; filled in a single pass over the file
  # instead of a full-frame boolean mask assignment per record.
  prcp_by_date = {}
  if not (req.status_code == 404 or '404 Not Found' in text):
    req.raise_for_status()

    # csv.reader handles the quoted fields (station names and attribute
    # columns may contain commas).
    reader = csv.reader(text.splitlines())
    hdrs = next(reader)
    prcp_col_i = hdrs.index('PRCP')
    date_col_i = hdrs.index('DATE')

    for row in reader:
      if not row:
        continue
      date = dt.strptime(row[date_col_i], '%Y-%m-%d')
      prcp = row[prcp_col_i].strip()

      if start_date.year <= date.year <= end_date.year:
        if prcp != '' and not any(s in prcp for s in ['P', 'T', 'H', '9999']):
          # Presumably converts tenths of mm to mm - confirm against the
          # GHCN-Daily documentation.
          prcp_by_date[date] = float(prcp)/10.0

  # NOTE(review): days without a usable value (and stations whose file is
  # missing) are written as 0.0, exactly as before; missing data is therefore
  # indistinguishable from zero precipitation in the output.
  df = pd.DataFrame({
    'year': [d.year for d in dates],
    'month': [d.month for d in dates],
    'day': [d.day for d in dates],
    keeper: [prcp_by_date.get(d, 0.0) for d in dates],
  })
  df.index.name = 'index'

  df.to_csv(outFILE)




In [None]:

import pandas as pd
import os

# Match every CLIGEN .cli file in anuragFOLDER to the .par file in parFOLDER
# that carries the same latitude/longitude strings.
# NOTE(review): this cell reads 'Keepers95.csv', while the earlier cells write
# and read 'Keepers.csv' - confirm which file is intended.
keepFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Keepers95.csv'
parFOLDER = '/content/drive/My Drive/Colab Notebooks/CLIGEN/2015parfiles/2015parfiles/2015parfiles'
anuragFOLDER = '/content/drive/My Drive/Colab Notebooks/TeamProjects/CLIGEN'
dataFOLDER = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Output/GHCN_Dataframes_for_Keepers_95'
shareFOLDER = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Share'

keep_df = pd.read_csv(keepFILE)

gnd_stations = keep_df['stationID_right'].values

anuragFILES = os.listdir(anuragFOLDER)
parFILES = [f for f in os.listdir(parFOLDER) if f != 'stations2015.txt' ]

# par filename -> (lat, lon) strings. Filled lazily so each .par file is
# opened at most once instead of once per .cli file.
par_coords = {}

matches_dct = {}
for anuragf in anuragFILES:
  # The first two whitespace-separated tokens on the fifth line of a .cli
  # file are used as latitude and longitude.
  with open(os.path.join(anuragFOLDER, anuragf)) as f:
    for _ in range(4):
      next(f)
    row = f.readline().split()
    latitude = row[0]
    longitude = row[1]

  # Stays None when no .par file has identical coordinate strings.
  matches_dct[anuragf] = None

  for parf in parFILES:
    if parf not in par_coords:
      # The second line of a .par file is split on '='; the first token after
      # the first '=' is used as latitude, after the second '=' as longitude.
      with open(os.path.join(parFOLDER, parf)) as f:
        next(f)
        row = f.readline().split('=')
      par_coords[parf] = (row[1].split()[0], row[2].split()[0])
    lat, lon = par_coords[parf]

    # Coordinates are compared as strings, so the text must match exactly.
    if latitude == lat and longitude == lon:
      matches_dct[anuragf] = parf
      break


print(matches_dct)
print(len(matches_dct))


print(keep_df)








{'10_St. Clair Co IL_CLIGEN.cli': 'il110510.par', '11_Washington Co AR_CLIGEN.cli': 'ar032443.par', '12_Crosby Co TX_CLIGEN.cli': 'tx412121.par', '13_Pratt Co KS_CLIGEN.cli': 'ks146549.par', '14_Washington Co CO_CLIGEN.cli': 'co050114.par', '15_Prairie Co MT_CLIGEN.cli': 'mt248169.par', '16_Maricopa Co AZ_CLIGEN.cli': 'az024829.par', '17_Fresno Co CA_CLIGEN.cli': 'ca040449.par', '18_Jerome Co ID_CLIGEN.cli': 'id104670.par', '19_Adams Co WA_CLIGEN.cli': 'wa454679.par', '1_Steuben Co NY_CLIGEN.cli': 'ny300448.par', '20_Linn Co OR_CLIGEN.cli': 'or354811.par', '2_Randolf Co NC_CLIGEN.cli': 'nc310286.par', '3_Jackson Co FL_CLIGEN.cli': 'fl081544.par', '4_Putnam Co OH_CLIGEN.cli': 'oh336405.par', '5_Bolivar Co MS_CLIGEN.cli': 'ms221707.par', '6_Adair Co IA_CLIGEN.cli': 'ia133438.par', '7_Sauk Co WI_CLIGEN.cli': 'wi470516.par', '8_McLean Co ND_CLIGEN.cli': 'nd328872.par', '9_Kearney Co NE_CLIGEN.cli': 'ne255565.par'}
20
      Unnamed: 0  level_0  index stationID_left   long_left   lat_left  \

In [None]:
#None in matches_dct because...
#No GHCNd/CLIGEN match because...

print(matches_dct)

# For every matched .par station, show which GHCN station (if any) it was
# paired with in keep_df.
for key in matches_dct:
  par = matches_dct[key]
  if par is not None:
    par_id = par[:-4]  # drop the '.par' extension
    print(par_id)
    print(keep_df.loc[keep_df['stationID_left'] == par_id, 'stationID_right'].values)



#inpath = os.path.join(dataFOLDER, keep_df['stationID_right'].loc[keep_df['stationID_left'] == matches_dct[key]])
#outpath = os.path.join(shareFOLDER, )

outFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/TwentyFourStations.csv'
extrapars_list = ['nv264436', 'ut425733', 'az028619', 'nm290234']


def _read_par_lon_lat(par_name):
  """Return (lon, lat) as strings read from the second line of a .par file.

  The line is split on '='; the first token after the first '=' is used as
  latitude and the first token after the second '=' as longitude.
  """
  with open(os.path.join(parFOLDER, par_name)) as parf:
    next(parf)
    row = parf.readline().split('=')
  return row[2].split()[0], row[1].split()[0]


#print list of closest par stations to ghcn stations:
with open(outFILE, 'w') as fo:
  fo.write('stationID,x,y\n')
  for key in matches_dct:
    par = matches_dct[key]
    if par is None:
      # Unmatched .cli files have no .par file to read; passing None to
      # os.path.join would raise TypeError.
      print(key + ': no matching .par file, skipped')
      continue
    lon, lat = _read_par_lon_lat(par)
    fo.write(','.join([par[:-4], lon, lat]) + '\n')

  # Additional stations that are written regardless of the matches above.
  for par_id in extrapars_list:
    lon, lat = _read_par_lon_lat(par_id + '.par')
    fo.write(','.join([par_id, lon, lat]) + '\n')




{'10_St. Clair Co IL_CLIGEN.cli': 'il110510.par', '11_Washington Co AR_CLIGEN.cli': 'ar032443.par', '12_Crosby Co TX_CLIGEN.cli': 'tx412121.par', '13_Pratt Co KS_CLIGEN.cli': 'ks146549.par', '14_Washington Co CO_CLIGEN.cli': 'co050114.par', '15_Prairie Co MT_CLIGEN.cli': 'mt248169.par', '16_Maricopa Co AZ_CLIGEN.cli': 'az024829.par', '17_Fresno Co CA_CLIGEN.cli': 'ca040449.par', '18_Jerome Co ID_CLIGEN.cli': 'id104670.par', '19_Adams Co WA_CLIGEN.cli': 'wa454679.par', '1_Steuben Co NY_CLIGEN.cli': 'ny300448.par', '20_Linn Co OR_CLIGEN.cli': 'or354811.par', '2_Randolf Co NC_CLIGEN.cli': 'nc310286.par', '3_Jackson Co FL_CLIGEN.cli': 'fl081544.par', '4_Putnam Co OH_CLIGEN.cli': 'oh336405.par', '5_Bolivar Co MS_CLIGEN.cli': 'ms221707.par', '6_Adair Co IA_CLIGEN.cli': 'ia133438.par', '7_Sauk Co WI_CLIGEN.cli': 'wi470516.par', '8_McLean Co ND_CLIGEN.cli': 'nd328872.par', '9_Kearney Co NE_CLIGEN.cli': 'ne255565.par'}
il110510
[]
ar032443
['USW00093993']
tx412121
[]
ks146549
[]
co050114
[]
mt2

In [None]:

import pandas as pd
import numpy as np
import requests
from datetime import datetime as dt
from datetime import timedelta
import os
import scipy as scipy
import statsmodels.api as sm
from google.colab import data_table
data_table.enable_dataframe_formatter()
import csv
import geopandas as gpd
from geopy.distance import geodesic


# --- Remote GHCN-Daily resources and Google Drive file locations ---
stationIDLINK = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
metadataLINK = 'https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt'
dataLINK = 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access'
coordFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/TwentyFourStations.csv'
keepFILE = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Keepers.csv'

# --- Station selection thresholds ---
max_dist = 100000.0 #meters
percent_complete = 95.0 #%

# One datetime per calendar day from start_date through end_date (inclusive).
start_date = dt(1974, 1, 1, 0, 0)
end_date = dt(2013, 12, 31, 0, 0)
n_days = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=offset) for offset in range(n_days)]

# Read the station coordinate file: skip the header row, then take the first
# three comma-separated fields of every line as station ID, longitude and
# latitude. The values are kept as the strings read from the file.
with open(coordFILE) as coord_f:
  next(coord_f)
  cli_records = [raw.strip('\n').split(',') for raw in coord_f]

stations = [rec[0] for rec in cli_records]
longitudes = [rec[1] for rec in cli_records]
latitudes = [rec[2] for rec in cli_records]

cli_df = pd.DataFrame(
  data=zip(stations, longitudes, latitudes),
  columns=['stationID', 'long', 'lat'],
)

# Download the GHCN-Daily station list and keep the stations whose ID starts
# with 'US'. Each line is split on whitespace; the first three tokens are
# used as station ID, latitude and longitude.
url = stationIDLINK
req = requests.get(url, timeout=120)
# Fail loudly on an HTTP error instead of parsing an error page as data.
req.raise_for_status()
text = req.text

stations, longitudes, latitudes = [], [], []
for line in text.splitlines():
  row = line.split()
  if len(row) < 3:
    # Blank or truncated line: nothing usable to parse.
    continue
  stationID = row[0]
  if stationID[:2] == 'US':
    stations.append(stationID)
    latitudes.append(row[1])
    longitudes.append(row[2])

gnd_df = pd.DataFrame(data=zip(stations, longitudes, latitudes), columns=['stationID', 'long', 'lat'])

# Match every input station to its nearest GHCN station.
#
# The 'long'/'lat' columns hold geographic degrees, so the points are declared
# as EPSG:4326 and then reprojected to EPSG:3857 before the nearest-neighbour
# search. Previously the degrees were labelled as EPSG:3857 and the result of
# to_crs() was discarded, so the search ran on raw degrees.
cli_gdf = gpd.GeoDataFrame(
  cli_df,
  geometry=gpd.points_from_xy(cli_df['long'].astype(float), cli_df['lat'].astype(float)),
  crs='EPSG:4326',
).to_crs('EPSG:3857')

gnd_gdf = gpd.GeoDataFrame(
  gnd_df,
  geometry=gpd.points_from_xy(gnd_df['long'].astype(float), gnd_df['lat'].astype(float)),
  crs='EPSG:4326',
).to_crs('EPSG:3857')

# 'dist' is the planar distance in EPSG:3857 map units; it is only used to
# pick the nearest neighbour. The distance threshold below is applied to the
# geodesic distance computed from the original lat/lon values.
join_gdf = gpd.sjoin_nearest(cli_gdf, gnd_gdf, how='left', distance_col='dist')

join_df = pd.DataFrame(join_gdf.drop(columns=['geometry', 'index_right']))
join_df['dist_m'] = join_df.apply(
  lambda x: geodesic((x['lat_left'], x['long_left']), (x['lat_right'], x['long_right'])).meters,
  axis=1,
)

# Step one: keep only the pairs closer than max_dist (meters).
keepers_one_df = join_df[join_df['dist_m'] < max_dist].reset_index()

keepers_step_one = keepers_one_df['stationID_right'].values

# The inventory-based record-length filter (step two) is disabled in this
# cell: every station that passed the distance filter goes straight to the
# completeness check. Because nothing reads the inventory, it is no longer
# downloaded. Restore the lines below to re-enable the filter.
#
# url = metadataLINK
# req = requests.get(url)
# text = req.text
#
# keepers_step_two = []
# lines = (line for line in text.splitlines())
# for line in lines:
#   row = line.split()
#   if 'PRCP' in row and row[0] in keepers_step_one:
#     if int(row[4]) <= 1974 and int(row[5]) >= 2013:
#       keepers_step_two.append(row[0])
#
# print('length keepers_step_two: ', len(keepers_step_two))

keepers_step_two = keepers_step_one

# Score every candidate station: 'complete_per' is the percentage of the days
# in `dates` for which the station's CSV holds a usable PRCP value. Stations
# whose file is missing (404) or lacks a PRCP/DATE column score 0.0 instead
# of aborting the run. The scored table is written to read_out.csv.

# Create the column up front so it exists even when there are no candidates.
join_df['complete_per'] = float('nan')

n_days = len(dates)
# dict.fromkeys drops repeated station IDs (several input stations can share
# the same nearest GHCN station) while keeping their order.
for keeper in dict.fromkeys(keepers_step_two):

  ct = 0
  url = dataLINK + '/' + keeper + '.csv'
  req = requests.get(url, timeout=120)
  text = req.text

  if req.status_code == 404 or '404 Not Found' in text:
    hdrs = []
    reader = iter(())
  else:
    # Any other HTTP error is reported as such rather than as a parsing error.
    req.raise_for_status()
    # csv.reader handles the quoted fields (station names and attribute
    # columns may contain commas).
    reader = csv.reader(text.splitlines())
    hdrs = next(reader, [])

  if 'PRCP' in hdrs and 'DATE' in hdrs:
    prcp_i = hdrs.index('PRCP')
    date_i = hdrs.index('DATE')
    for row in reader:
      if not row:
        continue
      date = dt.strptime(row[date_i], '%Y-%m-%d')
      prcp = row[prcp_i].strip()
      if start_date.year <= date.year <= end_date.year:
        if prcp != '' and not any(s in prcp for s in ['P', 'T', 'H', '9999']):
          # Conversion is kept as a validity check: a non-numeric value raises.
          prcp = float(prcp)
          ct += 1

  complete_pct = float(ct)/float(n_days)*100
  join_df.loc[join_df['stationID_right'] == keeper, 'complete_per'] = complete_pct

join_df.index.name = 'index'
join_df.to_csv('/content/drive/My Drive/Colab Notebooks/TeamProjects/read_out.csv')


# Export the daily PRCP series of every station whose completeness score
# reaches percent_complete: one CSV per station in outFOLDER with columns
# year, month, day and the station's values for start_date..end_date.
dataLINK = 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access'
outFOLDER = '/content/drive/My Drive/Colab Notebooks/TeamProjects/Share'
os.makedirs(outFOLDER, exist_ok=True)

# The date axis is identical for every station, so it is built once.
start_date = dt(1974, 1, 1, 0, 0)
end_date = dt(2013, 12, 31, 0, 0)
dates = [start_date + timedelta(days=n) for n in range((end_date - start_date).days + 1)]

# Uses the configured threshold rather than a separate hard-coded 95.0.
complete_ids = join_df.loc[join_df['complete_per'] >= percent_complete, 'stationID_right']

# dict.fromkeys drops repeated station IDs; each station maps to one file.
for keeper in dict.fromkeys(complete_ids):

  outFILE = os.path.join(outFOLDER, keeper + '.csv')
  url = dataLINK + '/' + keeper + '.csv'
  req = requests.get(url, timeout=120)
  text = req.text

  # Usable PRCP values keyed by date; filled in a single pass over the file
  # instead of a full-frame boolean mask assignment per record.
  prcp_by_date = {}
  if not (req.status_code == 404 or '404 Not Found' in text):
    req.raise_for_status()

    # csv.reader handles the quoted fields (station names and attribute
    # columns may contain commas).
    reader = csv.reader(text.splitlines())
    hdrs = next(reader)
    prcp_col_i = hdrs.index('PRCP')
    date_col_i = hdrs.index('DATE')

    for fields in reader:
      if not fields:
        continue
      date = dt.strptime(fields[date_col_i], '%Y-%m-%d')
      prcp = fields[prcp_col_i].strip()

      if start_date.year <= date.year <= end_date.year:
        if prcp != '' and not any(s in prcp for s in ['P', 'T', 'H', '9999']):
          # Presumably converts tenths of mm to mm - confirm against the
          # GHCN-Daily documentation.
          prcp_by_date[date] = float(prcp)/10.0

  # NOTE(review): days without a usable value are written as 0.0, exactly as
  # before; missing data is indistinguishable from zero precipitation.
  df = pd.DataFrame({
    'year': [d.year for d in dates],
    'month': [d.month for d in dates],
    'day': [d.day for d in dates],
    keeper: [prcp_by_date.get(d, 0.0) for d in dates],
  })
  df.index.name = 'index'

  df.to_csv(outFILE)


