# Preliminary Data Analysis - Sample from Ace Basin

Exploration of the first region in the NOAA dataset. We wanted to get a feel of how the data was organized and structured before moving on to other regions.

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

sc = SparkContext('local')
spark = SparkSession(sc)

## Meteorological Data for Bennett's Point, Ace Basin 2004

In [2]:
data_sample = "./input/raw/North Carolina/meteorological/data/Bennett's Point/"

aceb04_data = spark.read.csv(data_sample + "acebpmet2004.txt",header=True)
dropped_cols = [f for f in aceb04_data.columns if f[0] == 'F']
aceb04_data = aceb04_data.drop(*dropped_cols)

split_col = F.split(aceb04_data['DateTimeStamp'], '\ ')
aceb04_data = aceb04_data.withColumn('SMPLDATE', split_col.getItem(0)).withColumn('SMPLTIME', split_col.getItem(1))
aceb04_data = aceb04_data.drop('DateTimeStamp')

### Dimensions of 2004 data

In [3]:
print((aceb04_data.count(), len(aceb04_data.columns)))

(43924, 28)


## Meteorological Data for Bennett's Point, Ace Basin 2005

In [4]:
aceb05_data = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv(data_sample + "acebpmet2005.txt")
aceb05_data = aceb05_data.drop(*['STNCODE', 'USRCODES'])

### Dimensions of 2005 data

In [5]:
print((aceb05_data.count(), len(aceb05_data.columns)))

(44034, 28)


## Meteorological Data for Bennett's Point, Ace Basin 2006

In [6]:
aceb06_data = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv(data_sample + "acebpmet2006.txt")
aceb06_data = aceb06_data.drop(*['STNCODE', 'USRCODES'])

### Dimensions of 2006 data

In [7]:
print((aceb06_data.count(), len(aceb06_data.columns)))

(39043, 28)


## Compilation of ALL Meteorological Data for Bennett's Point, Ace Basin from 2004-2006

In [8]:
acebmet_data = aceb06_data.union(aceb04_data).union(aceb05_data)
acebmet_data.toPandas().to_csv("input/clean/agg/AceBasinMeteor_2004-2006.csv")

### Dimensions of All data

In [9]:
print((acebmet_data.count(), len(acebmet_data.columns)))

(127001, 28)


### Checking for Missing Data

In [10]:
from pyspark.sql.functions import isnan, when, count, col
acebmet_data.select([count(when(isnan(c), c)).alias(c) for c in acebmet_data.columns]).show()

+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+
|CLASS|SMPLDATE|SMPLTIME|ATemp|MaxTemp|MaxTempT|MinTemp|MinTempT| RH|MaxRH|MaxRHT|MinRH|MinRHT| BP|MaxBP|MaxBPT|MinBP|MinBPT|WSpd|Wdir|SDWDir|MaxWSpd|MaxWSpdT|MinWSpd|MinWSpdT|TotPrcp|TotPAR|AvgVolt|
+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+
|    0|       0|       0|    0|      0|       0|      0|       0|  0|    0|     0|    0|     0|  0|    0|     0|    0|     0|   0|   0|     0|      0|       0|      0|       0|      0|     0|      0|
+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+


Here, we can see there are no missing values in our data.

### Checking for Duplicate Values

In [11]:
acebmet_data.count()
acebmet_data.distinct().count()

127001

Here, we can see that the number of distinct values is the same as the number of values overall. Thus, we do not have any duplicate values.

## Prototype for automated script to aggregate meterological data - North Carolina

* final cleanup file is in file_cleanup.py

In [2]:
import os
rootdir = './input/raw/North Carolina/'

file_list = []
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        file_list.append(os.path.join(subdir, file))

### Creating lists of relevant .txt file names, separated into meteorological, nutrient, and water quality data

In [3]:
import re
text = [file for file in file_list if any(txt in file for txt in ['.txt'])]
regex = re.compile(r'.*(Readme).*|.*(checkpoint).*')
text_list = [i for i in text if not regex.match(i)]
text_list

met_list = [file for file in text_list if any(txt in file for txt in ['meteorological'])]
nut_list = [file for file in text_list if any(txt in file for txt in ['nutrient'])]
water_list = [file for file in text_list if any(txt in file for txt in ['water quality'])]

### Separating Meteorological file names by year

In [4]:
met_2004 = [file for file in met_list if any(txt in file for txt in ['2004'])]
met_2005 = [file for file in met_list if any(txt in file for txt in ['2005'])]
met_2006 = [file for file in met_list if any(txt in file for txt in ['2006'])]

### Concatenating Meteorological data of ALL locations for 2005 and 2006

In [32]:
met04_data = spark.read.option("header", "true") \
    .option("delimiter", ",") \
    .option("inferSchema", "true") \
    .csv(met_2004)
# dropping all F_ columns (deemed unnecessary)
dropped_cols = [f for f in met04_data.columns if f[0] == 'F']
met04_data = met04_data.drop(*dropped_cols)
# splitting date and time into two separate columns
split_col = F.split(met04_data['DateTimeStamp'], '\ ')
met04_data = met04_data.withColumn('SMPLDATE', split_col.getItem(0)).withColumn('SMPLTIME', split_col.getItem(1))
met04_data = met04_data.drop(*['DateTimeStamp','USRCODES'])

In [24]:
met05_data = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv(met_2005)
met05_data = met05_data.drop(*['USRCODES'])

In [25]:
met06_data = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv(met_2006)
met06_data = met06_data.drop(*['USRCODES'])

### Checking for Column matchup

In [17]:
met06_data.columns == met05_data.columns

False

In [18]:
met05_data.select([count(when(isnan(c), c)).alias(c) for c in met05_data.columns]).show()

+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+-------+--------+
|STNCODE|CLASS|SMPLDATE|SMPLTIME|ATemp|MaxTemp|MaxTempT|MinTemp|MinTempT| RH|MaxRH|MaxRHT|MinRH|MinRHT| BP|MaxBP|MaxBPT|MinBP|MinBPT|WSpd|Wdir|SDWDir|MaxWSpd|MaxWSpdT|MinWSpd|MinWSpdT|TotPrcp|TotPAR|AvgVolt|TotSRad|CummRain|
+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+-------+--------+
|      0|    0|       0|       0|    0|      0|       0|      0|       0|  0|    0|     0|    0|     0|  0|    0|     0|    0|     0|   0|   0|     0|      0|       0|      0|       0|      0|     0|      0|      0|       0|
+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+----

In [19]:
met06_data.select([count(when(isnan(c), c)).alias(c) for c in met06_data.columns]).show()

+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+-------+
|STNCODE|CLASS|SMPLDATE|SMPLTIME|ATemp|MaxTemp|MaxTempT|MinTemp|MinTempT| RH|MaxRH|MaxRHT|MinRH|MinRHT| BP|MaxBP|MaxBPT|MinBP|MinBPT|WSpd|Wdir|SDWDir|MaxWSpd|MaxWSpdT|MinWSpd|MinWSpdT|TotPrcp|TotPAR|AvgVolt|DnIrrad|
+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+----+----+------+-------+--------+-------+--------+-------+------+-------+-------+
|      0|    0|       0|       0|    0|      0|       0|      0|       0|  0|    0|     0|    0|     0|  0|    0|     0|    0|     0|   0|   0|     0|      0|       0|      0|       0|      0|     0|      0|      0|
+-------+-----+--------+--------+-----+-------+--------+-------+--------+---+-----+------+-----+------+---+-----+------+-----+------+---

# Daily Aggregation of Data - North Carolina

In [24]:
clean_path = "./input/clean/agg/"

meteor04_df = spark.read.csv(clean_path + "NOAA_NC_meteor_data_2004.csv",header=True, inferSchema=True)
meteor05_df = spark.read.csv(clean_path + "NOAA_NC_meteor_data_2005.csv",header=True, inferSchema=True)
meteor06_df = spark.read.csv(clean_path + "NOAA_NC_meteor_data_2006.csv",header=True, inferSchema=True)

In [25]:
meteor04_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- STNCODE: string (nullable = true)
 |-- CLASS: integer (nullable = true)
 |-- SMPLDATE: string (nullable = true)
 |-- SMPLTIME: string (nullable = true)
 |-- ATemp: double (nullable = true)
 |-- MaxTemp: double (nullable = true)
 |-- MaxTempT: integer (nullable = true)
 |-- MinTemp: double (nullable = true)
 |-- MinTempT: integer (nullable = true)
 |-- RH: integer (nullable = true)
 |-- MaxRH: integer (nullable = true)
 |-- MaxRHT: integer (nullable = true)
 |-- MinRH: integer (nullable = true)
 |-- MinRHT: integer (nullable = true)
 |-- BP: integer (nullable = true)
 |-- MaxBP: integer (nullable = true)
 |-- MaxBPT: integer (nullable = true)
 |-- MinBP: integer (nullable = true)
 |-- MinBPT: integer (nullable = true)
 |-- WSpd: double (nullable = true)
 |-- Wdir: integer (nullable = true)
 |-- SDWDir: integer (nullable = true)
 |-- MaxWSpd: double (nullable = true)
 |-- MaxWSpdT: integer (nullable = true)
 |-- MinWSpd: double (nullable = tr

In [26]:
met04_df = meteor04_df.toPandas()
met04_df_naomit = met04_df.drop(met04_df.columns[0:2], axis = 1).dropna(axis = 0)
met04_df_naomit["date"] = met04_df_naomit["SMPLDATE"]
met04_df_naomit = met04_df_naomit.drop(["SMPLDATE", "SMPLTIME", "STNCODE"], axis = 1)

In [27]:
met04_df_naomit.head()

Unnamed: 0,CLASS,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,...,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt,date
0,15,5.7,6.0,0,5.4,13,94,94,14,94,...,290,13,2.6,13,0.6,6,0.0,0.1,12.7,01/01/2004
1,15,5.6,5.8,18,5.4,15,94,95,18,94,...,284,13,3.1,15,0.8,26,0.0,0.1,12.7,01/01/2004
2,15,5.4,5.7,30,5.0,44,94,94,42,94,...,293,13,2.9,35,0.4,30,0.0,0.0,12.7,01/01/2004
3,15,5.1,5.2,45,4.9,48,95,95,53,94,...,292,16,2.4,58,0.6,53,0.0,0.0,12.7,01/01/2004
4,60,5.4,6.0,0,4.9,48,94,95,53,94,...,290,14,3.1,15,0.4,30,0.0,0.1,12.7,01/01/2004


In [28]:
met04_df_naomit = met04_df_naomit[met04_df_naomit.CLASS == 15]
met04_df_naomit = met04_df_naomit.set_index('date')

In [29]:
met04_df_naomit = met04_df_naomit.groupby(level=0).mean()

In [30]:
met04_df_naomit.head()

Unnamed: 0_level_0,CLASS,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,...,WSpd,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01/01/2004,15.0,8.296875,8.633333,1179.760417,7.971875,1179.260417,71.979167,74.145833,1179.739583,69.71875,...,1.309375,249.760417,17.25,2.392708,1178.854167,0.435417,1179.0,0.0,204.30625,13.016667
01/02/2004,15.0,11.064583,11.410417,1179.791667,10.726042,1179.0625,71.291667,73.479167,1178.84375,68.895833,...,1.738542,249.572917,16.197917,3.071875,1180.0625,0.689583,1178.8125,0.003125,173.65,12.969792
01/03/2004,15.0,15.534375,15.804167,1179.739583,15.269792,1178.666667,71.1875,72.21875,1180.729167,70.21875,...,3.0625,243.15625,12.020833,5.442708,1179.875,1.251042,1179.0625,0.0,205.3875,12.9625
01/04/2004,15.0,19.130208,19.364583,1178.572917,18.907292,1178.3125,74.09375,74.96875,1179.760417,73.239583,...,3.735417,236.166667,12.96875,7.070833,1178.927083,1.435417,1179.9375,0.0,182.803125,12.883333
01/05/2004,15.0,19.667708,19.879167,1180.03125,19.453125,1179.864583,84.5625,85.166667,1179.8125,83.885417,...,5.346875,233.166667,13.65625,10.288542,1179.625,1.967708,1179.84375,0.0,204.194792,12.938542


In [31]:
meteor05_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- STNCODE: string (nullable = true)
 |-- CLASS: integer (nullable = true)
 |-- SMPLDATE: string (nullable = true)
 |-- SMPLTIME: string (nullable = true)
 |-- ATemp: double (nullable = true)
 |-- MaxTemp: double (nullable = true)
 |-- MaxTempT: integer (nullable = true)
 |-- MinTemp: double (nullable = true)
 |-- MinTempT: integer (nullable = true)
 |-- RH: double (nullable = true)
 |-- MaxRH: double (nullable = true)
 |-- MaxRHT: double (nullable = true)
 |-- MinRH: double (nullable = true)
 |-- MinRHT: integer (nullable = true)
 |-- BP: integer (nullable = true)
 |-- MaxBP: integer (nullable = true)
 |-- MaxBPT: integer (nullable = true)
 |-- MinBP: integer (nullable = true)
 |-- MinBPT: integer (nullable = true)
 |-- WSpd: double (nullable = true)
 |-- Wdir: integer (nullable = true)
 |-- SDWDir: integer (nullable = true)
 |-- MaxWSpd: double (nullable = true)
 |-- MaxWSpdT: double (nullable = true)
 |-- MinWSpd: double (nullable = true)
 

In [36]:
met05_df = meteor05_df.toPandas()
met05_df_naomit = met05_df.drop(met04_df.columns[0:2], axis = 1).dropna(axis = 0)
met05_df_naomit["date"] = met05_df_naomit["SMPLDATE"]
met05_df_naomit = met05_df_naomit.drop(["SMPLDATE", "SMPLTIME"], axis = 1)
met05_df_naomit = met05_df_naomit[met05_df_naomit.CLASS == 15]
met05_df_naomit = met05_df_naomit.set_index('date')
met05_df_naomit = met05_df_naomit.groupby(level=0).mean().drop("CLASS", axis = 1)

In [37]:
met05_df_naomit.describe()

Unnamed: 0,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,MinRHT,...,WSpd,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt
count,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0,...,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0,361.0
mean,17.721416,17.965875,1178.550007,17.479769,1178.864979,78.03424,79.547544,1178.821993,76.485369,1178.898034,...,3.270213,199.683461,17.854176,5.810246,1179.014021,1.208372,1179.055004,0.041856,300.061046,12.823007
std,8.064448,8.059253,78.819196,8.064738,78.826499,12.361845,11.781601,75.120701,12.914855,78.528184,...,1.693977,68.069359,7.550239,2.577781,78.901984,0.988666,78.709779,0.129167,139.617703,0.197183
min,-3.736458,-3.4875,44.2,-3.994792,44.8,37.708333,39.71875,42.4,35.791667,53.0,...,0.66,35.25,6.9375,1.36,42.6,0.03125,47.8,0.0,-1.578947,11.7
25%,11.589583,11.83125,1178.260417,11.354167,1178.552083,71.864583,73.802083,1179.614583,69.78125,1178.53125,...,2.103125,153.791667,13.602273,3.9625,1179.125,0.583333,1178.885417,0.0,192.057292,12.790625
50%,18.590625,18.838542,1178.90625,18.3625,1179.15625,80.90625,82.364583,1180.229167,79.3125,1179.09375,...,2.778125,187.3125,16.15625,5.310417,1179.395833,0.907292,1179.3125,0.0,294.063542,12.864583
75%,25.294737,25.526042,1179.416667,25.057292,1179.78125,87.489583,88.6,1180.770833,86.375,1179.677083,...,3.927083,258.0,20.239583,7.003125,1179.697917,1.552083,1179.708333,0.009375,415.894792,12.946875
max,30.705208,30.934375,2139.894737,30.455208,2140.736842,96.135417,96.520833,2020.263158,95.708333,2141.789474,...,12.264444,331.770833,61.052083,21.444444,2140.0,6.365556,2141.263158,1.244444,550.383158,13.096875


In [38]:
met06_df = meteor06_df.toPandas()
met06_df_naomit = met06_df.drop(met06_df.columns[0:2], axis = 1).dropna(axis = 0)
met06_df_naomit["date"] = met06_df_naomit["SMPLDATE"]
met06_df_naomit = met06_df_naomit.drop(["SMPLDATE", "SMPLTIME"], axis = 1)
met06_df_naomit = met06_df_naomit[met06_df_naomit.CLASS == 15]
met06_df_naomit = met06_df_naomit.set_index('date')
met06_df_naomit = met06_df_naomit.groupby(level=0).mean().drop("CLASS", axis = 1)

In [47]:
met06_df.isna().sum()

_c0            0
STNCODE        0
CLASS          0
SMPLDATE       0
SMPLTIME       0
ATemp          0
MaxTemp        0
MaxTempT       0
MinTemp        0
MinTempT       0
RH             0
MaxRH       7811
MaxRHT      7811
MinRH       7811
MinRHT      7811
BP             0
MaxBP       7811
MaxBPT      7811
MinBP       7811
MinBPT      7811
WSpd          88
Wdir          87
SDWDir        87
MaxWSpd      131
MaxWSpdT      88
MinWSpd     7898
MinWSpdT    7898
TotPrcp     5832
TotPAR      2262
AvgVolt        0
dtype: int64

In [48]:
met06_df_naomit.describe()

Unnamed: 0,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,MinRHT,...,WSpd,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,...,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,15.331398,15.597459,1184.906652,15.072535,1183.784626,73.89505,75.565966,1184.569439,72.181436,1185.034518,...,2.791495,197.542034,21.762936,5.35962,1183.93471,0.88875,1184.426138,0.02892,341.5143,12.947073
std,6.52148,6.510154,107.193317,6.529762,108.232853,15.030706,14.361107,107.404251,15.69362,108.131289,...,1.188773,47.991089,9.634768,2.058636,108.138501,0.572011,108.187321,0.111226,170.025526,0.179187
min,2.582222,2.886667,327.535714,2.26,329.535714,31.697917,34.25,329.321429,29.114583,328.75,...,0.496429,83.927083,9.46875,1.3,329.785714,0.033684,327.214286,0.0,23.642857,12.460714
25%,9.896212,10.132955,1173.135417,9.662216,1174.036458,65.22653,67.300403,1174.041667,62.519677,1173.677083,...,1.849479,158.661458,14.836015,3.756771,1174.375,0.482292,1174.828125,0.0,217.207292,12.863355
50%,15.018182,15.325,1179.041667,14.720455,1179.229167,75.347368,76.822917,1180.333333,73.5,1179.53125,...,2.611458,193.927083,18.464286,5.116842,1179.510417,0.784375,1179.375,0.0,326.125263,12.89375
75%,20.705208,20.960938,1196.84375,20.431771,1198.661458,84.942708,86.203125,1198.552083,83.75,1197.854167,...,3.45625,223.161458,25.161458,6.452652,1198.130208,1.227083,1198.515625,0.003125,447.865552,12.990868
max,28.85,29.075,1558.84127,28.646875,1563.52381,99.28125,99.552083,1564.412698,99.104167,1558.126984,...,6.5375,331.957143,55.905263,11.67619,1562.539683,3.169792,1561.920635,1.148958,807.694118,13.535556


In [51]:
import pandas as pd
concat = [met04_df_naomit,met05_df_naomit, met06_df_naomit]
pd.concat(concat, axis = 0).drop("CLASS",axis = 1).to_csv(clean_path + "NOAA_NC_04_06_DAvg.csv")

Unnamed: 0_level_0,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,MinRHT,...,WSpd,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01/01/2004,8.296875,8.633333,1179.760417,7.971875,1179.260417,71.979167,74.145833,1179.739583,69.718750,1179.635417,...,1.309375,249.760417,17.250000,2.392708,1178.854167,0.435417,1179.000000,0.000000,204.306250,13.016667
01/02/2004,11.064583,11.410417,1179.791667,10.726042,1179.062500,71.291667,73.479167,1178.843750,68.895833,1180.656250,...,1.738542,249.572917,16.197917,3.071875,1180.062500,0.689583,1178.812500,0.003125,173.650000,12.969792
01/03/2004,15.534375,15.804167,1179.739583,15.269792,1178.666667,71.187500,72.218750,1180.729167,70.218750,1178.239583,...,3.062500,243.156250,12.020833,5.442708,1179.875000,1.251042,1179.062500,0.000000,205.387500,12.962500
01/04/2004,19.130208,19.364583,1178.572917,18.907292,1178.312500,74.093750,74.968750,1179.760417,73.239583,1179.531250,...,3.735417,236.166667,12.968750,7.070833,1178.927083,1.435417,1179.937500,0.000000,182.803125,12.883333
01/05/2004,19.667708,19.879167,1180.031250,19.453125,1179.864583,84.562500,85.166667,1179.812500,83.885417,1179.968750,...,5.346875,233.166667,13.656250,10.288542,1179.625000,1.967708,1179.843750,0.000000,204.194792,12.938542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10/07/2006,15.405208,15.645833,1196.468750,15.179167,1174.416667,81.364583,82.760417,1200.052083,79.927083,1172.197917,...,1.444792,283.281250,48.625000,3.889583,1197.166667,0.132292,1197.041667,0.000000,272.217708,12.862500
10/08/2006,20.536458,20.767708,1174.510417,20.289583,1194.750000,90.041667,91.468750,1176.041667,88.541667,1196.260417,...,5.857292,159.114583,17.531250,9.343750,1173.104167,2.856250,1200.187500,0.445833,141.280208,12.770833
10/09/2006,20.205208,20.440625,1171.625000,19.989583,1173.062500,91.114583,92.166667,1173.906250,90.031250,1194.114583,...,3.545833,288.104167,19.895833,6.727083,1197.937500,1.109375,1173.093750,0.278125,321.535417,12.838542
10/10/2006,21.179167,21.403125,1194.677083,20.956250,1174.843750,90.000000,90.927083,1174.791667,89.041667,1196.760417,...,2.318750,251.906250,22.041667,4.506250,1174.291667,0.673958,1170.604167,0.000000,271.919792,12.852083


In [57]:
water04_df = spark.read.csv(clean_path + "NOAA_NC_water_data_2004.csv",header=True, inferSchema=True)
water05_df = spark.read.csv(clean_path + "NOAA_NC_water_data_2005.csv",header=True, inferSchema=True)
water06_df = spark.read.csv(clean_path + "NOAA_NC_water_data_2006.csv",header=True, inferSchema=True)
water04_df = water04_df.toPandas()[["SMPLDATE","Temp"]]
water05_df = water05_df.toPandas()[["SMPLDATE","Temp"]]
water06_df = water06_df.toPandas()[["SMPLDATE","Temp"]]

In [58]:
water04_df_naomit = water04_df.dropna(axis = 0)
water04_df_naomit["date"] = water04_df_naomit["SMPLDATE"]
water04_df_naomit = water04_df_naomit.set_index('date')
water04_df_naomit = water04_df_naomit.groupby(level=0).mean()

In [60]:
water05_df_naomit = water05_df.dropna(axis = 0)
water05_df_naomit["date"] = water05_df_naomit["SMPLDATE"]
water05_df_naomit = water05_df_naomit.set_index('date')
water05_df_naomit = water05_df_naomit.groupby(level=0).mean()

In [61]:
water06_df_naomit = water06_df.dropna(axis = 0)
water06_df_naomit["date"] = water06_df_naomit["SMPLDATE"]
water06_df_naomit = water06_df_naomit.set_index('date')
water06_df_naomit = water06_df_naomit.groupby(level=0).mean()

In [67]:
pd.concat([met04_df_naomit.join(water04_df_naomit), met05_df_naomit.join(water05_df_naomit), met06_df_naomit.join(water06_df_naomit)], axis = 0).to_csv(clean_path + "NOAA_NC_DAvg_training_data.csv")

Unnamed: 0_level_0,CLASS,ATemp,MaxTemp,MaxTempT,MinTemp,MinTempT,RH,MaxRH,MaxRHT,MinRH,...,Wdir,SDWDir,MaxWSpd,MaxWSpdT,MinWSpd,MinWSpdT,TotPrcp,TotPAR,AvgVolt,Temp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01/01/2004,15.0,8.296875,8.633333,1179.760417,7.971875,1179.260417,71.979167,74.145833,1179.739583,69.718750,...,249.760417,17.250000,2.392708,1178.854167,0.435417,1179.000000,0.000000,204.306250,13.016667,10.641146
01/02/2004,15.0,11.064583,11.410417,1179.791667,10.726042,1179.062500,71.291667,73.479167,1178.843750,68.895833,...,249.572917,16.197917,3.071875,1180.062500,0.689583,1178.812500,0.003125,173.650000,12.969792,11.089062
01/03/2004,15.0,15.534375,15.804167,1179.739583,15.269792,1178.666667,71.187500,72.218750,1180.729167,70.218750,...,243.156250,12.020833,5.442708,1179.875000,1.251042,1179.062500,0.000000,205.387500,12.962500,12.167708
01/04/2004,15.0,19.130208,19.364583,1178.572917,18.907292,1178.312500,74.093750,74.968750,1179.760417,73.239583,...,236.166667,12.968750,7.070833,1178.927083,1.435417,1179.937500,0.000000,182.803125,12.883333,13.960937
01/05/2004,15.0,19.667708,19.879167,1180.031250,19.453125,1179.864583,84.562500,85.166667,1179.812500,83.885417,...,233.166667,13.656250,10.288542,1179.625000,1.967708,1179.843750,0.000000,204.194792,12.938542,15.548437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10/07/2006,,15.405208,15.645833,1196.468750,15.179167,1174.416667,81.364583,82.760417,1200.052083,79.927083,...,283.281250,48.625000,3.889583,1197.166667,0.132292,1197.041667,0.000000,272.217708,12.862500,22.306250
10/08/2006,,20.536458,20.767708,1174.510417,20.289583,1194.750000,90.041667,91.468750,1176.041667,88.541667,...,159.114583,17.531250,9.343750,1173.104167,2.856250,1200.187500,0.445833,141.280208,12.770833,22.157552
10/09/2006,,20.205208,20.440625,1171.625000,19.989583,1173.062500,91.114583,92.166667,1173.906250,90.031250,...,288.104167,19.895833,6.727083,1197.937500,1.109375,1173.093750,0.278125,321.535417,12.838542,22.305469
10/10/2006,,21.179167,21.403125,1194.677083,20.956250,1174.843750,90.000000,90.927083,1174.791667,89.041667,...,251.906250,22.041667,4.506250,1174.291667,0.673958,1170.604167,0.000000,271.919792,12.852083,22.800000
