In [11]:
from readdata import create_df

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score

import numpy as np
import pandas as pd

In [4]:
# Dataframe creation, included other locations as options
# OG Datasets

CLL = create_df('CLL') # College Station
#AUS = create_df('AUS') # Austin
#DFW = create_df('DFW') # Dallas
#IAH = create_df('IAH') # Houston

In [5]:
# Suggested data mod 1 - Remove 2nd and 3rd Cloud Heights from data frame

# Reasoning: Precipiation must fall from a cloud (straitform or cumuliform) and these clouds are usually the lowest ones
# detected by the ceilometer as they are dense/thick/and filled with soon to be falling precipitation. Would stand to reason
# the other two cloud heights detected are repeats of the same layer and add no benefit but repitition of the same cell.

CLL_mod1 = CLL[['Temp [F]','DP [F]','RH [%]','W Dir [Deg]','W Spd [Kts]','Alt [inHg]','1Hr-Prcp [mm]',
              'Vis [mi]','Cld Hgt1 [Ft]']]
CLL_mod1

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi],Cld Hgt1 [Ft]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-01,61.630000,48.920000,64.916111,108.888889,6.666667,30.008333,,9.944444,2025.000000
2000-01-02,71.223125,63.213125,76.888750,181.000000,9.906250,29.921875,,10.000000,2553.333333
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.000100,9.862069,3090.476190
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305000,,9.576923,2550.000000
2000-01-05,36.455000,20.367500,58.012083,83.333333,4.291667,30.352083,,10.000000,
...,...,...,...,...,...,...,...,...,...
2019-12-26,61.800000,55.786667,82.287000,115.632911,4.313291,30.018522,0.000000,6.517013,1571.723005
2019-12-27,62.773333,56.513333,81.028333,80.314465,4.566038,30.057893,0.000000,8.410377,2811.154930
2019-12-28,67.683333,63.547222,87.050556,148.495298,9.901235,29.838981,0.214933,7.794753,2070.902527
2019-12-29,60.873077,49.715385,70.033462,229.901961,7.280255,29.825446,0.114548,9.955414,5148.046512


In [116]:
# Suggested data mod 2 - Remove all Cloud Heights from data frame

# Reasoning: Magnitude difference is fairly severe in comparison, however, I do think it is important to capture data points
# from the cloud/layer in some way and this is the last direct data point from clouds as the present weather and sky cover
# string data points are already removed.

CLL_mod2 = CLL[['Temp [F]','DP [F]','RH [%]','W Dir [Deg]','W Spd [Kts]','Alt [inHg]','1Hr-Prcp [mm]',
              'Vis [mi]']]
CLL_mod2

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-01,61.630000,48.920000,64.916111,108.888889,6.666667,30.008333,,9.944444
2000-01-02,71.223125,63.213125,76.888750,181.000000,9.906250,29.921875,,10.000000
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.000100,9.862069
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305000,,9.576923
2000-01-05,36.455000,20.367500,58.012083,83.333333,4.291667,30.352083,,10.000000
...,...,...,...,...,...,...,...,...
2019-12-26,61.800000,55.786667,82.287000,115.632911,4.313291,30.018522,0.000000,6.517013
2019-12-27,62.773333,56.513333,81.028333,80.314465,4.566038,30.057893,0.000000,8.410377
2019-12-28,67.683333,63.547222,87.050556,148.495298,9.901235,29.838981,0.214933,7.794753
2019-12-29,60.873077,49.715385,70.033462,229.901961,7.280255,29.825446,0.114548,9.955414


- Replace NaN values by yearly mean values : Similar to Dylan's idea ,but I took the yearly average for each column.

In [222]:
#Substitude NaN values to Yearly mean
df=CLL_mod2
CLL_mod4=df.groupby([df.index.year]).transform(lambda x: x.fillna(x.mean()))
CLL_mod4.head()

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-01,61.63,48.92,64.916111,108.888889,6.666667,30.008333,0.835972,9.944444
2000-01-02,71.223125,63.213125,76.88875,181.0,9.90625,29.921875,0.835972,10.0
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.0001,9.862069
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305,0.835972,9.576923
2000-01-05,36.455,20.3675,58.012083,83.333333,4.291667,30.352083,0.835972,10.0


In [230]:
X_mod4 = normalize(CLL_mod4[:-1].values)
y = CLL_mod4['1Hr-Prcp [mm]'][1:].values
X_train, X_test, y_train, y_test = train_test_split(X_mod4, y)

reg = linear_model.LinearRegression()
fit = reg.fit(X_train, y_train)

# This is what we want to predict
y = CLL_mod4['1Hr-Prcp [mm]'][1:].values
y_predict = fit.predict(X_test)
y_persist = CLL_mod4['1Hr-Prcp [mm]'][:-1].values

print(r2_score(y_test, y_predict))
print(r2_score(y, y_persist))

0.1544160618451248
-0.3179680114833907
