In [1]:
from readdata import create_df

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score

In [2]:
# Dataframe creation, included other locations as options
# OG Datasets

CLL = create_df('CLL') # College Station
#AUS = create_df('AUS') # Austin
#DFW = create_df('DFW') # Dallas
#IAH = create_df('IAH') # Houston

CLL

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi],Cld Hgt1 [Ft],Cld Hgt2 [Ft],Cld Hgt3 [Ft]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-01-01,61.630000,48.920000,64.916111,108.888889,6.666667,30.008333,,9.944444,2025.000000,,
2000-01-02,71.223125,63.213125,76.888750,181.000000,9.906250,29.921875,,10.000000,2553.333333,3029.166667,3650.000000
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.000100,9.862069,3090.476190,3558.823529,4733.333333
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305000,,9.576923,2550.000000,,
2000-01-05,36.455000,20.367500,58.012083,83.333333,4.291667,30.352083,,10.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-26,61.800000,55.786667,82.287000,115.632911,4.313291,30.018522,0.000000,6.517013,1571.723005,4541.868421,5037.500000
2019-12-27,62.773333,56.513333,81.028333,80.314465,4.566038,30.057893,0.000000,8.410377,2811.154930,2854.716667,3899.888889
2019-12-28,67.683333,63.547222,87.050556,148.495298,9.901235,29.838981,0.214933,7.794753,2070.902527,4642.465347,8727.659574
2019-12-29,60.873077,49.715385,70.033462,229.901961,7.280255,29.825446,0.114548,9.955414,5148.046512,5389.966667,4128.571429


In [3]:
# Suggested data mod 1 - Remove 2nd and 3rd Cloud Heights from data frame

# Reasoning: Precipiation must fall from a cloud (straitform or cumuliform) and these clouds are usually the lowest ones
# detected by the ceilometer as they are dense/thick/and filled with soon to be falling precipitation. Would stand to reason
# the other two cloud heights detected are repeats of the same layer and add no benefit but repitition of the same cell.

CLL_mod1 = CLL[['Temp [F]','DP [F]','RH [%]','W Dir [Deg]','W Spd [Kts]','Alt [inHg]','1Hr-Prcp [mm]',
              'Vis [mi]','Cld Hgt1 [Ft]']]
CLL_mod1

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi],Cld Hgt1 [Ft]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-01,61.630000,48.920000,64.916111,108.888889,6.666667,30.008333,,9.944444,2025.000000
2000-01-02,71.223125,63.213125,76.888750,181.000000,9.906250,29.921875,,10.000000,2553.333333
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.000100,9.862069,3090.476190
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305000,,9.576923,2550.000000
2000-01-05,36.455000,20.367500,58.012083,83.333333,4.291667,30.352083,,10.000000,
...,...,...,...,...,...,...,...,...,...
2019-12-26,61.800000,55.786667,82.287000,115.632911,4.313291,30.018522,0.000000,6.517013,1571.723005
2019-12-27,62.773333,56.513333,81.028333,80.314465,4.566038,30.057893,0.000000,8.410377,2811.154930
2019-12-28,67.683333,63.547222,87.050556,148.495298,9.901235,29.838981,0.214933,7.794753,2070.902527
2019-12-29,60.873077,49.715385,70.033462,229.901961,7.280255,29.825446,0.114548,9.955414,5148.046512


In [25]:
# Suggested data mod 2 - Remove all Cloud Heights from data frame

# Reasoning: Magnitude difference is fairly severe in comparison, however, I do think it is important to capture data points
# from the cloud/layer in some way and this is the last direct data point from clouds as the present weather and sky cover
# string data points are already removed.

CLL_mod2 = CLL[['Temp [F]','DP [F]','RH [%]','W Dir [Deg]','W Spd [Kts]','Alt [inHg]','1Hr-Prcp [mm]',
              'Vis [mi]']]
CLL_mod2

Unnamed: 0_level_0,Temp [F],DP [F],RH [%],W Dir [Deg],W Spd [Kts],Alt [inHg],1Hr-Prcp [mm],Vis [mi]
Date [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-01,61.630000,48.920000,64.916111,108.888889,6.666667,30.008333,0.000000,9.944444
2000-01-02,71.223125,63.213125,76.888750,181.000000,9.906250,29.921875,0.000000,10.000000
2000-01-03,69.756552,46.933793,53.424828,228.518519,9.758621,29.839655,0.000100,9.862069
2000-01-04,47.978462,20.473077,37.178077,316.666667,9.730769,30.305000,0.000000,9.576923
2000-01-05,36.455000,20.367500,58.012083,83.333333,4.291667,30.352083,0.000000,10.000000
...,...,...,...,...,...,...,...,...
2019-12-26,61.800000,55.786667,82.287000,115.632911,4.313291,30.018522,0.000000,6.517013
2019-12-27,62.773333,56.513333,81.028333,80.314465,4.566038,30.057893,0.000000,8.410377
2019-12-28,67.683333,63.547222,87.050556,148.495298,9.901235,29.838981,0.214933,7.794753
2019-12-29,60.873077,49.715385,70.033462,229.901961,7.280255,29.825446,0.114548,9.955414


In [5]:
# Replaces NaNs in all datasets
CLL = CLL.fillna(0.0)
CLL_mod1 = CLL_mod1.fillna(0.0)
CLL_mod2 = CLL_mod2.fillna(0.0)

# Get data for 'today' (exclude last data point), and normalize
# this is the input data
X = normalize(CLL[:-1].values)
X_mod1 = normalize(CLL_mod1[:-1].values)
X_mod2 = normalize(CLL_mod2[:-1].values)

# Get precipitation data for 'tomorrow'
# This is what we want to predict
y = CLL['1Hr-Prcp [mm]'][1:].values

# Create a prediction based on persistance
# (Rain today is predicted to rain tomorrow)
y_persist = CLL['1Hr-Prcp [mm]'][:-1].values

In [19]:
# Testing

#X_train, X_test, y_train, y_test = train_test_split(X, y)
#X_train, X_test, y_train, y_test = train_test_split(X_mod1, y)
X_train, X_test, y_train, y_test = train_test_split(X_mod2, y)

In [20]:
# Regression

reg = linear_model.LinearRegression()
fit = reg.fit(X_train, y_train)

In [21]:
# Manual COEF review

fit.coef_

array([-4.02395626,  6.01084067,  0.25424643,  0.65615347,  7.79478   ,
        2.61220925, 14.00509821, -0.3676092 ])

In [22]:
# Prediction

y_predict = fit.predict(X_test)

In [10]:
y.shape

(7303,)

In [11]:
import numpy as np
y_rand = np.random.rand(7303)

In [12]:
# Evaluation - OG Dataset

print(r2_score(y_test, y_predict))
print(r2_score(y, y_persist))
print(r2_score(y, y_rand))

0.011585556813760278
-0.8025923866275462
-0.04055385703434533


In [18]:
# Evaluation - Mod 1 Dataset

print(r2_score(y_test, y_predict))
print(r2_score(y, y_persist))
print(r2_score(y, y_rand))

0.0150781980643796
-0.8025923866275462
-0.04055385703434533


In [24]:
# Evaluation - Mod 2 Dataset

print(r2_score(y_test, y_predict))
print(r2_score(y, y_persist))
print(r2_score(y, y_rand))

# Results show this method/dataset is probably the best method based off the scores

0.02238020454579126
-0.8025923866275462
-0.04055385703434533
