In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
import math

In [2]:
with open('../../data/output/training_data/annual_mean_training_dataset_08-11-2020.json', 'r') as f:
    data = json.load(f)

filtered_data = [item for item in data 
  if item["median_elevation"] != None and
  item["MEAN"] < 100 and
  item["annual_precipitation"] < 6000 and 
  item["drainage_area"] < 200
]

df = pd.DataFrame(filtered_data)
df.head()

Unnamed: 0,STATION_NUMBER,YEAR,MEAN,MIN_MONTH,MIN,MAX_MONTH,MAX,average_slope,temperature_data,glacial_coverage,...,hydrological_zone,annual_precipitation,median_elevation,aspect,solar_exposure,LATITUDE,LONGITUDE,drainage_area,DRAINAGE_AREA_GROSS,FEATURE_AREA_SQM
0,07EC002,2011,2.23,4.0,1.633,6.0,3.987,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,8,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0
1,07EC002,2012,2.179,4.0,1.686,6.0,4.356,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,8,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0
2,07EC002,2013,2.058,4.0,1.626,6.0,3.67,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,8,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0
3,07EC002,2014,2.104,9.0,1.694,5.0,3.599,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,8,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0
4,07EC002,2015,2.223,4.0,1.777,5.0,4.075,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,8,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0


In [3]:
temperature = [] 
for temps in df["temperature_data"]:
    temperature.append(sum([monthTemps[2] for monthTemps in temps])/12)

df["temperature_data(average field)"] = temperature

In [4]:
df.head()

Unnamed: 0,STATION_NUMBER,YEAR,MEAN,MIN_MONTH,MIN,MAX_MONTH,MAX,average_slope,temperature_data,glacial_coverage,...,annual_precipitation,median_elevation,aspect,solar_exposure,LATITUDE,LONGITUDE,drainage_area,DRAINAGE_AREA_GROSS,FEATURE_AREA_SQM,temperature_data(average field)
0,07EC002,2011,2.23,4.0,1.633,6.0,3.987,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0,0.523384
1,07EC002,2012,2.179,4.0,1.686,6.0,4.356,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0,0.523384
2,07EC002,2013,2.058,4.0,1.626,6.0,3.67,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0,0.523384
3,07EC002,2014,2.104,9.0,1.694,5.0,3.599,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0,0.523384
4,07EC002,2015,2.223,4.0,1.777,5.0,4.075,9.043738,"[[-19.48458172733333, -10.212862086000001, -14...",0.0,...,557.153247,978.77699,100.059875,0.034945,55.916859,-124.567581,56.854111,5560.0,56854110.0,0.523384


In [5]:
features_df = df[['annual_precipitation', 'drainage_area', 'median_elevation', 'temperature_data(average field)', 'MEAN']]
features_df.head()

Unnamed: 0,annual_precipitation,drainage_area,median_elevation,temperature_data(average field),MEAN
0,557.153247,56.854111,978.77699,0.523384,2.23
1,557.153247,56.854111,978.77699,0.523384,2.179
2,557.153247,56.854111,978.77699,0.523384,2.058
3,557.153247,56.854111,978.77699,0.523384,2.104
4,557.153247,56.854111,978.77699,0.523384,2.223


In [6]:
X = features_df.drop(['MEAN'], axis=1)
y = features_df.MEAN

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [8]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

dtr_pred = dtr.predict(X_test)

print('Score Decision Tree Regressor', dtr.score(X_test, y_test))

# Mean square error Decision Tree Regressor
dtr_mse = MSE(y_test, dtr_pred)
print('Mean Suqare Error Decision Tree Regressor', dtr_mse)

# Root mean square error Decision Tree Regressor
dtr_rmse = math.sqrt(dtr_mse)
print('Root Mean Suqare Error Decision Tree Regressor', dtr_rmse)

Score Decision Tree Regressor 0.7136752954164036
Mean Suqare Error Decision Tree Regressor 112.0202115837864
Root Mean Suqare Error Decision Tree Regressor 10.583960108758271


In [9]:
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)

xgb_pred = xgb.predict(X_test)

print('Score XGBoost Regressor', xgb.score(X_test, y_test))

# Mean square error XGBoost Regressor
xgb_mse = MSE(y_test, xgb_pred)
print('Mean Suqare Error XGBoost Regressor', xgb_mse)

# Root mean square error XGBoost Regressor
xgb_rmse = math.sqrt(xgb_mse)
print('Root Mean Suqare Error XGBoost Regressor', xgb_rmse)

Score XGBoost Regressor 0.7132050168236201
Mean Suqare Error XGBoost Regressor 112.20420097283872
Root Mean Suqare Error XGBoost Regressor 10.592648439971882


In [10]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

rfr_pred = rfr.predict(X_test)

print('Score Decision Tree Regressor', rfr.score(X_test, y_test))

# Mean square error Random Forest Regressor
rfr_mse = MSE(y_test, rfr_pred)
print('Mean Suqare Error Random Forest Regressor', rfr_mse)

# Root mean square error Random Forest Regressor
rfr_rmse = math.sqrt(rfr_mse)
print('Root Mean Suqare Error Random Forest Regressor', rfr_rmse)

Score Decision Tree Regressor 0.7119864910392362
Mean Suqare Error Random Forest Regressor 112.68093076248616
Root Mean Suqare Error Random Forest Regressor 10.615127449187135


In [11]:
cat = CatBoostRegressor(random_state=42)
cat.fit(X_train, y_train)

Learning rate set to 0.048743
0:	learn: 18.9008460	total: 63.8ms	remaining: 1m 3s
1:	learn: 18.7401173	total: 98.7ms	remaining: 49.2s
2:	learn: 18.6040846	total: 145ms	remaining: 48.1s
3:	learn: 18.4596040	total: 170ms	remaining: 42.2s
4:	learn: 18.3436240	total: 215ms	remaining: 42.8s
5:	learn: 18.2191038	total: 226ms	remaining: 37.5s
6:	learn: 18.1373160	total: 237ms	remaining: 33.6s
7:	learn: 18.0374749	total: 255ms	remaining: 31.6s
8:	learn: 17.9564732	total: 279ms	remaining: 30.7s
9:	learn: 17.8725837	total: 290ms	remaining: 28.7s
10:	learn: 17.7796539	total: 302ms	remaining: 27.2s
11:	learn: 17.6755261	total: 320ms	remaining: 26.4s
12:	learn: 17.5379128	total: 351ms	remaining: 26.6s
13:	learn: 17.4423333	total: 380ms	remaining: 26.8s
14:	learn: 17.3525012	total: 406ms	remaining: 26.6s
15:	learn: 17.2835301	total: 433ms	remaining: 26.6s
16:	learn: 17.2025839	total: 445ms	remaining: 25.7s
17:	learn: 17.1321278	total: 488ms	remaining: 26.6s
18:	learn: 17.0358762	total: 500ms	remaini

157:	learn: 12.4951353	total: 2.29s	remaining: 12.2s
158:	learn: 12.4727106	total: 2.3s	remaining: 12.2s
159:	learn: 12.4553492	total: 2.33s	remaining: 12.2s
160:	learn: 12.4347250	total: 2.34s	remaining: 12.2s
161:	learn: 12.4152433	total: 2.36s	remaining: 12.2s
162:	learn: 12.4013685	total: 2.37s	remaining: 12.2s
163:	learn: 12.3946825	total: 2.38s	remaining: 12.2s
164:	learn: 12.3836243	total: 2.39s	remaining: 12.1s
165:	learn: 12.3699709	total: 2.4s	remaining: 12.1s
166:	learn: 12.3526216	total: 2.42s	remaining: 12.1s
167:	learn: 12.3442252	total: 2.42s	remaining: 12s
168:	learn: 12.3331947	total: 2.43s	remaining: 12s
169:	learn: 12.3222994	total: 2.44s	remaining: 11.9s
170:	learn: 12.3095348	total: 2.45s	remaining: 11.9s
171:	learn: 12.2962306	total: 2.46s	remaining: 11.9s
172:	learn: 12.2877105	total: 2.47s	remaining: 11.8s
173:	learn: 12.2750231	total: 2.49s	remaining: 11.8s
174:	learn: 12.2632798	total: 2.5s	remaining: 11.8s
175:	learn: 12.2564137	total: 2.53s	remaining: 11.8s


317:	learn: 11.0814633	total: 4.4s	remaining: 9.44s
318:	learn: 11.0786263	total: 4.41s	remaining: 9.42s
319:	learn: 11.0708579	total: 4.43s	remaining: 9.41s
320:	learn: 11.0651242	total: 4.5s	remaining: 9.53s
321:	learn: 11.0620653	total: 4.54s	remaining: 9.57s
322:	learn: 11.0580564	total: 4.55s	remaining: 9.54s
323:	learn: 11.0533801	total: 4.57s	remaining: 9.54s
324:	learn: 11.0475773	total: 4.58s	remaining: 9.52s
325:	learn: 11.0441738	total: 4.6s	remaining: 9.5s
326:	learn: 11.0418725	total: 4.61s	remaining: 9.49s
327:	learn: 11.0372418	total: 4.63s	remaining: 9.48s
328:	learn: 11.0307522	total: 4.64s	remaining: 9.46s
329:	learn: 11.0286459	total: 4.65s	remaining: 9.44s
330:	learn: 11.0246383	total: 4.67s	remaining: 9.44s
331:	learn: 11.0190778	total: 4.71s	remaining: 9.47s
332:	learn: 11.0142754	total: 4.74s	remaining: 9.5s
333:	learn: 11.0106003	total: 4.77s	remaining: 9.52s
334:	learn: 11.0059471	total: 4.83s	remaining: 9.6s
335:	learn: 11.0040198	total: 4.95s	remaining: 9.78s

477:	learn: 10.6430620	total: 7.54s	remaining: 8.23s
478:	learn: 10.6418905	total: 7.56s	remaining: 8.23s
479:	learn: 10.6401676	total: 7.58s	remaining: 8.21s
480:	learn: 10.6375741	total: 7.59s	remaining: 8.19s
481:	learn: 10.6366210	total: 7.6s	remaining: 8.17s
482:	learn: 10.6342903	total: 7.62s	remaining: 8.16s
483:	learn: 10.6330876	total: 7.64s	remaining: 8.14s
484:	learn: 10.6315146	total: 7.67s	remaining: 8.15s
485:	learn: 10.6296373	total: 7.7s	remaining: 8.14s
486:	learn: 10.6280345	total: 7.71s	remaining: 8.12s
487:	learn: 10.6270656	total: 7.74s	remaining: 8.12s
488:	learn: 10.6258856	total: 7.76s	remaining: 8.11s
489:	learn: 10.6238867	total: 7.78s	remaining: 8.1s
490:	learn: 10.6230422	total: 7.79s	remaining: 8.08s
491:	learn: 10.6217576	total: 7.81s	remaining: 8.06s
492:	learn: 10.6197957	total: 7.83s	remaining: 8.05s
493:	learn: 10.6182596	total: 7.84s	remaining: 8.03s
494:	learn: 10.6173555	total: 7.86s	remaining: 8.01s
495:	learn: 10.6166377	total: 7.87s	remaining: 7.

642:	learn: 10.4802115	total: 10.5s	remaining: 5.8s
643:	learn: 10.4787772	total: 10.5s	remaining: 5.79s
644:	learn: 10.4774818	total: 10.5s	remaining: 5.77s
645:	learn: 10.4768806	total: 10.5s	remaining: 5.75s
646:	learn: 10.4761386	total: 10.5s	remaining: 5.73s
647:	learn: 10.4752472	total: 10.5s	remaining: 5.71s
648:	learn: 10.4747743	total: 10.5s	remaining: 5.69s
649:	learn: 10.4742728	total: 10.5s	remaining: 5.67s
650:	learn: 10.4738964	total: 10.5s	remaining: 5.66s
651:	learn: 10.4729130	total: 10.6s	remaining: 5.64s
652:	learn: 10.4725321	total: 10.6s	remaining: 5.62s
653:	learn: 10.4718738	total: 10.6s	remaining: 5.6s
654:	learn: 10.4716490	total: 10.6s	remaining: 5.58s
655:	learn: 10.4708935	total: 10.6s	remaining: 5.56s
656:	learn: 10.4704935	total: 10.6s	remaining: 5.54s
657:	learn: 10.4698724	total: 10.6s	remaining: 5.52s
658:	learn: 10.4690386	total: 10.6s	remaining: 5.5s
659:	learn: 10.4678474	total: 10.7s	remaining: 5.49s
660:	learn: 10.4676639	total: 10.7s	remaining: 5.

804:	learn: 10.4141203	total: 14.4s	remaining: 3.48s
805:	learn: 10.4137931	total: 14.4s	remaining: 3.46s
806:	learn: 10.4135797	total: 14.4s	remaining: 3.45s
807:	learn: 10.4132230	total: 14.4s	remaining: 3.43s
808:	learn: 10.4130663	total: 14.5s	remaining: 3.42s
809:	learn: 10.4129091	total: 14.5s	remaining: 3.4s
810:	learn: 10.4126840	total: 14.5s	remaining: 3.38s
811:	learn: 10.4123564	total: 14.5s	remaining: 3.35s
812:	learn: 10.4121560	total: 14.5s	remaining: 3.34s
813:	learn: 10.4120183	total: 14.5s	remaining: 3.32s
814:	learn: 10.4118753	total: 14.5s	remaining: 3.3s
815:	learn: 10.4116844	total: 14.5s	remaining: 3.28s
816:	learn: 10.4113384	total: 14.6s	remaining: 3.26s
817:	learn: 10.4110450	total: 14.6s	remaining: 3.24s
818:	learn: 10.4106922	total: 14.6s	remaining: 3.23s
819:	learn: 10.4105023	total: 14.7s	remaining: 3.22s
820:	learn: 10.4102750	total: 14.7s	remaining: 3.2s
821:	learn: 10.4101543	total: 14.7s	remaining: 3.19s
822:	learn: 10.4099500	total: 14.7s	remaining: 3.

966:	learn: 10.3874629	total: 17.8s	remaining: 606ms
967:	learn: 10.3873465	total: 17.8s	remaining: 587ms
968:	learn: 10.3872660	total: 17.8s	remaining: 569ms
969:	learn: 10.3870995	total: 17.8s	remaining: 551ms
970:	learn: 10.3869177	total: 17.8s	remaining: 533ms
971:	learn: 10.3868443	total: 17.8s	remaining: 514ms
972:	learn: 10.3867040	total: 17.9s	remaining: 496ms
973:	learn: 10.3865843	total: 17.9s	remaining: 477ms
974:	learn: 10.3865229	total: 17.9s	remaining: 458ms
975:	learn: 10.3863802	total: 17.9s	remaining: 440ms
976:	learn: 10.3862778	total: 17.9s	remaining: 421ms
977:	learn: 10.3860793	total: 17.9s	remaining: 403ms
978:	learn: 10.3859215	total: 17.9s	remaining: 385ms
979:	learn: 10.3858398	total: 17.9s	remaining: 366ms
980:	learn: 10.3857766	total: 18s	remaining: 348ms
981:	learn: 10.3856467	total: 18s	remaining: 330ms
982:	learn: 10.3855938	total: 18s	remaining: 311ms
983:	learn: 10.3853554	total: 18s	remaining: 293ms
984:	learn: 10.3852754	total: 18s	remaining: 274ms
985

<catboost.core.CatBoostRegressor at 0x7f4b6a526970>

In [12]:
cat_pred = cat.predict(X_test)

print('Score CatBoost Regressor', cat.score(X_test, y_test))

# Mean square error CatBoost Regressor
cat_mse = MSE(y_test, cat_pred)
print('Mean Suqare Error CatBoost Regressor', cat_mse)

# Root mean square error CatBoost Regressor
cat_rmse = math.sqrt(cat_mse)
print('Root Mean Suqare Error CatBoost Regressor', cat_rmse)

Score CatBoost Regressor 0.7093640752616514
Mean Suqare Error CatBoost Regressor 113.70691128586758
Root Mean Suqare Error CatBoost Regressor 10.663344282441019
