Merge pull request #436 from brightwind-dev/dev
Pull request for v2.2.0 release
stephenholleran committed May 14, 2024
2 parents 968b20c + 470c15a commit 834be32
Showing 12 changed files with 578 additions and 155 deletions.
CHANGELOG.md (10 additions, 0 deletions)
@@ -9,6 +9,16 @@ Given a version number MAJOR.MINOR.PATCH, increment the:

Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format.

+## [2.2.0]
+1. Modify `Correl.OrdinaryLeastSquares()` to optionally force the intercept to pass through the origin (Issue [#412](https://github.com/brightwind-dev/brightwind/issues/412)).
+1. Update `LoadBrightHub.get_data()` to use a new API (Issue [#419](https://github.com/brightwind-dev/brightwind/issues/419)).
+1. Add new function `LoadBrightHub.get_cleaning_log()` to pull the cleaning log for a particular measurement station on BrightHub (Issue [#405](https://github.com/brightwind-dev/brightwind/issues/405)).
+1. Add new function `LoadBrightHub.get_reanalysis()` to pull reanalysis datasets from BrightHub (Issue [#431](https://github.com/brightwind-dev/brightwind/issues/431)).
+1. Modify `load.apply_cleaning()` and `apply_cleaning_windographer()` to clean the columns specified in a cleaning file by matching the sensor name from the beginning of the column name (Issue [#249](https://github.com/brightwind-dev/brightwind/issues/249)).
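As a quick orientation for the first item above, a minimal usage sketch (the `m2_ne` and `data` dataset names are placeholders borrowed from the docstring examples further down this diff):

import brightwind as bw

# Fit a monthly correlation whose regression line is forced through the origin
# (new in v2.2.0); 'm2_ne' and 'data' are placeholder datasets.
ols_cor = bw.Correl.OrdinaryLeastSquares(m2_ne['WS50m_m/s'], data['Spd80mN'],
                                         averaging_prd='1M', coverage_threshold=0.95,
                                         forced_intercept_origin=True)
ols_cor.run()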



## [2.1.0]
1. Update behaviour of `time_continuity_gaps` to find any gap that
is not equal to the derived temporal resolution.
LICENSE.txt (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
MIT License

-Copyright (c) 2023 Stephen Holleran
+Copyright (c) 2024 Stephen Holleran

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
brightwind/__init__.py (1 addition, 1 deletion)
@@ -12,4 +12,4 @@

__all__ = ['analyse', 'transform', 'export', 'load', 'demo_datasets']

-__version__ = '2.1.0'
+__version__ = '2.2.0'
brightwind/analyse/analyse.py (7 additions, 4 deletions)
@@ -1325,10 +1325,13 @@ def time_continuity_gaps(data):
    The gaps are defined by showing the start and end timestamps just before and after the missing data periods.
-    A missing data period is one where data is not available for some consecutive timestamps. This breaks
-    time continuity of the data. The function derives the temporal resolution of the data by
-    finding the most common time difference between consecutive timestamps. Then it searches where the time
-    difference between consecutive timestamps does not match the resolution, this is the missing data period.
+    A missing data period is one where data is not available for some consecutive timestamps. Also, where a
+    timestamp exists for a row of data but all values in that row are NaNs, the row is considered a time
+    continuity gap, as it represents a break in the ordinary functioning of the logging unit.
+    The function derives the temporal resolution of the data by finding the most common time difference between
+    consecutive timestamps. It then searches for places where the time difference between consecutive
+    timestamps does not match the resolution; these are the missing data periods.
    It returns a DataFrame where the first column is the starting timestamp of the missing period (timestamp recorded
    immediately before the gap) and the second column is the end date of the missing period (timestamp recorded
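A minimal sketch of the gap-finding logic this docstring describes (illustrative only, not the library's implementation; assumes `data` is a DataFrame with a DatetimeIndex):

import pandas as pd

def sketch_time_gaps(data):
    # Rows where every value is NaN count as gaps too, per the docstring above.
    data = data.dropna(how='all')
    diffs = data.index.to_series().diff()
    resolution = diffs.mode()[0]  # most common time step = derived temporal resolution
    breaks = diffs[diffs.notna() & (diffs != resolution)]
    return pd.DataFrame({'Date From': breaks.index - breaks.values,  # timestamp just before each gap
                         'Date To': breaks.index})                   # timestamp just after each gap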
brightwind/analyse/correlation.py (49 additions, 15 deletions)
@@ -18,7 +18,8 @@

class CorrelBase:
    def __init__(self, ref_spd, target_spd, averaging_prd, coverage_threshold=None, ref_dir=None, target_dir=None,
-                 sectors=12, direction_bin_array=None, ref_aggregation_method='mean', target_aggregation_method='mean'):
+                 sectors=12, direction_bin_array=None, ref_aggregation_method='mean', target_aggregation_method='mean',
+                 forced_intercept_origin=False):

        self.ref_spd = ref_spd
        self.ref_dir = ref_dir
@@ -28,6 +29,7 @@ def __init__(self, ref_spd, target_spd, averaging_prd, coverage_threshold=None,
        self.coverage_threshold = coverage_threshold
        self.ref_aggregation_method = ref_aggregation_method
        self.target_aggregation_method = target_aggregation_method
+        self.forced_intercept_origin = forced_intercept_origin
        # Get the name of the columns so they can be passed around
        self._ref_spd_col_name = ref_spd.name if ref_spd is not None and isinstance(ref_spd, pd.Series) else None
        self._ref_spd_col_names = ref_spd.columns if ref_spd is not None and isinstance(ref_spd, pd.DataFrame) else None
@@ -142,10 +144,16 @@ def plot(self, figure_size=(10, 10.2)):
line_of_slope_1=True, figure_size=figure_size)

    @staticmethod
-    def _get_r2(target_spd, predict_spd):
+    def _get_r2(target_spd, predict_spd, forced_intercept_origin):
        """Returns the r2 score of the model"""
-        return 1.0 - (sum((target_spd - predict_spd) ** 2) /
-                      (sum((target_spd - target_spd.mean()) ** 2)))
+        if forced_intercept_origin:
+            x = np.nan_to_num(predict_spd.values.flatten()[:, np.newaxis])
+            y = np.nan_to_num(target_spd.values.flatten())
+            p, res = lstsq(x, y)[0:2]
+            return 1 - res / (y.size * y.var())
+        else:
+            return 1.0 - (sum((target_spd - predict_spd) ** 2) /
+                          (sum((target_spd - target_spd.mean()) ** 2)))
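Both branches above compute R2 = 1 - SS_res / SS_tot: in the forced-origin branch, `res` returned by `lstsq` is the sum of squared residuals and `y.size * y.var()` equals SS_tot, since `var()` is the population variance. A quick numeric sanity check (illustrative values only):

import numpy as np

y_true = np.array([2.0, 4.0, 6.0])
y_pred = np.array([2.1, 3.9, 6.2])
ss_res = np.sum((y_true - y_pred) ** 2)         # 0.06
ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # 8.0, same as y_true.size * y_true.var()
print(1 - ss_res / ss_tot)                      # 0.9925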

    @staticmethod
    def _get_logic_dir_sector(ref_dir, sector_min, sector_max):
@@ -345,6 +353,8 @@ class OrdinaryLeastSquares(CorrelBase):
        median, product, summation, standard deviation, variance, maximum and minimum
        respectively.
    :type target_aggregation_method: str
+    :param forced_intercept_origin: Default False; if set to True, forces the regression line to pass through the origin (0, 0).
+    :type forced_intercept_origin: boolean
    :returns: An object representing ordinary least squares fit model
    **Example usage**
@@ -398,35 +408,56 @@ class OrdinaryLeastSquares(CorrelBase):
                                                 averaging_prd='1H', coverage_threshold=0,
                                                 ref_aggregation_method='min', target_aggregation_method='min')
+        # Correlate wind speeds on a monthly basis and force the intercept through the origin.
+        ols_cor = bw.Correl.OrdinaryLeastSquares(m2_ne['WS50m_m/s'], data['Spd80mN'], averaging_prd='1M',
+                                                 coverage_threshold=0.95, forced_intercept_origin=True)
        # Correlate by directional sector, using 36 sectors.
        ols_cor = bw.Correl.OrdinaryLeastSquares(m2_ne['WS50m_m/s'], data['Spd80mN'],
                                                 ref_dir=m2_ne['WD50m_deg'], averaging_prd='1D',
                                                 coverage_threshold=0.9, sectors=36)
+        # Correlate by directional sector forcing the intercept through the origin.
+        ols_cor = bw.Correl.OrdinaryLeastSquares(m2_ne['WS50m_m/s'], data['Spd80mN'],
+                                                 ref_dir=m2_ne['WD50m_deg'], averaging_prd='1H',
+                                                 coverage_threshold=0.9, forced_intercept_origin=True)
    """
    def __init__(self, ref_spd, target_spd, averaging_prd, coverage_threshold=0.9, ref_dir=None, sectors=12,
-                 direction_bin_array=None, ref_aggregation_method='mean', target_aggregation_method='mean'):
+                 direction_bin_array=None, ref_aggregation_method='mean', target_aggregation_method='mean',
+                 forced_intercept_origin=False):
        CorrelBase.__init__(self, ref_spd, target_spd, averaging_prd, coverage_threshold, ref_dir=ref_dir,
                            sectors=sectors, direction_bin_array=direction_bin_array,
                            ref_aggregation_method=ref_aggregation_method,
-                            target_aggregation_method=target_aggregation_method)
+                            target_aggregation_method=target_aggregation_method,
+                            forced_intercept_origin=forced_intercept_origin)

    def __repr__(self):
        return 'Ordinary Least Squares Model ' + str(self.params)

    @staticmethod
-    def _leastsquare(ref_spd, target_spd):
-        p, res = lstsq(np.nan_to_num(ref_spd.values.flatten()[:, np.newaxis] ** [1, 0]),
-                       np.nan_to_num(target_spd.values.flatten()))[0:2]
-        return p[0], p[1]
+    def _leastsquare(ref_spd, target_spd, forced_intercept_origin=False):
+        if forced_intercept_origin:
+            x = np.nan_to_num(ref_spd.values.flatten()[:, np.newaxis])
+            y = np.nan_to_num(target_spd.values.flatten())
+            p, res = lstsq(x, y)[0:2]
+            return p[0], 0
+        else:
+            p, res = lstsq(np.nan_to_num(ref_spd.values.flatten()[:, np.newaxis] ** [1, 0]),
+                           np.nan_to_num(target_spd.values.flatten()))[0:2]
+            return p[0], p[1]
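For the through-origin branch, the one-column design matrix reduces least squares to the closed form slope = sum(x*y) / sum(x**2); a small standalone check of that equivalence (illustrative data, using `scipy.linalg.lstsq`, which matches the call signature used here):

import numpy as np
from scipy.linalg import lstsq

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.0])
slope_lstsq = lstsq(x[:, np.newaxis], y)[0][0]  # one-column design matrix, no intercept term
slope_closed = (x * y).sum() / (x ** 2).sum()   # closed-form through-origin slope
print(np.isclose(slope_lstsq, slope_closed))    # True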

    def run(self, show_params=True):
        if self.ref_dir is None:

            slope, offset = self._leastsquare(ref_spd=self.data[self._ref_spd_col_name],
-                                              target_spd=self.data[self._tar_spd_col_name])
+                                              target_spd=self.data[self._tar_spd_col_name],
+                                              forced_intercept_origin=self.forced_intercept_origin)

            self.params = dict([('slope', slope), ('offset', offset)])
            self.params['r2'] = self._get_r2(target_spd=self.data[self._tar_spd_col_name],
-                                             predict_spd=self._predict(ref_spd=self.data[self._ref_spd_col_name]))
+                                             predict_spd=self._predict(ref_spd=self.data[self._ref_spd_col_name]),
+                                             forced_intercept_origin=self.forced_intercept_origin)

            self.params['num_data_points'] = self.num_data_pts
        elif type(self.ref_dir) is pd.Series:
            self.params = []
@@ -435,11 +466,13 @@ def run(self, show_params=True):
                # print('Processing sector:', sector)
                if len(group) > 1:
                    slope, offset = self._leastsquare(ref_spd=group[self._ref_spd_col_name],
-                                                      target_spd=group[self._tar_spd_col_name])
+                                                      target_spd=group[self._tar_spd_col_name],
+                                                      forced_intercept_origin=self.forced_intercept_origin)
                    predict_ref_spd_sector = self._predict(ref_spd=group[self._ref_spd_col_name],
                                                           slope=slope, offset=offset)
                    r2 = self._get_r2(target_spd=group[self._tar_spd_col_name],
-                                      predict_spd=predict_ref_spd_sector)
+                                      predict_spd=predict_ref_spd_sector,
+                                      forced_intercept_origin=self.forced_intercept_origin)
                else:
                    slope = np.nan
                    offset = np.nan
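When a reference direction series is supplied, the model is thus refitted once per direction sector, with NaN parameters when a sector holds too few points. A simplified standalone sketch of that loop (the sector-binning details here are assumptions, not the library's exact code):

import numpy as np
import pandas as pd

def fit_per_sector(ref_spd, target_spd, ref_dir, sectors=12):
    width = 360 / sectors
    sector_idx = (ref_dir % 360 // width).astype(int)  # assign each record to a direction sector
    params = []
    df = pd.DataFrame({'ref': ref_spd, 'tar': target_spd})
    for sector, group in df.groupby(sector_idx):
        if len(group) > 1:
            slope, offset = np.polyfit(group['ref'], group['tar'], 1)
        else:
            slope, offset = np.nan, np.nan             # too few points to fit this sector
        params.append({'sector': sector, 'slope': slope, 'offset': offset})
    return params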
@@ -576,7 +609,8 @@ def run(self, show_params=True):
        output = model.run()
        self.params = dict([('slope', output.beta[0]), ('offset', output.beta[1])])
        self.params['r2'] = self._get_r2(target_spd=self.data[self._tar_spd_col_name],
-                                         predict_spd=self._predict(ref_spd=self.data[self._ref_spd_col_name]))
+                                         predict_spd=self._predict(ref_spd=self.data[self._ref_spd_col_name]),
+                                         forced_intercept_origin=False)
        self.params['num_data_points'] = self.num_data_pts
        # print("Model output:", output.pprint())
        if show_params:
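This last hunk appears to belong to the orthogonal least squares model, where `output.beta` is the coefficient vector returned by a scipy.odr fit (and `forced_intercept_origin=False` is passed because that model always fits an intercept). A minimal standalone sketch of the scipy.odr API for a linear model, with illustrative data:

import numpy as np
from scipy import odr

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.2, 3.8, 6.1, 8.3])
linear = odr.Model(lambda beta, x: beta[0] * x + beta[1])  # beta[0] = slope, beta[1] = offset
output = odr.ODR(odr.RealData(x, y), linear, beta0=[1.0, 0.0]).run()
print(output.beta[0], output.beta[1])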
Expand Down
