In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import sklearn
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


from scipy.spatial.distance import pdist, cdist
from scipy.spatial.distance import squareform

import folium
import os
import json
from joblib import dump, load

### Functions

In [3]:
def svr_grid_results(cv_results, param_list = ['C', 'gamma'], drop_split_scores = False):
    """
    Create a readable dataframe from grid search results.
    
    The ``params`` column is dropped, the per-split score columns are optionally
    dropped, and the column of every parameter in ``param_list`` is renamed to
    the bare parameter name.
    
    Parameters:
    -----------
    
    cv_results: Scikit-Learn GridSearchCV.cv_results_ attribute
        (anything accepted by ``pd.DataFrame``; must contain a ``params`` column)
    
    param_list: List of parameters optimized with grid search
        A name is matched against the column ``param_<name>`` or, for a parameter
        of a pipeline step, ``param_<step>__<name>``. If no such column exists the
        first column whose name contains ``<name>`` is used.
    
    drop_split_scores: Boolean to decide whether to drop split scores or not
    
    Returns:
    --------
    
    pandas.DataFrame with one row per parameter combination
    
    Raises:
    -------
    
    KeyError if a parameter of param_list does not match any column
    
    """
    df = pd.DataFrame(cv_results)
    
    # Dropping split scores. Only columns named like 'split0_test_score' are
    # removed: a plain substring test would also remove parameter columns such
    # as 'param_min_samples_split'.
    if drop_split_scores:
        is_split_col = df.columns.str.match(r'split\d+_')
        df = df.drop(columns = df.columns[is_split_col])
    
    # Dropping params column
    df = df.drop(columns = ['params'])
    
    rename_map = {}
    for param in param_list:
        # Preferred: the exact parameter column (optionally behind a pipeline step prefix)
        strict = [col for col in df.columns
                  if col == 'param_' + param
                  or (col.startswith('param_') and col.endswith('__' + param))]
        
        if strict:
            rename_map[strict[0]] = param
            continue
        
        # Fallback: loose match on the column name
        loose = df.columns[df.columns.str.contains(param)]
        if len(loose) == 0:
            raise KeyError('No column found for parameter {!r}'.format(param))
        rename_map[loose[0]] = param
    
    return df.rename(columns = rename_map)


def text_block(ax, x_start, y_start, main_text, text_list,
               main_line_spacing = -0.05, sub_line_spacing = -0.04, indentation = 0.1, fontsize = 12):
    
    """
    Write a heading followed by an indented list of lines inside an axes object.
    
    Example of a text block:
    
    Features:
        F1
        F2
    
    is produced with main_text = 'Features:' and text_list = ['F1', 'F2'].
    
    The heading is written one main_line_spacing below y_start. The list starts
    one more main_line_spacing below the heading, shifted right by indentation,
    and every list entry moves the cursor by sub_line_spacing.
    
    Returns the tuple (x_start, y) where y is the cursor position after the last
    entry, so the result can be passed as the start of the next block.
    """
    # Heading
    cursor_y = y_start + main_line_spacing
    ax.text(x_start, cursor_y, main_text, fontsize = fontsize)
    
    # Indented entries, one per line
    cursor_y += main_line_spacing
    indented_x = x_start + indentation
    
    for entry in text_list:
        ax.text(indented_x, cursor_y, entry, fontsize = fontsize)
        cursor_y += sub_line_spacing
    
    # x goes back to the unindented start
    return (x_start, cursor_y)


def wilmott_index(y_measured, y_hat, c = 2):
    '''
    Compute the refined wilmott index
    
    With A = sum(|predictions - measurements|) and
    B = c * sum(|measurements - mean(measurements)|) the index is
    
        1 - A/B    if A <= B
        B/A - 1    otherwise
    
    so the result lies between -1 and 1, and 1 means a perfect fit.
    
    Parameters:
    -----------
    
    y_measured: array-like
        Measured values
    
    y_hat: array-like
        Predicted values, same length as y_measured
    
    c: number
        Scaling factor of the deviation term. Default is 2
    
    Returns:
    --------
    
    float
    '''
    # Accept lists / Series as well as arrays
    y_measured = np.asarray(y_measured, dtype = float)
    y_hat = np.asarray(y_hat, dtype = float)
    
    y_avg = y_measured.mean()
    
    # part1 = sum( | predictions - measurements| )
    part1 = np.sum(np.abs(y_hat - y_measured))
    
    # part2 = c * sum( |measurements - meas_avg| )
    part2 = c * np.sum(np.abs(y_measured - y_avg))
    
    if part1 <= part2:
        # Constant measurements predicted without error: 0/0 would give NaN,
        # but the fit is perfect
        if part2 == 0:
            return 1.0
        return 1 - part1/part2
    
    return part2/part1 - 1

### `MyModel` class

In [5]:
class MyModel:
    '''
    This class is used to automate some of the repetitive tasks in model
    creation and optimization.
    
    How to use MyModel class prior to model creation:
        1. Define model_name, feature_combo, and cluster_name (if studying several clusters)
        2. Create MyModel object
        3. Load train and test data using relevant methods
        4. Create and optimize model
        5. Add the model to MyModel object using m.add_model_(model, train_time)
        6. Run m.get_scores(predict = True) to get model scores/errors and save predictions to attributes
        7. Create plots:
            * m.plot_predictions(): Plots predictions against measurements
            * m.plot_station_series(): Plots a time series of predictions against measurements
                                       of a certain station
        8. Run m.save_model_() to save model and predictions.
        9. Run m.save_scores_() to save model scores in the global CSV file
           (m.save() runs both of these and also saves the model parameters)
    
    How to use MyModel class after saving the model:
        1. Define `model_name`, `feature_combo`, and `cluster_name` (if available)
        2. Create MyModel object, model will be automatically loaded if found in the relative directory
        3. Load train and test data using relevant methods
        4. Run m.get_scores(predict = False) to get model scores/errors
        5. Create plots
        
    Parameters:
    -----------
    
    feature_combo: int
        Number of the feature combination
    
    model_name: str
        Model name
        
    regressor:
        Regressor object
        
    model_params: dict
        Regressor object init parameters
        
    fit_params: dict
        Regressor fit parameters
    
    target:
        Target variable. Default is 'ET0'
    
    save_path:
        Path to the directory where the model and predictions are saved.
        Default is '' (model is in the same directory)
                
    data_path:
        Path to the directory where the train and test datasets are saved
    
    cluster_name:
        Name of the cluster if available
    
    score_fun_dict:
        A dictionary of metric functions to be used for evaluating the model
    
    
    Methods:
    --------
    
    m.get_features():
        Get the feature column names and their full names for a feature combination.
        Used internally to create the attributes m.features and m.feature_names
                           
    m.get_train_test():
        Get training dataset from the defined data_path.
        Returns the tuple (train, test)
                        
    m.get_Xy():
        Returns (X_train, X_test, y_train, y_test)
        Assigns them to attributes
        Assigns dataframes of train/test station numbers with measurement dates to attributes
        (m.train_stations, m.test_stations)
                
    m.add_model_():
        Assign the model and its training time to the attributes m.model and m.train_time
    
    m.load_model_():
        Load model from the defined model_path and assign it to attribute m.model
        Also loads predictions from the pred_path and assigns them to attributes
                    
    m.save_model_():
        Save the model to the defined model_path and the predictions to pred_path
    
    m.get_scores():
        Predict y_train and y_test then compute evaluation metrics.
        Predictions and metrics are assigned to attributes
        
    m.get_station_scores():
        Get scores per weather station.
                    
    m.plot_predictions():
        Plots predictions against measurements
    
    m.plot_station_series():
        Plots a time series of predictions against measurements of a certain station
                             
    m.save_scores_():
        Save model evaluation metrics to a CSV file. This CSV file
        contains the scores of all models of the same algorithm.
        Scores per weather station are saved to a second CSV file.
                     
    Attributes:
    -----------
    
    m.model_name
    
    m.feature_dict:
        Internally defined dictionary of feature column names vs extended names
    
    m.features:
        List of feature column names used in the model
    
    m.feature_names:
        List of extended feature names used in the model
    
    m.train_path:
        Path to training dataset
    
    m.test_path
        Path to test dataset
        
    m.regressor
        Regressor object
        
    m.model_params
        Regressor object init parameters
        
    m.fit_params
        Regressor fit parameters
    
    m.target:
        Name of target variable
    
    m.save_path:
        Path to the directory to save the model, predictions, parameters, and scores to.
        
    m.model_path:
        Path to model file
    
    m.pred_path:
        Path to the .npz file containing predictions
    
    m.X_train, m.X_test, m.y_train, m.y_test:
        Assigned using m.get_Xy()
    
    m.test_stations, m.train_stations:
        Dataframe containing station numbers and dates assigned using m.get_Xy()
                                       
    m.train_time
    
    
    
    '''
    def __init__(self,
                 model_name,
                 cluster_name = None,
                 feature_combo = None,
                 regressor=None,
                 model_params=None,
                 fit_params=None,
                 save_path = '',
                 data_path = r"C:\Users\HP\Desktop\Git\evapotranspiration\processed_data",
                 target = 'ET0',
                 score_fun_dict = {
                       'MAE': sklearn.metrics.mean_absolute_error,
                       'RMSE': lambda x, y: np.sqrt(sklearn.metrics.mean_squared_error(x, y)),
                       'R2': sklearn.metrics.r2_score,
                       'WI': wilmott_index
                   }):
        '''
        Store the configuration, build the file paths and try to load a saved model.
        
        If a model file exists at model_path it is loaded through load_model_(), which
        also sets the features, and the constructor returns. Otherwise the features are
        read with get_features(feature_combo) and the object is created without a model.
        '''
        
        # Model identity and regressor configuration
        self.model_name = model_name
        self.cluster_name = cluster_name
        self.feature_combo = feature_combo
        self.target = target
        self.score_fun_dict = score_fun_dict
        
        self.regressor = regressor
        self.model_params = model_params
        self.fit_params = fit_params
        
        # Where the model and its predictions are stored
        self.save_path = save_path
        self.model_path = os.path.join(save_path, model_name + '.joblib')
        self.pred_path = os.path.join(save_path, model_name + '_pred.npz')
        
        # Using local paths, Github requires opening a session and entering user/pass.
        # Cluster data is expected in a folder named after the cluster, with the
        # cluster name prefixed to the file names.
        if cluster_name is None:
            data_dir = data_path
            file_prefix = ''
        else:
            data_dir = os.path.join(data_path, cluster_name)
            file_prefix = cluster_name + '_'
        
        self.train_path = os.path.join(data_dir, file_prefix + 'train.csv')
        self.test_path = os.path.join(data_dir, file_prefix + 'test.csv')
        
        # A saved model takes precedence over creating an empty object
        if self.load_model_():
            print('{} model loaded from {}'.format(model_name, save_path))
            return
        
        # No saved model: only the feature lists can be prepared
        self.features, self.feature_names = self.get_features(feature_combo)
        
        print('Model not found at {}'.format(self.model_path))
        print('MyModel object created without model')
        
        
          
    def get_features(self, feature_combo,
                     path_to_feature_combos_dict = r"C:\Users\HP\Desktop\Git\evapotranspiration\part_3"):
        '''
        Get full features from the JSON feature dictionary based on the combination number
        
        The file feature_combos.json is read from path_to_feature_combos_dict. It is
        expected to hold a dictionary whose keys are combination numbers (as strings)
        and whose values are lists of feature column names.
        
        Parameters:
        -----------
        
        feature_combo: int or str
            Number of the feature combination. It is converted to str before the lookup.
        
        path_to_feature_combos_dict: str
            Directory containing feature_combos.json. The default is the directory
            that used to be hard-coded in this method.
        
        Returns:
        --------
        
        (features, feature_names)
            features: list of feature column names of the combination
            feature_names: list of the extended names of these features. A feature
                           without an extended name is listed under its column name.
        
        Raises:
        -------
        
        AssertionError if the JSON file is not found or the combination is not defined in it
        '''
        
        # Column name -> extended (display) name
        feature_dict =  {'latitude': 'Latitude',
                         'longitude': 'Longitude',
                         'elevation': 'Elevation',
                         'max_temp': 'Maximum temperature',
                         'min_temp': 'Minimum temperature',
                         'avg_temp': 'Average Temperature',
                         'avg_ws': 'Average wind speed',
                         'max_hum': 'Maximum humidity',
                         'min_hum': 'Minimum humidity',
                         'avg_hum': 'Average humidity',
                         'Rs': 'Solar radiation',
                         'inc_rad': 'Solar radiation',
                         'Ra': 'Extraterrestrial radiation',
                         'Rn': 'Net radiation',
                         'ET0': 'Reference evapotranspiration',
                         'month': 'Month'}
        
        # Getting features from the feature combos dictionary
        file_path = os.path.join(path_to_feature_combos_dict, 'feature_combos.json')
        
        assert os.path.isfile(file_path) == True,\
        'feature_combos dictionary not found at {}'.format(path_to_feature_combos_dict)
        
        with open(file_path, 'r') as file:
            feature_combos = json.load(file)
        
        # JSON keys are always strings
        feature_combo = str(feature_combo)
        
        assert feature_combo in feature_combos.keys(),\
        'Feature combination {} is not defined in the feature_combos dictionary'.format(feature_combo)
        
        features = feature_combos[feature_combo]
        
        # Fall back to the column name so that a feature missing from
        # feature_dict does not make the whole model unusable
        feature_names = [feature_dict.get(feature, feature) for feature in features]
        
        return features, feature_names
    
    
    
    def get_train_test(self, how = 'selected_features'):
        """
        Get training and test datasets from the defined data_path.
        Returns the tuple (train, test)
        
        Both files are read with the second column (index 1) parsed as dates.
        
        Parameters:
        -----------
        
        how:
            'all' return the datasets with all features
            'selected_features' return the datasets with the columns
            'st_num', 'date', the selected features and the target only
        
        Raises:
        -------
        
        ValueError if how is not one of the two values above
             
        """
        # Fail before reading the files instead of silently returning None
        if how not in ('all', 'selected_features'):
            raise ValueError("how must be 'all' or 'selected_features', got {!r}".format(how))
        
        train = pd.read_csv(self.train_path, parse_dates = [1])
        test = pd.read_csv(self.test_path, parse_dates = [1])
        
        if how == 'all':
            return (train, test)
        
        cols = ['st_num', 'date'] + list(self.features) + [self.target]
        return (train[cols], test[cols])
    
    

    def get_Xy(self):
        """
        Build the feature matrices and target vectors of the train and test datasets.
        
        Returns (X_train, X_test, y_train, y_test) as numpy arrays.
        
        The arrays are assigned to the attributes of the same names only when the
        object does not have an X_train attribute yet. The station number / date
        dataframes (m.train_stations, m.test_stations) are assigned on every call.
        """
        
        train, test = self.get_train_test(how = 'selected_features')
        
        # Fewer than two features: force a single-column 2D matrix
        needs_reshape = len(self.features) < 2
        
        matrices = []
        for frame in (train, test):
            X = frame[self.features].to_numpy()
            if needs_reshape:
                X = X.reshape(-1, 1)
            matrices.append(X)
        X_train, X_test = matrices
        
        y_train = train[self.target].to_numpy()
        y_test = test[self.target].to_numpy()
        
        # Arrays already stored on the object are left untouched
        if not hasattr(self, 'X_train'):
            self.X_train, self.X_test = X_train, X_test
            self.y_train, self.y_test = y_train, y_test
        
        # Station numbers with their measurement dates
        station_cols = ['st_num', 'date']
        self.train_stations = train[station_cols]
        self.test_stations = test[station_cols]
        
        return (X_train, X_test, y_train, y_test)
    
    
    
    def add_model_(self, model, train_time):
        '''
        Attach a model to the object.
        
        Stores model in m.model and train_time in m.train_time.
        '''
        self.train_time = train_time
        self.model = model
    
    
    
    def fit_model(self, print_vals=True):
        '''
        Initialize the regressor and fit it to training data using the defined model_params and fit_params
        
        The fitted model is attached with add_model_() and then scored with
        get_scores(predict=True). The fitted model is expected to expose a
        train_time attribute, which is stored as m.train_time.
        
        Parameters:
        ------------
        
        print_vals: Bool
            Whether to print the model scores or not
        
        Raises:
        -------
        
        ValueError if the object was created without a regressor
            
        '''
        if self.regressor is None:
            raise ValueError('No regressor defined, pass regressor when creating the MyModel object')
        
        # model_params / fit_params default to None, which cannot be unpacked with **
        model_params = self.model_params if self.model_params is not None else {}
        fit_params = self.fit_params if self.fit_params is not None else {}
        
        X_train, _, y_train, _ = self.get_Xy()
        
        model = self.regressor(**model_params)
        
        model.fit(X_train, y_train, **fit_params)
        
        self.add_model_(model, model.train_time)
        
        self.get_scores(predict=True, print_vals=print_vals)
        
        
        
    def load_model_(self):
        '''
        Load model and predictions from the defined model_path and assign it to attribute m.model
        and returns True.
        If the model is not available at the defined path, returns False.
        
        Predictions (y_hat_train, y_hat_test) and the feature combination number are
        read from pred_path. If that file does not exist only the model is loaded:
        the feature combination given at creation is kept and predictions have to be
        computed with get_scores(predict = True).
        
        The features and the train/test data attributes are set in both cases.
        '''
        if not os.path.exists(self.model_path):
            return False
        
        # NOTE(review): joblib.load unpickles the file, only load files you trust
        self.model = load(self.model_path)
        
        if os.path.exists(self.pred_path):
            # The context manager closes the .npz archive once the arrays are read
            with np.load(self.pred_path) as preds:
                for pred in ['y_hat_train', 'y_hat_test', 'feature_combo']:
                    setattr (self, pred, preds[pred])
            
            # Feature combo was saved as an array
            self.feature_combo = self.feature_combo[0]
        else:
            # save_model_ writes the model before the predictions, so a model
            # file without a predictions file is a state that can occur
            print('Predictions not found at {}, use get_scores(predict = True) to compute them'.format(self.pred_path))
        
        # Getting the list of features to be used
        self.features, self.feature_names = self.get_features(self.feature_combo)
        
        # Adding X_train, y_train, X_test, y_test, train_stations, test_stations attributes
        _ = self.get_Xy()
        
        return True
        
        

    def get_scores(self,
                   print_vals = True,
                   predict = False):
        
        '''
        Compute the evaluation metrics of the model on the train and test sets.
        
        Every function of m.score_fun_dict is called as fun(y, y_hat). Results are
        stored in:
        
        self.train_scores: Dictionary of training eval. metrics
        self.test_scores: Dictionary of test eval. metrics
        
        When predict is True the predictions are computed first and stored in:
        
        self.y_hat_train: Training set predictions
        self.y_hat_test: Test set predictions
        
        Parameters:
        -----------
        
        print_vals: Print evaluation metrics if True
        
        predict: Predict from data if True
                 Use saved predictions if False (they must already be attributes)
        '''
        
        if predict is True:
            self.y_hat_test = self.model.predict(self.X_test)
            self.y_hat_train = self.model.predict(self.X_train)
        else:
            assert (hasattr(self, 'y_hat_train') and hasattr(self, 'y_hat_test')),\
            'Load predictions or set predict = True to predict target variable' 
        
        train_scores = {}
        test_scores = {}
        
        # (destination, measurements, predictions) of each dataset
        targets = (
            (train_scores, self.y_train, self.y_hat_train),
            (test_scores, self.y_test, self.y_hat_test),
        )
        
        for metric, fun in self.score_fun_dict.items():
            for scores, y_true, y_pred in targets:
                scores[metric] = fun(y_true, y_pred)
        
        # Printing scores, sections are separated by a blank block
        if print_vals is True:
            sections = [('Train scores:', train_scores), ('Test scores:', test_scores)]
            for position, (header, scores) in enumerate(sections):
                if position > 0:
                    print('\n')
                print(header)
                for metric, val in scores.items():
                    print('{} = {:.3f}'.format(metric, val))
        
        self.train_scores = train_scores
        self.test_scores = test_scores
        
        
        
    def get_station_scores(self):
        '''
        Get scores for every station to compare the performance of the model in different stations.
        
        Every function of m.score_fun_dict is evaluated on the measurements and
        predictions of each station, separately for the test and the train dataset.
        
        Returns:
        --------
        
        pandas.DataFrame with the columns
        algorithm, feature_combo, name, st_num, dataset ('test' or 'train')
        followed by one column per metric. Test stations come first. A station
        found in both datasets gets one row per dataset.
        '''
        
        rows = {
            'st_num': [],
            'dataset': []
        }
        
        # Creating a dictionary of score_metric: empty list
        rows.update({metric: [] for metric in self.score_fun_dict.keys()})
        
        datasets = (
            ('test', self.test_stations, self.y_test, self.y_hat_test),
            ('train', self.train_stations, self.y_train, self.y_hat_train),
        )
        
        for dataset, stations, y_all, y_hat_all in datasets:
            # Extracted once per dataset instead of once per station
            st_nums = stations['st_num'].to_numpy()
            
            for st in stations['st_num'].unique():
                # Boolean mask selects rows by position, whatever the dataframe index is
                mask = st_nums == st
                y = y_all[mask]
                y_hat = y_hat_all[mask]
                
                rows['st_num'].append(st)
                rows['dataset'].append(dataset)
                
                # Computing score metric values
                for metric, fun in self.score_fun_dict.items():
                    rows[metric].append(fun(y, y_hat))
        
        df = pd.DataFrame(rows)
        
        # Adding algorithm name and model name
        df.insert(0, 'name', self.model_name)
        df.insert(0, 'feature_combo', self.feature_combo)
        df.insert(0, 'algorithm', self.model.algorithm)
        
        return df
    
    
    
    def plot_map(self,
                 path_to_st_def = r"C:\Users\HP\Desktop\Git\evapotranspiration\processed_data\station_definitions.csv",
                 marker_radius = 3,
                 error_metric = 'RMSE',
                 colors = ['black', 'blue'],
                 location = [37.5, 28.5],
                 zoom = 8):
        '''
        Plot stations in data with pop-ups for st_num and scores.
        
        The map is displayed and also returned.
        
        Parameters:
        -----------
        
        path_to_st_def: str
            Path to the file containing station definitions (station_definitions.csv).
            The columns st_num, latitude and longitude are read from it.
        
        marker_radius: int
            Radius of the folium marker.
        
        error_metric: str ('RMSE' or 'MAE')
            Error metric to use for scaling marker sizes. Larger markers have larger errors:
            the radius goes from 0.5 to 2 times marker_radius between the smallest and the
            largest error. If all stations have the same error, or if None, all markers
            will have the same size.
        
        colors: list
            List of color names to define the color of markers.
            First color is for training stations and second is for test stations.
        
        location: list
            Location of the center of the map to pass to folium.Map()
        
        zoom: int
            Initial zoom of map to pass to folium.Map()
        
        Returns:
        --------
        
        folium.Map
        '''
        
        
        tiles = 'https://stamen-tiles-{s}.a.ssl.fastly.net/terrain-background/{z}/{x}/{y}{r}.png'
        attr = 'Map tiles by <a href="http://stamen.com">Stamen Design</a>, <a href="http://creativecommons.org/licenses/by/3.0">CC BY 3.0</a> &mdash; Map data &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors'

        # Loading station definitions
        st_def = pd.read_csv(path_to_st_def)[['st_num', 'latitude', 'longitude']]
        
        # Merging station definitions with station scores
        df = self.get_station_scores()
        df = df.merge(st_def, on = 'st_num')
        
        
        m = folium.Map(location = location, zoom_start = zoom, tiles = tiles, attr = attr)


        df['color'] = df['dataset'].replace({'train': colors[0], 'test': colors[1]})
        
        # Same size for all markers unless the errors can be normalized
        df['radius'] = marker_radius
        
        if error_metric is not None:
            max_error = df[error_metric].max()
            min_error = df[error_metric].min()
            error_range = max_error - min_error
            
            # A zero (or undefined) range would divide by zero and give NaN radii
            if error_range > 0:
                normalized = (df[error_metric] - min_error)/error_range
                df['radius'] = (0.5 + 1.5*normalized) * marker_radius
            
        for _, row in df.iterrows():
            
            # Location of station
            loc = [row['latitude'], row['longitude']]
            
            # Text in pop-up: station number then every metric of the model
            txt = '<h5><b>Station: {}</b></h5>'.format(row['st_num'])
            for metric in self.score_fun_dict.keys():
                txt += '{}: {:.3f}<br>'.format(metric, row[metric])
                
            popup = folium.Popup(txt, min_width = 100, max_width = 100)
            folium.CircleMarker(location = loc,
                                fill = True,
                                fill_opacity = 0.8,
                                opacity = 1,
                                color = row['color'],
                                radius = float(row['radius'])
                                ).add_child(popup).add_to(m)

        # display is provided by IPython in a notebook session
        display(m)
        
        return m

  
        
    
    def plot_predictions(self, title, alpha = 0.8, show_scores = True, dpi = 70):
    
        """
        Scatter plot of the test set predictions against the measured values,
        with a dashed reference line where both are equal.

        Scores are computed from the stored predictions if they are not
        available as attributes yet.

        Parameters:
        -----------

        title: Title of the plot

        alpha: Opacity of scatter plot points

        show_scores: Writes model properties and evaluation metrics on the right
                     side of the graph

        dpi: Resolution passed to plt.subplots

        Returns the axes object holding the scatter plot.
        """
        assert hasattr(self, 'model'), 'Model not loaded, add or load model first'
        
        scores_ready = hasattr(self, 'train_scores') and hasattr(self, 'test_scores')
        if not scores_ready:
            self.get_scores(print_vals = False)
        

        if show_scores is True:
            # Wide panel for the plot, narrow panel for the text
            fig, (ax, ax1) = plt.subplots(1, 2, figsize = (12, 8),
                                          gridspec_kw={'width_ratios': [3, 1]}, dpi = dpi, tight_layout = True)
            
            eval_list_train = ['{}:  {:.3f}'.format(metric, val)
                               for metric, val in self.train_scores.items()]
            eval_list_test = ['{}:  {:.3f}'.format(metric, val)
                              for metric, val in self.test_scores.items()]

            # Text blocks are stacked top-down, each one starts where the previous ended
            blocks = [
                ('Model:', [self.model_name]),
                ('Features:', self.feature_names),
                ('Train:', eval_list_train),
                ('Test:', eval_list_test),
            ]
            
            x_start, y_start = 0.05, 1
            for heading, lines in blocks:
                x_start, y_start = text_block(ax1, x_start, y_start, heading, lines)

            ax1.axis('off')
        else:
            fig, ax = plt.subplots(figsize = (8, 8), dpi = dpi, tight_layout = True)


        ax.set_title(title, fontsize = 14)

        # Measured vs predicted points
        ax.plot(self.y_test, self.y_hat_test, 'x', color = 'blue', alpha = alpha)

        # Reference line spanning slightly more than the range of the data
        lowest = min(self.y_test.min(), self.y_hat_test.min()) * 0.9
        highest = max(self.y_test.max(), self.y_hat_test.max()) * 1.03
        ref_line = [lowest, highest]
        
        ax.plot(ref_line, ref_line, '--', color = 'black', linewidth = 3)

        ax.set_xlabel('Measured Values', fontsize = 12)
        ax.set_ylabel('Predicted Values', fontsize = 12)
        ax.axis('equal')

        return ax
    
    
    
    def plot_station_series(self, st_num = None, which = 'test', show_scores = True, dpi = 70):
        
        '''
        Plot a time series of a certain station.
        
        Measurements and predictions of the station are plotted against the dates.
        Training rows of the station are ordered by date before plotting, test rows
        are plotted in the order of the test dataset.
        
        Parameters:
        -----------
        
        st_num: int
            The number of the station to plot. If None, a random station is plotted.
            
        which: str 'test' or 'train'
            The dataset from which the station is chosen, train or test.
            Any value other than 'test' selects the train dataset.
        
        show_scores: bool
            Determines whether to display the scores of the plotted station or not.
            The scores are computed with the functions of m.score_fun_dict on the
            plotted measurements and predictions.
        
        dpi: int
            Resolution passed to plt.subplots
        
        Returns the axes object holding the time series.
        '''
        # Selecting the dataset based on 'which' parameter (test or train)
        if which == 'test':
            dataset = 'test'
            station_df = self.test_stations
            y_all = self.y_test
            y_hat_all = self.y_hat_test
        else:
            dataset = 'train'
            station_df = self.train_stations
            y_all = self.y_train
            y_hat_all = self.y_hat_train

        stations_available = station_df['st_num'].unique()
        
        # Get a random station if station number is not defined
        if st_num is None:
            st_to_plot = np.random.choice(stations_available)
        
        else:
            assert st_num in stations_available, 'Station {} not found in {} stations'.format(st_num, which) 
            st_to_plot = st_num

        # Row positions of the station. Positions (not index labels) are used for
        # the dataframe and the arrays alike, so the three always stay aligned.
        positions = np.flatnonzero((station_df['st_num'] == st_to_plot).to_numpy())
        
        if dataset == 'train':
            # Resorting training data (Previously shuffled for cross-validation)
            dates = station_df['date'].to_numpy()[positions]
            positions = positions[np.argsort(dates, kind = 'stable')]
        
        x = station_df['date'].iloc[positions].to_list()

        y_measured = y_all[positions]
        y_hat = y_hat_all[positions]

        if show_scores == True:
            fig, axs = plt.subplots(1, 2,
                                   figsize = (21, 6),
                                   gridspec_kw={'width_ratios': [5, 1]},
                                   tight_layout = True,
                                   dpi = dpi)

            ax = axs[0]
            
            ax1 = axs[1]
            
            # Scores of the plotted station only, no need to score every station
            eval_list = []
            for metric, fun in self.score_fun_dict.items():
                val = fun(y_measured, y_hat)
                eval_list.append('{}:  {:.3f}'.format(metric, val))
            
            x_start = 0.05
            y_start = 1

            x_start, y_start = text_block(ax1, x_start, y_start, 'Dataset:', [dataset])

            x_start, y_start = text_block(ax1, x_start, y_start, 'Scores:', eval_list)
            
            # Turning axis off
            ax1.axis('off')
            
        else:
            fig, ax = plt.subplots(figsize = (16, 5),
                                   tight_layout = True,
                                   dpi = dpi)
            
        ax.set_title('ET0 Measurements vs Predictions / Station: {}'.format(st_to_plot))
            
        ax.plot(x, y_measured, 'green', label = 'Measured ET0', linestyle = '--', linewidth = 2.5)
        ax.plot(x, y_hat, 'skyblue', label = 'Predicted ET0')
        
        ax.set_xlabel('Dates')
        ax.set_ylabel('ET0 (mm/day)')
        ax.legend()
        
        return ax
    
    
    
    def save_params_(self,
                     file_name = 'params.csv'
                   ):
        '''
        Save the parameters of the model to a CSV file.
        The method checks if the defined file_name is available in the defined directory,
        and appends the parameters to that file. Else, a new file is created.
        
        One row is written per model, holding the algorithm, the feature combination,
        the model name and the training time followed by the values of
        model.get_params(). A row of the file with the same algorithm, feature_combo
        and name is replaced by the new one.
        
        
        Parameters:
        -----------
        
        file_name: str
            Name of the CSV file inside m.save_path
        '''
        
        file_path = os.path.join(self.save_path, file_name)
        
        # Creating a dictionary of the new entries
        new_dict = {}

        new_dict['algorithm'] = self.model.algorithm
        new_dict['feature_combo'] = self.feature_combo
        new_dict['name'] = self.model_name
        new_dict['train_time'] = self.model.train_time
        
        # Getting the parameter values from the model object
        new_dict.update(self.model.get_params())
        
        # A list of records keeps tuple / list parameter values in a single cell
        new_row = pd.DataFrame([new_dict])
        
        # Checking if file is available
        if os.path.isfile(file_path):
            # DataFrame.append was removed in pandas 2.0, concat works in all versions
            param_df = pd.concat([pd.read_csv(file_path), new_row], ignore_index = True)
            param_df = param_df.drop_duplicates(subset = ['algorithm', 'feature_combo', 'name'],
                                                keep = 'last')
        else:
            param_df = new_row
        
        param_df.to_csv(file_path, index = False)
        

        
        
    def save_scores_(self,
                     scores_file_name = 'scores.csv',
                     station_scores_file_name = 'station_scores.csv'
                   ):
        
        '''
        Save model evaluation metrics to a CSV file. If the file exists values are appended to it
        and duplicates are dropped. Scores (evaluation metrics) per station are saved to the file defined in
        station_scores_file_name.
        
        Both files are written to m.save_path. m.train_scores and m.test_scores must
        be available (see get_scores).
        
        Parameters:
        -----------
            
        scores_file_name: str
            Name of the file including total scores
            
        station_scores_file_name: str
            Name of the file including scores per station

        '''
        
        def upsert_csv(file_path, new_rows, key_cols):
            # Append new_rows to the CSV (created if missing); for rows sharing
            # the same key_cols only the most recent one is kept
            if os.path.isfile(file_path):
                # DataFrame.append was removed in pandas 2.0, concat works in all versions
                combined = pd.concat([pd.read_csv(file_path), new_rows], ignore_index = True)
                combined = combined.drop_duplicates(subset = key_cols, keep = 'last')
            else:
                combined = new_rows
            
            combined.to_csv(file_path, index = False)
        
        # Columns identifying the model in both files
        model_id = {
            'algorithm': self.model.algorithm,
            'feature_combo': self.feature_combo,
            'name': self.model_name
        }
        
        train_dict = {**model_id, 'train_test': 'train', **self.train_scores}
        test_dict = {**model_id, 'train_test': 'test', **self.test_scores}
        
        # Saving total scores
        upsert_csv(os.path.join(self.save_path, scores_file_name),
                   pd.DataFrame([train_dict, test_dict]),
                   ['algorithm', 'feature_combo', 'name', 'train_test'])
        
        # Saving station scores
        upsert_csv(os.path.join(self.save_path, station_scores_file_name),
                   self.get_station_scores(),
                   ['algorithm', 'feature_combo', 'name', 'st_num'])
            
            
        
    def save_model_(self):
        '''
        Write the model to model_path and the predictions to pred_path.
        
        The model is written first. The predictions file holds y_hat_train,
        y_hat_test and the feature combination number (stored as a one element list).
        '''
        dump(self.model, self.model_path)
        
        # The model is already on disk when this check fails
        assert (hasattr(self, 'y_hat_train') and hasattr(self, 'y_hat_test')),\
            'Predictions not available, only model was saved'
        
        arrays = {name: getattr(self, name) for name in ('y_hat_train', 'y_hat_test')}
        arrays['feature_combo'] = [self.feature_combo]
        
        np.savez(self.pred_path, **arrays)
        
            
    def save(self):
        '''
        Write everything about the model to the defined save path: scores and
        station scores first, then the model with its predictions, then the
        model parameters.
        '''
        
        for save_step in (self.save_scores_, self.save_model_, self.save_params_):
            save_step()
        
        

### PUK Kernel

In [5]:
def puk(x1, x2, sigma, omega):
    '''
    Pearson VII Universal Kernel function
    Based on: https://github.com/rlphilli/sklearn-PUK-kernel/blob/master/PUK_kernel.py
    
    For every pair of rows (a, b) of x1 and x2 the kernel value is
    
        1 / (1 + 4 * d2 * (2**(1/omega) - 1) / sigma**2) ** omega
    
    where d2 is the squared euclidean distance between a and b.
    
    Returns an array with one row per row of x1 and one column per row of x2.
    '''
    
    sq_dist = cdist(x1, x2, 'sqeuclidean')
    
    denominator = (1 + 4 * sq_dist * (2**(1.0/omega) - 1) / sigma**2)**omega
    
    return 1 / denominator


class PUK(BaseEstimator,TransformerMixin):
    '''
    Transformer computing the PUK kernel between the data and the samples seen in fit.
    
    fit stores the samples, transform returns puk(X, stored samples) computed
    with the sigma and omega of the object.
    '''
    def __init__(self, sigma = 1, omega = 10):
        super().__init__()
        self.sigma = sigma
        self.omega = omega

    def fit(self, X, y=None):
        # Only the samples are kept, y is not used
        self.X_train_ = X
        return self

    def transform(self, X):
        return puk(X, self.X_train_, sigma = self.sigma, omega = self.omega)