# ModelEDA class
This class is used to explore model predictions and scores after creating and optimizing models with the `MyModel` class (see that class's documentation).

In [5]:
import json
import os

import folium
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
def wilmott_index(y_measured, y_hat, c = 2):
    '''
    Compute the refined Willmott index of agreement.

    With A = sum(|y_hat - y_measured|) and
    B = c * sum(|y_measured - mean(y_measured)|), the index is

        1 - A / B    if A <= B
        B / A - 1    otherwise


    Parameters:
    -----------

    y_measured: array-like
        Measured (observed) values.

    y_hat: array-like
        Predicted values, same shape as `y_measured`.

    c: number
        Scaling factor applied to the sum of absolute deviations of the
        measurements from their mean. Default is 2.


    Returns:
    --------

    float
        1.0 when the predictions match the measurements exactly, including
        the case of constant measurements, where the plain formula would
        evaluate 0/0. NaN for empty input.
    '''
    # Accept lists/Series as well as arrays; the computation is positional.
    y_measured = np.asarray(y_measured, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)

    if y_measured.size == 0:
        return np.nan

    # A = sum( |predictions - measurements| )
    abs_error = np.sum(np.abs(y_hat - y_measured))

    # Perfect predictions: return 1 directly so that constant measurements
    # (B == 0) do not trigger a 0/0 division.
    if abs_error == 0:
        return 1.0

    # B = c * sum( |measurements - meas_avg| )
    scaled_dev = c * np.sum(np.abs(y_measured - y_measured.mean()))

    if abs_error <= scaled_dev:
        return 1 - abs_error/scaled_dev

    # abs_error > 0 is guaranteed here, so this division is safe.
    return scaled_dev/abs_error - 1

In [8]:
class ModelEDA:

    '''
    This class is used to explore model predictions and scores after creating and optimizing models using MyModel class.


    Parameters:
    -----------

    model_name: str
        Model name as used in MyModel

    algorithm: str
        Name of the algorithm used. The directory which has the models saved in should
        have the same name as `algorithm`.

    model_type: str
        If different sub-categories of the algorithms were used to create the models, this
        should be the name of the sub-category of the algorithm. Default is an empty string (No sub-category).
        Note that the models, predictions, etc. should be stored in a directory that has the same name as `model_type`
        and found inside a directory that has the same name as `algorithm`.

    save_path: str
        Path to the directory containing the algorithm/model_type directories.

    data_path: str
        Path to the directory containing the processed data. This directory should contain the files train.csv, test.csv,
        and station_definitions.csv.

    score_fun_dict: dict or None
        A dictionary of metric functions to be used for evaluating the model.
        When None (default), a fresh dictionary with the keys 'MAE', 'RMSE', 'R2' and 'WI' is created
        for each instance. The dictionary is stored on the instance; no method of this class reads it.


    Attributes:
    -----------

    save_path: str
        `save_path` joined with `algorithm` and `model_type`. All model artefacts
        (predictions, scores.csv, station_scores.csv) are read from here.


    Methods:
    --------

    get_model():
        Return the MyModel object of the defined feature_combo

    get_predictions():
        Return the train or test predictions of the defined feature_combo

    get_train_test():
        Get the measurements of the train or test dataset.
        Returns a dataframe containing the columns: ['st_num', 'date', 'ET0']

    get_station_scores():
        Returns the scores per station.

    get_scores():
        Returns a dataframe containing the scores of the model.

    get_feature_table():
        Returns a table of features from the defined feature_combos.json file.

    compare_combinations():
        Returns a dataframe containing the scores of all feature combinations of the test or train dataset

    plot_combo_scores():
        Plot a bar plot comparing the scores of all feature combinations.

    '''

    def __init__(self,
                 model_name,
                 algorithm,
                 model_type='',
                 save_path = r"C:\Users\HP\Desktop\Git\evapotranspiration\part_3",
                 data_path = r'C:\Users\HP\Desktop\Git\evapotranspiration\processed_data',
                 score_fun_dict = None
                ):

        # Build the default metrics per instance: a dict literal in the
        # signature would be one object shared (and mutable) across instances.
        if score_fun_dict is None:
            score_fun_dict = {
                'MAE': mean_absolute_error,
                'RMSE': lambda x, y: np.sqrt(mean_squared_error(x, y)),
                'R2': r2_score,
                'WI': wilmott_index,
            }

        self.model_name = model_name
        self.algorithm = algorithm
        self.model_type = model_type
        self.save_path = os.path.join(save_path, algorithm, model_type)
        self.data_path = os.path.join(data_path)
        self.score_fun_dict = score_fun_dict

    def _combo_name(self, feature_combo):
        '''
        Return the per-combination model name: "<model_name>_<feature_combo>"
        '''
        return self.model_name + '_' + str(feature_combo)

    @staticmethod
    def _check_train_test(train_test):
        '''
        Assert that `train_test` is either "train" or "test"
        '''
        assert train_test in ['train', 'test'], '"train_test" must be "train" or "test"'

    def get_model(self, feature_combo):

        '''
        Return the MyModel object of the defined feature_combo

        NOTE(review): MyModel is neither defined nor imported in the visible part of
        this file; it has to be in scope wherever this method is called.
        '''
        return MyModel(model_name = self._combo_name(feature_combo),
                       save_path = self.save_path,
                       data_path = self.data_path
                      )

    def get_predictions(self, feature_combo, train_test = 'test'):
        '''

        Return the train or test predictions of the defined feature_combo

        The predictions are read from "<model_name>_<feature_combo>_pred.npz" inside
        `self.save_path`, under the key "y_hat_train" or "y_hat_test".

        Parameters:
        -----------

        feature_combo: int

        train_test: str
            Determines whether to return train or test predictions

        '''

        self._check_train_test(train_test)

        file_name = self._combo_name(feature_combo) + '_pred.npz'
        file_path = os.path.join(self.save_path, file_name)

        # np.load keeps the .npz archive open until closed; the context manager
        # releases the file handle once the requested array has been read.
        with np.load(file_path) as preds:
            y_hat = preds['y_hat_' + train_test]

        return y_hat

    def get_train_test(self, train_test='test'):

        '''
        Get the measurements of the train or test dataset.
        Returns a dataframe containing the columns: ['st_num', 'date', 'ET0']

        Parameters:
        -----------

        train_test: str
            "train" reads train.csv, "test" reads test.csv (both from `self.data_path`)
        '''

        self._check_train_test(train_test)

        file_path = os.path.join(self.data_path, train_test + '.csv')

        # Parse the date column by name so the result does not depend on the
        # column order of the csv file.
        df = pd.read_csv(file_path, parse_dates = ['date'])

        return df[['st_num', 'date', 'ET0']]

    def get_station_scores(self, feature_combo):
        '''
        Returns the scores per station

        Reads station_scores.csv from `self.save_path` and keeps the rows whose
        'name' equals "<model_name>_<feature_combo>" and whose 'feature_combo'
        equals `feature_combo`.
        '''

        file_path = os.path.join(self.save_path, 'station_scores.csv')
        df = pd.read_csv(file_path)

        # Slicing the part with the desired model name and feature combination
        is_model = df['name'] == self._combo_name(feature_combo)
        is_combo = df['feature_combo'] == feature_combo

        return df.loc[is_model & is_combo].reset_index(drop=True)

    def get_scores(self):

        '''
        Returns a dataframe containing the scores of the model

        Reads scores.csv from `self.save_path` and keeps the rows whose
        'algorithm' column equals `self.algorithm`.
        '''

        file_path = os.path.join(self.save_path, 'scores.csv')
        df = pd.read_csv(file_path)

        return df.loc[df['algorithm'] == self.algorithm].reset_index(drop=True)

    def get_feature_table(self,
                          path=r"C:\Users\HP\Desktop\Git\evapotranspiration\part_3\feature_combos.json",
                          long=True):

        '''
        Returns a table of features from the defined feature_combos.json file

        Parameters:
        -----------

        path: str
            Path to a json file mapping each combination to a list of feature names.

        long: bool
            If True, feature names are replaced by their descriptive names
            (names without a known description are kept as they are).
            If False, the names are used as found in the file.

        Returns:
        --------

        A dataframe with one 'Features' column (comma-separated feature names) and an
        index named 'Combo' numbered 1..n in file order. When running inside
        IPython/Jupyter the table is also displayed.
        '''

        with open(path, 'r') as f:
            combos = json.load(f)

        feature_dict =  {'latitude': 'Latitude',
                         'longitude': 'Longitude',
                         'elevation': 'Elevation',
                         'max_temp': 'Maximum temperature',
                         'min_temp': 'Minimum temperature',
                         'avg_temp': 'Average Temperature',
                         'avg_ws': 'Average wind speed',
                         'max_hum': 'Maximum humidity',
                         'min_hum': 'Minimum humidity',
                         'avg_hum': 'Average humidity',
                         'Rs': 'Solar radiation',
                         'inc_rad': 'Solar radiation',
                         'Ra': 'Extraterrestrial radiation',
                         'Rn': 'Net radiation',
                         'ET0': 'Reference evapotranspiration',
                         'month': 'Month'}

        if long:
            # Fall back to the raw name instead of failing on features that
            # have no descriptive name.
            vals = [', '.join(feature_dict.get(name, name) for name in names)
                    for names in combos.values()]
        else:
            vals = [', '.join(names) for names in combos.values()]

        # Combinations are numbered 1..n in file order; the json keys are not kept.
        index = pd.Index(np.arange(1, len(vals) + 1), name='Combo')
        features = pd.DataFrame({'Features': vals}, index=index)

        # `display` only exists in IPython/Jupyter sessions. Elsewhere the
        # table is still returned, it is just not rendered.
        try:
            show = display
        except NameError:
            show = None

        if show is not None:
            show(features.style.set_properties(subset=['Features'], **{'width': '500px'}))

        return features

    def compare_combinations(self,
                             train_test='test'):
        '''
        Returns a dataframe containing the scores of all feature combinations of the test or train dataset

        The rows are sorted by 'feature_combo' and indexed 0..n-1 in that order.
        '''
        scores = self.get_scores()
        subset = scores[scores['train_test'] == train_test]

        # Sort first, then renumber, so the index follows the sorted order.
        return subset.sort_values(by='feature_combo').reset_index(drop=True)

    def plot_combo_scores(self,
                          ax,
                          title=None,
                          metric = 'RMSE'):

        '''
        Plot a bar plot comparing the scores of all feature combinations.

        For every combination a 'Train' and a 'Test' bar are drawn side by side.
        The groups are spread evenly over the x range [1, 10] and labelled 1..n.

        Parameters:
        -----------

        ax: matplotlib.axes
            Axes to draw on. It is modified in place; nothing is returned.

        title: str
            Plot title. Defaults to "<metric> per combination".

        metric: str
            Name of metric to be used. It must be a column of scores.csv.
        '''

        train_scores = self.compare_combinations(train_test = 'train')[metric]
        test_scores = self.compare_combinations(train_test = 'test')[metric]

        bar_width = 0.2
        half_width = bar_width*0.5

        # One bar group per combination found in the train scores
        n_groups = train_scores.shape[0]
        centers = np.linspace(1, 10, n_groups)

        # Train bar on the left of each group centre, test bar on the right
        ax.bar(x = centers - half_width, height = train_scores.values, label = 'Train',
               color = 'skyblue', width = bar_width)
        ax.bar(x = centers + half_width, height = test_scores.values, label = 'Test',
               color = 'navy', width = bar_width)

        # Annotation
        ax.set_xlabel('Combination')
        ax.set_ylabel(metric)
        ax.set_xticks(centers)
        ax.set_xticklabels([str(i) for i in range(1, n_groups + 1)])
        if title is None:
            title = '{} per combination'.format(metric)
        ax.set_title(title)

        ax.legend(ncol = 2)
        ax.grid(axis='y')