# Modified code, adapted from the public `fitter` package source:
https://fitter.readthedocs.io/en/latest/_modules/fitter/fitter.html

In [1]:
import sys
import math
import threading
from datetime import datetime

import scipy.stats
import numpy as np
import pylab
import pandas as pd
from scipy.stats import entropy as kl_div
from scipy import optimize

from scipy.optimize import minimize
import time

In [2]:
# Names exported when this module is star-imported.
__all__ = ['get_common_distributions', 'get_distributions', 'Fitter']

def get_distributions():
    """Return the names of every ``scipy.stats`` attribute that has a ``fit`` member.

    :return: list of attribute names (strings), in the order given by
        ``dir(scipy.stats)``
    """
    # getattr replaces the former eval() of a built-up source string: same
    # lookup, without executing dynamically assembled code.
    return [name for name in dir(scipy.stats)
            if "fit" in dir(getattr(scipy.stats, name))]


def get_common_distributions():
    """Return a fresh list with the names of the "common" distributions.

    :class:`Fitter` uses this list when it is given ``distributions="common"``.
    """
    common_names = ('gamma', 'expon', 'exponpow', 'lognorm')
    return list(common_names)
        
def loglikelihood_neg(param, dist):
    """Return the negative log-likelihood of some samples under a distribution.

    :param param: sequence of parameters, unpacked as positional arguments
        after the samples in the ``logpdf`` call
    :param dist: indexable pair; ``dist[0]`` is an object providing
        ``logpdf`` and ``dist[1]`` holds the samples to evaluate
    :return: ``-sum(logpdf(samples, *param))``
    """
    model = dist[0]
    samples = dist[1]
    log_densities = model.logpdf(samples, *param)
    return -np.sum(log_densities)

In [1]:
class Fitter(object):
    """Fit several ``scipy.stats`` distributions to a data sample and rank them.

    The sample is trimmed to ``[xmin, xmax]`` and histogrammed with density
    normalisation. :meth:`fit` then fits every candidate distribution and
    stores, per distribution, the fitted parameters, the fitted PDF evaluated
    at the bin centres, and a set of scores collected in :attr:`df_errors`.

    :param data: sample to fit; converted with ``np.array``
    :param xmin: lower bound of the data kept for fitting; defaults to the
        minimum of the sample
    :param xmax: upper bound of the data kept for fitting; defaults to the
        maximum of the sample
    :param int bins: ``bins`` argument handed to ``np.histogram``
    :param distributions: ``None`` (all distributions returned by
        ``get_distributions``), the string ``"common"`` (the list returned by
        ``get_common_distributions``), a single distribution name, or a list
        of names
    :param bool verbose: print one line per fitted / skipped distribution
    :param timeout: maximum number of seconds to wait for one fit
    :param density: accepted for compatibility but ignored; the histogram is
        always density-normalised (see the comment in ``__init__``)
    """

    def __init__(self, data, xmin=None, xmax=None, bins=100,
                 distributions=None, verbose=True, timeout=5000,
                 density=True):
        self.timeout = timeout
        # USER input
        self._data = None

        # Issue https://github.com/cokelaer/fitter/issues/22 asked for setting
        # the density to False in the fitting and plotting. The fitting is
        # performed using the PDF of scipy, so the data would still need to be
        # normalised to be comparable. Therefore density stays True for now.
        self._density = True

        #: list of distributions to test
        self.distributions = distributions
        if distributions is None:
            self._load_all_distributions()
        elif isinstance(distributions, str):
            # A bare string is either the "common" shortcut or a single name.
            if distributions == "common":
                self.distributions = get_common_distributions()
            else:
                self.distributions = [distributions]

        self.bins = bins
        self.verbose = verbose

        self._alldata = np.array(data)
        self._xmin = self._alldata.min() if xmin is None else xmin
        self._xmax = self._alldata.max() if xmax is None else xmax

        self._trim_data()
        self._update_data_pdf()

        # Other attributes
        self._init()

    def _init(self):
        """Reset every per-distribution result container to an empty dict."""
        self.fitted_param = {}
        self.fitted_pdf = {}
        self._fitted_errors = {}
        self._log_lik = {}
        self._max_lik = {}
        self._aic = {}
        self._bic = {}
        self._kldiv = {}

    def _update_data_pdf(self):
        """Recompute the histogram of the trimmed data.

        Sets :attr:`y` to the histogram values and :attr:`x` to the list of
        bin centres.
        """
        # np.histogram returns N+1 bin edges; reduce them to N bin centres so
        # that x and y have the same length.
        self.y, edges = np.histogram(
            self._data, bins=self.bins, density=self._density)
        self.x = [(left + right) / 2.
                  for left, right in zip(edges[:-1], edges[1:])]

    def _trim_data(self):
        """Keep only the values of the full sample lying in ``[xmin, xmax]``."""
        in_range = np.logical_and(
            self._alldata >= self._xmin, self._alldata <= self._xmax)
        self._data = self._alldata[in_range]

    def _get_xmin(self):
        """Return the current lower bound used to trim the data."""
        return self._xmin

    def _set_xmin(self, value):
        """Set the lower bound, then re-trim the data and rebuild the histogram.

        :param value: new lower bound; ``None`` or any value below the sample
            minimum resets the bound to the sample minimum
        """
        data_min = self._alldata.min()
        if value is None or value < data_min:
            value = data_min
        self._xmin = value
        self._trim_data()
        self._update_data_pdf()

    def _load_all_distributions(self):
        """Replace the :attr:`distributions` attribute with all scipy distributions"""
        self.distributions = get_distributions()

    def hist(self):
        """Draw normed histogram of the data using :attr:`bins`

        .. plot::

            >>> from scipy import stats
            >>> data = stats.gamma.rvs(2, loc=1.5, scale=2, size=20000)
            >>> # We then create the Fitter object
            >>> import fitter
            >>> fitter.Fitter(data).hist()

        """
        pylab.hist(self._data, bins=self.bins, density=self._density)
        pylab.grid(False)

    def fit(self, amp=1):
        r"""Loop over distributions and find best parameter to fit the data for each

        When a distribution is fitted onto the data, we populate:

            - :attr:`df_errors` : one row per distribution with the columns

              * ``sumsquare_error``: sum of the square errors between the
                histogram and the fitted distribution, i.e.
                :math:`\sum_i \left( Y_i - pdf(X_i) \right)^2`
              * ``log_lik``: *negative* log-likelihood of the bin centres under
                the fitted parameters (lower is better)
              * ``max_lik``: minimum of the negative log-likelihood found by a
                Nelder-Mead search started from a vector of ones
              * ``aic``, ``bic``: information criteria
              * ``kl_div``: ``scipy.stats.entropy`` of the fitted PDF against
                the histogram
            - :attr:`fitted_param` : the parameters that best fit the data
            - :attr:`fitted_pdf` : the PDF generated with the parameters that best fit the data

        Indices of the dataframe contain the name of the distribution. A
        distribution whose fit raises or times out gets ``np.inf`` in every
        column.

        :param amp: unused; kept so existing callers keep working
        """
        for distribution in self.distributions:
            try:
                # Plain attribute lookup; an unknown name raises AttributeError
                # and is reported as skipped below.
                dist = getattr(scipy.stats, distribution)

                # dist.fit may take a while or hang with some distributions,
                # so it runs in a worker thread bounded by self.timeout.
                param = self._timed_run(
                    dist.fit, distribution, args=self._data)

                # Relies on fit returning parameters in the order pdf expects.
                pdf_fitted = dist.pdf(self.x, *param)

                self.fitted_param[distribution] = param[:]
                self.fitted_pdf[distribution] = pdf_fitted

                # calculate error
                sq_error = np.sum((pdf_fitted - self.y) ** 2)

                # calculate information criteria
                neg_log_lik = -np.sum(dist.logpdf(self.x, *param))
                k = len(param)        # num parameters in model
                n = len(self._data)   # num examples in data set
                # AIC = 2k - 2 ln(L). neg_log_lik is already -ln(L), so it is
                # added; subtracting it would make worse fits score better.
                aic = 2 * k + 2 * neg_log_lik
                bic = n * np.log(sq_error / n) + k * np.log(n)

                kullback_leibler = kl_div(pdf_fitted, self.y)

                # The [dist, x] pair travels as ONE extra argument to the
                # objective, hence the explicit 1-tuple.
                max_lik = minimize(loglikelihood_neg, x0=[1] * k,
                                   args=([dist, self.x],),
                                   method='nelder-mead').fun

                if self.verbose:
                    print("Fitted {} distribution with error={})".format(
                          distribution, sq_error))

                # compute some errors now
                self._fitted_errors[distribution] = sq_error
                self._log_lik[distribution] = neg_log_lik
                self._max_lik[distribution] = max_lik
                self._aic[distribution] = aic
                self._bic[distribution] = bic
                self._kldiv[distribution] = kullback_leibler
            except Exception as err:
                if self.verbose:
                    # Report the real cause; a timeout is only one of them.
                    print("SKIPPED {} distribution ({})".format(
                        distribution, err))
                # if we cannot compute the error, set it to large values
                self._fitted_errors[distribution] = np.inf
                self._log_lik[distribution] = np.inf
                self._max_lik[distribution] = np.inf
                self._aic[distribution] = np.inf
                self._bic[distribution] = np.inf
                self._kldiv[distribution] = np.inf

        # Built once, after all distributions were processed.
        self.df_errors = pd.DataFrame({'sumsquare_error': self._fitted_errors,
                                       'log_lik': self._log_lik,
                                       'max_lik': self._max_lik,
                                       'aic': self._aic,
                                       'bic': self._bic,
                                       'kl_div': self._kldiv})

    def plot_pdf(self, seg_type = "", names=None, Nbest=5, lw=2, method="sumsquare_error"):
        """Plots Probability density functions of the distributions

        :param str seg_type: prefix placed before " Segment Length" in the
            x-axis label
        :param str,list names: names can be a single distribution name, or a list
            of distribution names, or kept as None, in which case, the first Nbest
            distribution will be taken (default to best 5)
        :param int Nbest: number of best distributions to draw when *names*
            is not given; must be positive
        :param lw: line width handed to ``pylab.plot``
        :param str method: column of :attr:`df_errors` used for ranking; also
            selects the plot title

        """
        assert Nbest > 0
        Nbest = min(Nbest, len(self.distributions))

        if isinstance(names, list):
            for name in names:
                pylab.plot(self.x, self.fitted_pdf[name], lw=lw, label=name)
        elif names:
            pylab.plot(self.x, self.fitted_pdf[names], lw=lw, label=names)
        else:
            names = self.df_errors.sort_values(by=method).index[0:Nbest]
            for name in names:
                if name in self.fitted_pdf:
                    pylab.plot(self.x, self.fitted_pdf[name], lw=lw, label=name)
                else:
                    print("%s was not fitted. no parameters available" % name)

        title_map = {"sumsquare_error": "Sum Squared Error", "log_lik": "Log Liklihood", 
                     "aic": "Akaike Information Criterion", "bic": "Bayesian Information Criterion", 
                    "kl_div": "Kullback-Leibler Divergence", "max_lik": "Maximum Likelihood"}

        pylab.xlabel(seg_type + " Segment Length")
        pylab.ylabel('Frequency')
        # Fall back to the raw method name for columns without a pretty title.
        pylab.title(title_map.get(method, method))
        pylab.grid(False)
        pylab.legend()

    def get_best(self, method='sumsquare_error'):
        """Return best fitted distribution and its parameters

        :param str method: column of :attr:`df_errors` to rank by (ascending)
        :return: a dictionary with one key (the distribution name) and its
            parameters

        """
        # Sort ascending on the requested score and take the first row.
        name = self.df_errors.sort_values(method).iloc[0].name
        params = self.fitted_param[name]
        return {name: params}

    def summary(self, Nbest=5, seg_type = "", names = None, lw=2, plot=True, method="sumsquare_error"):
        """Plots the distribution of the data and Nbest distribution

        :param int Nbest: number of best distributions to report
        :param str seg_type: forwarded to :meth:`plot_pdf`
        :param names: forwarded to :meth:`plot_pdf`; it does not affect the
            returned rows
        :param lw: forwarded to :meth:`plot_pdf`
        :param bool plot: when False, skip all drawing
        :param str method: column of :attr:`df_errors` to rank by (ascending)
        :return: the rows of :attr:`df_errors` for the Nbest distributions

        """
        if plot:
            pylab.clf()
            self.hist()
            self.plot_pdf(Nbest=Nbest, seg_type = seg_type, names = names, lw=lw, method=method)
            pylab.grid(False)

        Nbest = min(Nbest, len(self.distributions))
        best_names = self.df_errors.sort_values(by=method).index[0:Nbest]
        return self.df_errors.loc[best_names]

    def _timed_run(self, func, distribution, args=(), kwargs=None, default=None):
        """Run *func* in a worker thread and wait at most :attr:`timeout` seconds.

        http://stackoverflow.com/questions/492519/timeout-on-a-python-function-call

        :param func: callable to execute
        :param str distribution: name used in the timeout error message
        :param args: passed to *func* as ONE positional argument (it is not
            unpacked)
        :param dict kwargs: keyword arguments for *func*; ``None`` means none
        :param default: initial value of the result slot
        :return: whatever *func* returned
        :raises RuntimeError: if *func* is still running after the timeout
        :raises Exception: any exception raised by *func* is re-raised as is
        """
        call_kwargs = {} if kwargs is None else kwargs

        class InterruptableThread(threading.Thread):
            def __init__(self):
                threading.Thread.__init__(self)
                # Daemon: a fit that never returns must not keep the
                # interpreter alive at exit.
                self.daemon = True
                self.result = default
                self.error = None

            def run(self):
                try:
                    self.result = func(args, **call_kwargs)
                except Exception as err:
                    self.error = err

        it = InterruptableThread()
        it.start()
        it.join(self.timeout)

        if it.error is not None:
            # Hand the worker's own exception (type and traceback) to the caller.
            raise it.error

        # Thread.isAlive() was removed in Python 3.9; is_alive() is the
        # supported spelling.
        if it.is_alive():
            # The worker cannot be stopped from here; it is abandoned.
            raise RuntimeError(
                "fitting {} did not finish within {} seconds".format(
                    distribution, self.timeout))
        return it.result

In [5]:
# Demo: draw 1000 samples from a normal distribution (mean 17, std 1) ...
y_observed = np.random.normal(loc = 17, scale = 1, size = 1000)
# ... and fit only the "common" distributions to them, without per-fit output.
intermediate_fit = Fitter(y_observed, distributions = "common", verbose=False)
# _set_xmin clamps values below the sample minimum to that minimum; with
# samples centred on 17, 0.05 is presumably below it, making this a no-op trim.
intermediate_fit._set_xmin(0.05)
intermediate_fit.fit()

  if it.isAlive():


1595.4275402341702
1578.927183493661
1590.4275402341698
1524.3338166399624
1533.0875261607096
1501.476964696145
1460.2062482599772
1439.493801570089
1380.490543357426
1380.9443916634768
1303.50503296078
1214.7556387059822
1211.5924258521968
1123.5731863918775
1108.893369485084
1012.2821260938871
931.7346190557239
807.347570197826
814.4229040998022
709.6507783592704
605.3488566838854
582.21887513296
496.96370667648506
485.65786773228456
415.7379229791972
385.609890007237
351.00745251660857
348.89167191952777
360.27715524048824
352.55363870560427
382.0455279015929
356.8517454660526
359.63198582096766
349.4640119264729
365.89141082382935
347.50620821284394
354.79614342480744
347.4905955478144
351.2398178051229
347.4549452645751
349.44880652891294
347.2665765802279
350.0450757353618
346.8782347523248
348.27552758412594
347.00155669154395
347.3355730014943
346.98525459206917
346.60429489824236
346.6587068373568
346.61616286716355
347.752053067113
346.6624557993645
346.38766498854324
346.279

202.85552299826082
202.8529534724081
202.85188202017696
202.8582330888381
202.85606524090025
202.8520231833518
202.85133596109318
202.8515495934876
202.85066268387902
202.8503200320627
202.85467521149377
202.85118916371542
202.85037998039218
202.849928445832
202.84975090631048
202.84870596538144
202.848799935434
202.85183557271307
202.84967993990776
202.84923228158036
202.84851632308886
202.84837996684874
202.8471938741278
202.84638140013607
202.84823710123388
202.84673521825917
202.8513954623547
202.84732642408255
202.8461852878565
202.84705521184313
202.845169034282
202.84512748488197
202.84500762026073
202.84563743630662
202.84364631129577
202.84271918817512
202.84914877949512
202.84479818151286
202.84571133077887
202.84415629989317
202.84381227820384
202.84347400603642
202.84309639699669
202.84310079471703
202.84185167949127
202.84188712173207
202.8440182718881
202.84225247200257
202.8422011740928
202.8413793899027
202.8420686463035
202.84153736869447
202.84145195570702
202.8405457

205.79611212265536
205.796099188967
205.79609110536063
205.79611683439884
205.79610648283418
205.7960901151644
205.79608129530857
205.79607603304277
205.7960661895263
205.7960459885607
205.7960189783815
205.79602214199852
205.79601356830167
205.79602081713554
205.79596126659223
205.79591109596353
205.7959607286674
205.7959270004576
205.79584842549912
205.79577567461024
205.79577491891084
205.79569415136237
205.79564015182214
205.7955002320557
205.79541852842902
205.79522158028882
205.7951489569008
205.79487179319693
205.7946433636193
205.79412923289564
205.79413342513269
205.79353083618074
205.792768888084
205.79248247328144
205.79142621840126
205.79124885589832
205.78982716843294
205.78879928443897
205.7866662969172
205.78575677167836
205.78270269192143
205.7813060823423
205.77699945447156
205.77676991090792
205.77440195236244
205.76940471957008
205.76466184525535
205.76750138669806
205.77613208921173
205.77007362379115
205.7644734537872
205.76726902275578
205.75923408181524
205.75438

202.81509048223785
202.81508499445457
202.81508196826795
202.81507424231074
202.81507819103737
202.81506607870668
202.81505759549523
202.81506269225136
202.81504715117887
202.8150332557113
202.81504857976304
202.81502906121983
202.81501860707456
202.81501608560993
202.81501500165723
202.81498048555142
202.81495122816614
202.81499590163224
202.81497194427666
202.81493349349577
202.81492169006583
202.81487171339185
202.81481831104512
202.81486707244377
202.81484102798615
202.81477848434324
202.81477412849813
202.8146913986463
202.81461506129375
202.81462955872126
202.81463033230662
202.8144108885454
202.81424564264574
202.81452941597146
202.8144372285677
202.81416661795987
202.81401506423376
202.8138854006229
202.81363274421665
202.81334548121964
202.81280657608295
202.81280734509284
202.81209690545296
202.81116780398807
202.81071112844347
202.80933665810045
202.8097108257703
202.80695306274546
202.80562811841858
202.80683893233115
202.80987393109913
202.807775239473
202.80871643653717
2