
### Timing binning arrays requiring statitics for both; given same grid range from single array data points

Amnah Siddiqa - 09022021



In [1]:
import pandas as pd
import sys
import os 
import numpy as np
from scipy import stats
import time

#### My parameters memory ref
- bins int or sequence of scalars, optional
    
    If bins is an int, it defines the number of equal-width bins in the given range (10 by default). 
    
    If bins is a sequence, it defines the **bin edges**, including the rightmost edge, allowing for non-uniform bin widths. Values in x that are smaller than lowest bin edge are assigned to bin number 0, values beyond the highest bin are assigned to bins[-1]. If the bin edges are specified, the number of bins will be, (nx = len(bins)-1)
    
- x(N,) array_like
    A sequence of values to be binned.

- **values(N,)** array_like or list of (N,) array_like
    The data on which the statistic will be computed. This must be the same shape as x, or a set of sequences - each the same shape as x. If values is a set of sequences, the statistic will be computed on each independently.  
        

In [2]:
data = np.loadtxt("/Users/siddia/Desktop/TestBinning/mzpos_All_09022021.txt",)

In [3]:
print("shape of data:",data.shape)

print("datatype of data:",data.dtype)

shape of data: (416137, 2)
datatype of data: float64


In [4]:
#get the mass column in mass
mass=data[:,0]

In [5]:
density=data[:,1]

In [6]:
ranges=np.arange(np.min(mass), np.max(mass)+0.0001, 0.0001)  #[0, 0.1, 0.2, 0.3 ...]
ranges=np.round(ranges , decimals=4)

ranges

array([  50.058 ,   50.0581,   50.0582, ..., 1192.1485, 1192.1486,
       1192.1487])

#median takes more time because by defult it flattens each  group orn bin 
You're right, the issue is that axis=0 isn't passed in by default (and perhaps it should be... although it may be difficult to know what to pass in as default for arbitrary functions?).

As median flattens the array (and produces a number), and since:

test_g.aggregate(lambda x: 8)
makes everything 8, this behaviour is "expected" in some sense (and sometimes might be what you want) so we probably don't want an exception...?

### With Scipy solution  

In [7]:
start_time = time.time()
mymed, myedges, mybins = stats.binned_statistic(mass,mass, statistic='median', bins=ranges)
print("--- %s seconds ---" % (time.time() - start_time))

--- 302.328901052475 seconds ---


In [8]:
mymed.shape

(11420907,)

In [9]:
mymed

array([  50.058 ,   50.0581,   50.0582, ...,       nan,       nan,
       1192.1487])

In [10]:
mymed, myedges, mybins = stats.binned_statistic(mass,density, statistic='sum', bins=ranges)

In [11]:
mymed

array([0.23, 0.22, 0.21, ..., 0.  , 0.  , 0.07])

In [12]:
mymed.shape

(11420907,)

In [14]:
#preffered way
def twocalls():
    means, bin_edges, binnumber = stats.binned_statistic(
        x, values=x, statistic='mean', bins=bins)
    medians, bin_edges, binnumber = stats.binned_statistic(
        x, values=x, statistic='median', bins=bins)
    return means, medians

### With numpy.digitize solution

In [13]:

start_time = time.time()
### With numpy.digitize
import numpy as np
#get the mass column in mass
mass=data[:,0]
density=data[:,1]
bins=ranges
#x = np.tile(np.array([0.2, 9., 6.4, 3.0, 1.6]), 100000)
#bins = np.array([0.0, 1.0, 2.5, 10.0])

def binstats(x,y, bins):
    
    inds = np.digitize(x, bins)
    statistics = []
    #statisticsSum = []


    
    binnumber = []
    seen = set()
    for bin_idx in inds:
        if bin_idx not in seen:
            bin_arr_mass = mass[inds==bin_idx]
            bin_arr_dens = density[inds==bin_idx]
            statistics.append([np.median(bin_arr_mass), np.median(bin_arr_dens)])
            binnumber.append(bin_idx)
            seen.add(bin_idx)
    return statistics, binnumber

statistics, binnumber = binstats(mass,density, bins)
#for (mean, median), bin_idx in zip(statistics, binnumber):
#   print('{b}: {mean:.2f} {median:.2f}'.format(b=bin_idx, mean=mean, median=median))
print("--- %s seconds ---" % (time.time() - start_time))

--- 152.00651907920837 seconds ---


In [15]:
print(statistics[:3]) 

[[50.058, 0.23], [50.0581, 0.22], [50.0582, 0.21]]


### References
https://numpy.org/doc/stable/reference/generated/numpy.digitize.html

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html

https://stackoverflow.com/questions/23070329/can-numpys-digitize-function-output-the-mean-or-median


#### median calculations problems
https://github.com/pandas-dev/pandas/issues/1989