In [514]:
import numpy as np
from numba import jit
import pandas as pd
from tqdm.notebook import trange
from groupby import GroupBy

In [2]:
def make_data(n,m,i):
    """
    Build random integer test data in two equivalent forms.

    Draws n*m integers with np.random.randint(i, ...) (values in [0, i)),
    arranges them as an (n, m) array and wraps that array in a DataFrame.

    Returns
    -------
    (ndarray, DataFrame) : the (n, m) array and a DataFrame built from it
    """
    flat_values = np.random.randint(i, size=n*m)
    grid = flat_values.reshape((n, m))
    return grid, pd.DataFrame(grid)

In [534]:
# Larger inputs (1000 rows x 10 columns, values in [0, 10)), currently disabled:
# ar1, df1 = make_data(1000, 10, 10)
# ar2, df2 = make_data(1000, 10, 10)
# Small inputs: 30 rows x 5 columns with values in [0, 3), so the join keys collide often.
# NOTE(review): no random seed is set in the visible cells, so the data (and every
# output below) changes on each run.
ar1, df1 = make_data(30, 5, 3)
ar2, df2 = make_data(30, 5, 3)

# Join columns that go with the 10-column inputs above (disabled):
# l_on, r_on = [2,3,4], [6,8,9]
# Join on the first three columns of both sides.
l_on, r_on = [0,1,2], [0,1,2]
# Pre-build the GroupBy objects so Merge can reuse them instead of regrouping.
gb1 = GroupBy(ar1, l_on)
gb2 = GroupBy(ar2, r_on)

# pandas inner merge on the same data, used as the reference result.
df1.merge(df2,  how='inner', left_on=l_on, right_on=r_on)

Unnamed: 0,0,1,2,3_x,4_x,3_y,4_y
0,0,0,1,1,1,0,2
1,0,0,2,2,2,0,1
2,0,0,2,2,2,1,1
3,0,0,2,2,2,1,0
4,0,1,1,1,0,2,1
5,0,1,1,1,0,1,1
6,0,1,1,0,1,2,1
7,0,1,1,0,1,1,1
8,0,1,1,1,1,2,1
9,0,1,1,1,1,1,1


In [535]:
class Merge:
    """
    The Merge object implements different types of merges between two arrays. It utilizes
    the sorting and group-finding characteristics of the GroupBy object to facilitate the
    merges, as well as functions from NumPy's setops suite.
    """

    def __init__(self, l, r, l_on, r_on):
        """
        Parameters
        ----------
        l, r : array or GroupBy
            Left and right inputs. A GroupBy whose `by` equals the matching
            `*_on` argument is reused as-is; anything else is handed to
            GroupBy(input, on) to compute the keys.
        l_on, r_on :
            Join columns for the left / right input, in the form GroupBy expects.
            Both must select the same number of columns (checked in `inner`).
        """
        self.l_on = l_on
        self.r_on = r_on

        self.l_gb = self._as_groupby(l, l_on)
        self.r_gb = self._as_groupby(r, r_on)

    @staticmethod
    def _as_groupby(obj, on):
        """ Return obj if it is already a GroupBy grouped by `on`, else GroupBy(obj, on) """
        if isinstance(obj, GroupBy) and (obj.by == on):
            return obj
        # NOTE(review): a GroupBy grouped on *different* columns also lands here and is
        # passed to GroupBy(...) unchanged (same as before) -- confirm GroupBy accepts that.
        return GroupBy(obj, on)

    def inner(self):
        """
        inner join on the specified columns of the Merge object

        Finds the keys present on both sides with np.intersect1d and delegates the
        row expansion to _inner_merge_njit, whose result is returned unchanged.

        Raises
        ------
        ValueError : if the two sides do not have the same number of key columns
        """
        l_keys, r_keys = np.asarray(self.l_gb.keys), np.asarray(self.r_gb.keys)

        if l_keys.shape[1] != r_keys.shape[1]:
            raise ValueError(
                "l_on and r_on must select the same number of key columns "
                f"(got {l_keys.shape[1]} and {r_keys.shape[1]})"
            )

        # Each key row is compared as one structured scalar, so both sides must share
        # a dtype. Viewing the right keys with the left dtype would reinterpret raw
        # bytes when the dtypes differ; cast both to a common dtype first instead.
        key_dtype = np.result_type(l_keys.dtype, r_keys.dtype)

        # We'll need contiguous arrays to get the proper view of our keys
        l_keyc = np.ascontiguousarray(l_keys, dtype=key_dtype)
        r_keyc = np.ascontiguousarray(r_keys, dtype=key_dtype)

        # One field per key column; identical field names on both sides
        # (the views stored in the GroupBy objects may have differently named fields)
        row_dtype = [(f'{i}', key_dtype) for i in range(l_keyc.shape[1])]
        l_keyv, r_keyv = l_keyc.view(row_dtype)[:, 0], r_keyc.view(row_dtype)[:, 0]

        # Find the intersection between the two key-sets and the indices in each set for those intersections.
        # assume_unique=True relies on each GroupBy holding distinct key rows -- presumably
        # guaranteed by GroupBy; verify.
        intersect, l_idx, r_idx = np.intersect1d(l_keyv, r_keyv,
                                                 assume_unique=True, return_indices=True)

        return _inner_merge_njit(idx=self.l_gb.idx, l_vals=self.l_gb.vals, r_vals=self.r_gb.vals,
                           n_intersect=intersect.shape[0], l_gr_idx=self.l_gb.gr_idx,
                           r_gr_idx=self.r_gb.gr_idx, l_idx=l_idx, r_idx=r_idx)

In [536]:
@jit
def _inner_merge_njit(idx, l_vals, r_vals, n_intersect, l_gr_idx, r_gr_idx, l_idx, r_idx):
    """
    Helper jitted function for inner merge.

    For every matched key, writes the cross product of the left group's rows and the
    right group's rows into the result: key columns first, then the left value
    columns, then the right value columns.

    Parameters
    ----------
    idx : 2D array; the row at a left group's start offset is used as that group's key
    l_vals, r_vals : 2D arrays of non-key columns, sliced by group offsets
    n_intersect : number of matched keys to process
    l_gr_idx, r_gr_idx : group boundary offsets (group g spans gr_idx[g]:gr_idx[g+1])
    l_idx, r_idx : for each matched key, its group number on the left / right side

    Returns
    -------
    2D array with dtype idx.dtype and one row per (left row, right row) pair
    """
    # Size of every matched group on each side
    l_counts = np.diff(l_gr_idx)[l_idx]
    r_counts = np.diff(r_gr_idx)[r_idx]

    # A matched key yields (left rows * right rows) output rows; the running total
    # gives the output row range owned by each key
    rows_per_key = l_counts * r_counts
    out_bounds = np.concatenate((np.array([0]), np.cumsum(rows_per_key)))

    # Column layout: [ key columns | left values | right values ]
    n_key_cols = idx.shape[1]
    l_stop = n_key_cols + l_vals.shape[1]
    n_rows = out_bounds[-1]
    n_cols = n_key_cols + l_vals.shape[1] + r_vals.shape[1]

    out = np.empty((n_rows, n_cols), dtype=idx.dtype)

    for k in range(n_intersect):
        l_first = l_gr_idx[l_idx[k]]
        l_last = l_gr_idx[l_idx[k]+1]
        r_first = r_gr_idx[r_idx[k]]
        r_last = r_gr_idx[r_idx[k]+1]

        l_rows = l_vals[l_first:l_last]
        r_rows = r_vals[r_first:r_last] # the key is taken from the left side only

        block = out[out_bounds[k]:out_bounds[k+1]]

        # Every row of this block shares the same key
        block[:, :n_key_cols] = idx[l_first]

        # Left row j is repeated against the whole right group
        n_right = r_counts[k]
        for j in range(l_counts[k]):
            lo = j * n_right
            hi = lo + n_right
            block[lo:hi, n_key_cols:l_stop] = l_rows[j]
            block[lo:hi, l_stop:] = r_rows

    return out

In [542]:
# Spot-check: row 5 of our inner merge, to compare against the same row of the pandas merge.
Merge(gb1, gb2, l_on, r_on).inner()[5]

array([0, 1, 1, 1, 0, 1, 1])

In [544]:
# Row labelled 5 of the pandas inner merge, as a plain array, for comparison with Merge.inner().
df1.merge(df2,  how='inner', left_on=l_on, right_on=r_on).loc[5].values

array([0, 1, 1, 1, 0, 1, 1])

In [530]:
2      4
3      6
4      7
0_x    0
1_x    4
2_x    4
3_x    6
4_x    7
5_x    2
6_x    4
7_x    5
8_x    5
9_x    9
0_y    9
1_y    2
2_y    6
3_y    6
4_y    3
5_y    9
6_y    4
7_y    3
8_y    6
9_y    7

SyntaxError: invalid syntax (<ipython-input-530-be2c9f7202e1>, line 1)

In [526]:
# Profile a single inner merge using the pre-built GroupBy objects.
%prun Merge(gb1, gb2, l_on, r_on).inner()#.shape

 

In [525]:
# Variant that also times GroupBy construction from the raw arrays (disabled):
# %timeit Merge(ar1, ar2, l_on, r_on).inner()
# Time the merge alone, reusing the pre-built GroupBy objects.
%timeit Merge(gb1, gb2, l_on, r_on).inner()#.shape

567 µs ± 5.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [524]:
# Baseline: time the equivalent pandas inner merge on the same data.
%timeit df1.merge(df2,  how='inner', left_on=l_on, right_on=r_on)

3.87 ms ± 24.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
