In [1]:
%load_ext nb_black

In [2]:
import pandas as pd
import numpy as np
import random

<IPython.core.display.Javascript object>

In [3]:
ex_df = pd.DataFrame(
    {
        "State": ["Alabama", "Alaska", "Arizona", "Arkansas", "California"],
        "Region": ["East", "West", "West", "East", "West"],
    }
)

<IPython.core.display.Javascript object>

In [4]:
ex_df

Unnamed: 0,State,Region
0,Alabama,East
1,Alaska,West
2,Arizona,West
3,Arkansas,East
4,California,West


<IPython.core.display.Javascript object>

In [5]:
class indexby:
    def __init__(self, df, index_column, index_val, partition_column):
        self.df = df
        self.index_column = index_column
        self.index_val = index_val
        self.partition_column = partition_column
        self.part_dict = {}
        for val in df[self.partition_column].unique():
            self.part_dict[val] = df[df[self.partition_column] == val][
                self.index_column
            ].reset_index(drop=True)[self.index_val - 1]

    def set_value(self, partition):
        return self.part_dict[partition]

    def execute(self):
        return np.vectorize(self.set_value)(self.df[self.partition_column])

<IPython.core.display.Javascript object>

In [6]:
indexby(ex_df, "State", 1, "Region").execute()

array(['Alabama', 'Alaska', 'Alaska', 'Alabama', 'Alaska'], dtype='<U7')

<IPython.core.display.Javascript object>

## Testing Dictionary Creation Methods in hopes of avoiding for loop

### For Loop

In [7]:
def run_for_loop(ex_df):
    part_dict = {}
    for val in ex_df["Region"].unique():
        part_dict[val] = ex_df[ex_df["Region"] == val]["State"].reset_index(drop=True)[
            0
        ]
    return

<IPython.core.display.Javascript object>

In [8]:
%timeit run_for_loop(ex_df)

864 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

### List Comprehension

In [9]:
def run_list_comp(ex_df):
    part_dict = {
        val: ex_df[ex_df["Region"] == val]["State"].reset_index(drop=True)[0]
        for val in ex_df["Region"].unique()
    }
    return part_dict

<IPython.core.display.Javascript object>

In [10]:
%timeit run_list_comp(ex_df)

841 µs ± 11.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

### Numpy Vectorize + pd.to_dict()

In [11]:
df = ex_df
partition_column = "Region"


def get_ib(val):
    index_column = "State"
    index_val = 1
    return df[df[partition_column] == val][index_column].reset_index(drop=True)[
        index_val - 1
    ]


def gen_dict(df, partition_column):
    df = pd.DataFrame(
        {
            "Partition": pd.Series(df[partition_column].unique()),
            "Values": pd.Series(np.vectorize(get_ib)(df[partition_column].unique())),
        }
    )
    return pd.Series(df.Values.values, index=df.Partition.values).to_dict()

<IPython.core.display.Javascript object>

In [12]:
%timeit gen_dict(df, partition_column)

2.07 ms ± 33.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<IPython.core.display.Javascript object>

## Creating Class which implements list comprehension dictionary creation method.

In [13]:
class indexby_lc:
    def __init__(self, df, index_column, index_val, partition_column):
        self.df = df
        self.index_column = index_column
        self.index_val = index_val
        self.partition_column = partition_column
        self.part_dict = {}

        self.part_dict = {
            val: self.df[self.df[self.partition_column] == val][
                self.index_column
            ].reset_index(drop=True)[self.index_val - 1]
            for val in self.df[self.partition_column].unique()
        }

    def set_value(self, partition):
        return self.part_dict[partition]

    def execute(self):
        return np.vectorize(self.set_value)(self.df[self.partition_column])

<IPython.core.display.Javascript object>

## Creating Class which implements the numpy vectorize + pd.to_dict() dictionary creation method.

In [14]:
class indexby_numpy:
    def __init__(self, df, index_column, index_val, partition_column):
        self.df = df
        self.index_column = index_column
        self.index_val = index_val
        self.partition_column = partition_column

    def get_ib(self, val):
        return self.df[self.df[self.partition_column] == val][
            self.index_column
        ].reset_index(drop=True)[self.index_val - 1]

    def gen_dict(self):
        df = pd.DataFrame(
            {
                "Partition": pd.Series(self.df[self.partition_column].unique()),
                "Values": pd.Series(
                    np.vectorize(self.get_ib)(self.df[self.partition_column].unique())
                ),
            }
        )
        self.part_dict = pd.Series(
            df.Values.values, index=df.Partition.values
        ).to_dict()
        return

    def set_value(self, partition):
        return self.part_dict[partition]

    def execute(self):
        self.gen_dict()
        return np.vectorize(self.set_value)(self.df[self.partition_column])

<IPython.core.display.Javascript object>

## Compare for loop, list comp, and numpy vectorize for speed through entire process.

In [15]:
%timeit indexby(ex_df, 'State', 1, 'Region').execute()

948 µs ± 21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

In [16]:
%timeit indexby_lc(ex_df, 'State', 1, 'Region').execute()

943 µs ± 26.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


<IPython.core.display.Javascript object>

In [17]:
%timeit indexby_numpy(ex_df, 'State', 1, 'Region').execute()

2.2 ms ± 44.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<IPython.core.display.Javascript object>

## Testing for loop, list comp, and numpy vectorize when given 100,000 unique values in partition column

#### This is due to the length of unique values directly impacting the time complexity of the dictionary generation process.

In [18]:
df = pd.DataFrame(
    {
        "State": ["Alabama", "Alaska", "Arizona", "Arkansas", "California"] * 20000,
        "Region": ["East", "West", "West", "East", "West"] * 20000,
        "Rand": random.sample(range(1, 1000000), 100000),
    }
)

<IPython.core.display.Javascript object>

In [19]:
%timeit indexby(df, 'State', 1, 'Rand').execute()

54 s ± 991 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<IPython.core.display.Javascript object>

In [20]:
%timeit indexby_lc(df, 'State', 1, 'Rand').execute()

54.2 s ± 909 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<IPython.core.display.Javascript object>

In [21]:
%timeit indexby_numpy(df, 'State', 1, 'Rand').execute()

52.8 s ± 145 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<IPython.core.display.Javascript object>