In [None]:
# Comparison of DASK, SWIFTER and NORMAL (pandas) APPLY

# Sample data
data=pd.DataFrame(np.random.rand(1000000,100),columns=['col'+str(x) for x in range(100)])
data['id']=np.random.randint(1000,1000000)

# NORMAL  APPLY
%time data['resultnormal']=data.apply(lambda row : 0 if (row['col0']+row['col1']+row['col3']+row['col4']+row['col5']) < 0.5 else 1 if (row['col2']+row['col4']+row['col6']+row['col9']+row['col10']<0.8) else 2, axis=1)
# 1min 28s

# DASK  APPLY
import dask.dataframe as dd
# Wrap the pandas frame in a dask DataFrame split into 128 partitions.
ddf = dd.from_pandas(data, npartitions=128)
def myFunc(row):
    """Classify one row into 0, 1 or 2 from two five-column sums.

    Returns 0 when col0 + col1 + col3 + col4 + col5 is below 0.5;
    otherwise 1 when col2 + col4 + col6 + col9 + col10 is below 0.8;
    otherwise 2. `row` only needs to support lookup by column name.
    """
    first_total = row['col0'] + row['col1'] + row['col3'] + row['col4'] + row['col5']
    if first_total < 0.5:
        return 0

    # The second sum is only evaluated once the first test has failed.
    second_total = row['col2'] + row['col4'] + row['col6'] + row['col9'] + row['col10']
    return 1 if second_total < 0.8 else 2

ddf['resultdask']=ddf.apply(myFunc, axis=1)
%time ddf.compute()
# 1min 33s 4 partitions
# 1min 29s 8 partitions
# 1min 23s 16 partitions
# 1min 29s 128 partitions

# SWIFTER
import swifter
%time data['resultnormal'] = data.swifter.apply(lambda row : 0 if (row['col0']+row['col1']+row['col3']+row['col4']+row['col5']) < 0.5 else 1 if (row['col2']+row['col4']+row['col6']+row['col9']+row['col10']<0.8) else 2, axis=1)
# 41.5 s

# NORMAL

### Normal Pandas Apply happens sequentially

https://github.com/pandas-dev/pandas/blob/master/pandas/core/apply.py

    klass: Type[FrameApply]
    if axis == 0:
        klass = FrameRowApply
    elif axis == 1:
        klass = FrameColumnApply
        
### The function that is called is `apply_broadcast`, which uses a Python `for` loop
# Excerpt: applies self.f to each column of `target` in a Python-level loop
# and stores every result in the matching column of a preallocated array.
# NOTE(review): no return statement appears in this excerpt although the
# signature declares a "DataFrame" result — the tail of the function is
# presumably omitted; check the linked pandas source.
def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
        # Output buffer with the same shape and dtype as target.values.
        result_values = np.empty_like(target.values)

        # axis which we want to compare compliance
        result_compare = target.shape[0]

        # One call to self.f per column, executed one after another.
        for i, col in enumerate(target.columns):
            res = self.f(target[col])
            ares = np.asarray(res).ndim

            # must be a scalar or 1d
            if ares > 1:
                raise ValueError("too many dims to broadcast")
            elif ares == 1:

                # must match return dim
                if result_compare != len(res):
                    raise ValueError("cannot broadcast result")

            # A scalar result is broadcast down the whole column by numpy.
            result_values[:, i] = res
            
# DASK

### DASK is a behemoth which consists of a lot of configurations and parameters

#### npartitions
This sets the number of separate groups (partitions) that the index of the pandas dataframe is split into, i.e. how many chunks the dataframe is divided into.

#### ddf.apply and compute
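Dask's apply is lazily evaluated: calling apply does not execute anything, it only builds the execution graph. The work runs when compute() is called, and how it is parallelized depends on the scheduler chosen.
The default scheduler for dask dataframes is THREADS. Because the row-wise function is pure Python, the threads contend for the GIL, which is why the dask timings above are no better than the normal apply.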

# SWIFTER

### SWIFTER is nothing but a small scale dask
It has two main defaults that make it seem faster
 * Scheduler : Processes instead of Threads
 * Partitions : cpu_count() * 2

### You can arrive at the same performance level with DASK with the following changes
```
ddf.map_partitions(myFunc,axis=1,meta=('resultdask',np.float32))
with dask.config.set({"multiprocessing.context": "fork"}):
    %time ddf.compute(scheduler="processes")
    
#41 seconds
```