# Looping through pandas dataframes

There are several ways to loop through a pandas DataFrame; however, they can differ vastly in performance!

In [5]:
from pathlib import Path

import numpy as np
import pandas as pd

from IPython.display import Markdown, display


In [6]:
def printmd(string):
    """Show *string* in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

In [77]:
# Directory that loadData reads "atlanticInterpolated.csv" from.
dataPath = Path("../data/raw/")
# Path to ../data/interim/; not referenced by the code visible in this excerpt.
interimDataPath = Path("../data/interim/")

def loadData(duplicateN:int=1):
    """Read the interpolated CSV and build a long (row-stacked) dataframe.

    Steps, in order:

    1. Read ``atlanticInterpolated.csv`` from the module-level ``dataPath``.
    2. Remember the ``z`` column values of the freshly read frame.
    3. Drop every column whose value at index label 30 is greater than its
       value at index label 0 minus 0.1.
    4. Stack ``duplicateN`` copies of the remaining frame on top of each
       other (rows, not columns). The index is not reset here, so labels
       repeat when ``duplicateN > 1``.
    5. Take all columns except the first as a plain array and average its
       first two rows column-wise.

    Returns:
        ``(df, temps, z, surfaceTemps)`` - the stacked frame, the array of
        all columns but the first, the ``z`` values from step 2 (not
        duplicated), and the per-column mean of the first two rows of
        ``temps``.
    """
    frame = pd.read_csv(dataPath / "atlanticInterpolated.csv")
    depths = frame.z.values

    # Columns failing the row-30 vs row-0 comparison are removed in one go.
    unwanted = [
        name
        for name in frame.columns
        if frame.loc[30, name] > frame.loc[0, name] - 0.1
    ]
    frame = frame.drop(columns=unwanted)

    # For this example we stack copies along the rows (not the columns) to
    # produce a long dataframe.
    frame = pd.concat([frame] * duplicateN, axis=0)

    values = frame.iloc[:, 1:].values
    surface = values[:2, :].mean(axis=0)
    return frame, values, depths, surface

In [80]:
print("Load data")
# Build the frame from ten row-stacked copies of the data.
df, temps, z, surfaceTemps = loadData(duplicateN=10)
# The stacked copies repeat their index labels; replace them with 0..len(df)-1
# so every row has a unique label.
df.index = pd.RangeIndex(df.shape[0])
print("Data loaded")

Load data
Data loaded


# Loop using `iloc`

In [81]:
def iforLoop(df):
    """Flag rows above the surface threshold, reading each row via ``iloc``.

    For every integer position the full row is fetched with ``df.iloc`` and
    the ``'0:June:2020'`` entry is then picked out by label. A row is flagged
    ``True`` when that value is greater than ``surfaceTemps[0] - 0.1``
    (``surfaceTemps`` is a module-level name), otherwise ``False``.

    The flags are written to ``df['mld']`` in place. The returned frame holds
    only the ``'z'``, ``'0:June:2020'`` and ``'mld'`` columns.
    """
    col = '0:June:2020'
    nRows = df.shape[0]
    flags = [
        bool(df.iloc[pos].loc[col] > surfaceTemps[0]-0.1)
        for pos in range(nRows)
    ]
    df['mld'] = flags
    return df.loc[:, ['z', col, 'mld']]

# Loop using `itertuples`

In [84]:
def itertuplesLoop(df):
    """Flag rows above the surface threshold, iterating with ``itertuples``.

    Each row is visited as a tuple (index excluded) and the
    ``'0:June:2020'`` value is read by position. A row is flagged ``True``
    when that value is greater than ``surfaceTemps[0] - 0.1``
    (``surfaceTemps`` is a module-level name), otherwise ``False``.

    The flags are written to ``df['mld']`` in place. The returned frame holds
    only the ``'z'``, ``'0:June:2020'`` and ``'mld'`` columns.
    """
    col = '0:June:2020'
    # The column position and the threshold do not change while iterating,
    # so look them up once instead of once per row.
    colPos = df.columns.get_loc(col)
    threshold = surfaceTemps[0] - 0.1
    mld = [
        bool(row[colPos] > threshold)
        for row in df.itertuples(index=False)
    ]
    df['mld'] = mld
    return df.loc[:, ['z', col, 'mld']]

# Loop using the `index` with `loc`

In [85]:
def forLoop(df):
    """Flag rows above the surface threshold, reading cells via ``loc``.

    Walks the index labels of ``df`` and reads the ``'0:June:2020'`` cell of
    each row with ``df.loc[label, col]``. A row is flagged ``True`` when that
    value is greater than ``surfaceTemps[0] - 0.1`` (``surfaceTemps`` is a
    module-level name), otherwise ``False``.

    The flags are written to ``df['mld']`` in place. The returned frame holds
    only the ``'z'``, ``'0:June:2020'`` and ``'mld'`` columns.
    """
    col = '0:June:2020'
    flags = [
        bool(df.loc[label, col] > surfaceTemps[0]-0.1)
        for label in df.index
    ]
    df['mld'] = flags
    return df.loc[:, ['z', col, 'mld']]


# Loop using `apply`

In [87]:
def rowFunc(row,col):
    """Return whether ``row``'s entry for ``col`` exceeds the surface threshold.

    The threshold is ``surfaceTemps[0] - 0.1``, where ``surfaceTemps`` is a
    module-level name. The result is a plain ``True`` or ``False``.
    """
    return bool(row.loc[col] > surfaceTemps[0]-0.1)

def applyFunc(df):
    """Flag rows above the surface threshold using ``Series.apply``.

    Applies an element-wise comparison to the ``'0:June:2020'`` column: an
    element is flagged when it is greater than ``surfaceTemps[0] - 0.1``
    (``surfaceTemps`` is a module-level name).

    The flags are written to ``df['mld']`` in place. The returned frame holds
    only the ``'z'``, ``'0:June:2020'`` and ``'mld'`` columns.
    """
    col = '0:June:2020'
    # The threshold is the same for every element, so compute it once rather
    # than inside the lambda.
    threshold = surfaceTemps[0] - 0.1
    df['mld'] = df.loc[:, col].apply(lambda x: x > threshold)
    return df.loc[:, ['z', col, 'mld']]


# Compare results

In [89]:
# Time each implementation on the same frame. "-n 1 -r 3" asks %timeit for
# three runs of a single call each. Every function writes df['mld'] in place,
# so that column already exists on df from the second call onwards.
printmd("**iloc loop**")
%timeit -n 1 -r 3 iforLoop(df)
printmd("**itertuples loop**")
%timeit -n 1 -r 3 itertuplesLoop(df)
printmd("**loc loop**")
%timeit -n 1 -r 3 forLoop(df)
printmd("**apply loop**")
%timeit -n 1 -r 3 applyFunc(df)

**iloc loop**

2 s ± 43.7 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


**itertuples loop**

611 ms ± 19.3 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


**loc loop**

19.2 ms ± 62.9 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)


**apply loop**

1.99 ms ± 245 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)


## A factor of roughly 1000 between the slowest and fastest of these methods!