# Efficient `pandas`

In [None]:
# NOTE(review): `%pylab inline` is what brings `plt` into scope for the line below
# (there is no explicit matplotlib import in the visible cells). It also star-imports
# numpy/matplotlib names into the namespace; consider `%matplotlib inline` plus
# `import matplotlib.pyplot as plt` once it is confirmed no other cell relies on them.
%pylab inline
plt.style.use("bmh")

In [None]:
import numpy as np
import pandas as pd

In [None]:
import string

# Data

In [None]:
# Seeded, local generator: the random letter index is identical on every
# Restart & Run All, so timings and displayed rows are reproducible.
rng = np.random.default_rng(42)
n_rows = 1000

# Row i holds a = 2*i (even) and b = 2*i + 1 (odd), so 'b' is never zero and
# a / b is defined for every row. The index is a random, non-unique lowercase letter.
df = pd.DataFrame(np.arange(2 * n_rows).reshape((n_rows, 2)),
                  columns=['a', 'b'],
                  index=rng.choice(list(string.ascii_lowercase), n_rows, replace=True))

In [None]:
# Preview the first rows: integer columns 'a' and 'b' under a lowercase-letter index.
df.head()

In [None]:
# Sanity check: the frame was built from a (1000, 2) array, so expect (1000, 2).
df.shape

# Loops

## Naive

In [None]:
def iterate_df(df):
    """Compute ``a / b`` for every row by fetching rows one at a time with ``iloc``.

    This is the deliberately naive baseline: each row is materialised through a
    separate positional lookup before the two values are divided.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'a' and 'b'.

    Returns
    -------
    pandas.Series
        The per-row quotients, named "div_result" and carrying ``df``'s index.
    """
    # One positional ``iloc`` lookup per row, produced lazily in row order.
    rows = (df.iloc[pos] for pos in range(len(df)))
    quotients = [record['a'] / record['b'] for record in rows]
    return pd.Series(quotients, index=df.index, name="div_result")

In [None]:
# Time the positional-lookup loop: 10 loops per run, 5 runs.
%timeit -n 10 -r 5 iterate_df(df)

## Using `iterrows`

In [None]:
def iterate_df_rows(df):
    """Compute ``a / b`` for every row by looping over ``DataFrame.iterrows``.

    Still an explicit Python-level loop, but the rows come from the ``iterrows``
    iterator rather than from one positional lookup per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'a' and 'b'.

    Returns
    -------
    pandas.Series
        The per-row quotients, named "div_result" (the same name ``iterate_df``
        uses) and carrying ``df``'s index.
    """

    result = []

    # iterrows yields (index label, row Series); the label is not needed here.
    for _, row in df.iterrows():
        result.append(row['a']/row['b'])
    return pd.Series(result, name="div_result", index=df.index)

In [None]:
# Time the iterrows-based loop with the same settings as the naive loop (10 loops, 5 runs).
%timeit -n 10 -r 5 iterate_df_rows(df)

## Using `apply`

In [None]:
# axis=1 hands each row to the lambda as a Series, so the division still runs once per row.
# Timed with 20 loops per run, 5 runs.
%timeit -n 20 -r 5 df.apply(lambda x: x['a']/x['b'], axis=1)

## Using vectorization

In [None]:
# Whole-column division: a single Series / Series operation instead of a per-row loop.
%timeit -n 20 -r 5 df['a']/df['b']

In [None]:
# Show the vectorized result: a Series that keeps df's letter index.
df['a']/df['b']

In [None]:
# Same division on the underlying NumPy arrays (`.values`), skipping the Series wrapper.
%timeit -n 20 -r 5 df['a'].values/df['b'].values

In [None]:
# Show the array-level result: a plain ndarray, so the letter index is not carried along.
df['a'].values/df['b'].values

# Memory

In [None]:
# Read the train and test CSVs with identical options (PassengerId as the index),
# then stack them into one frame; sort=False leaves the column order unsorted.
titanic_train, titanic_test = (
    pd.read_csv(csv_name, index_col="PassengerId")
    for csv_name in ("train.csv", "test.csv")
)
titanic = pd.concat([titanic_train, titanic_test], sort=False)

titanic.head()

In [None]:
# Inspect the dtypes pandas inferred on its own (no dtype hints were passed to read_csv).
titanic.dtypes

In [None]:
# Baseline memory footprint; memory_usage="deep" also measures the contents of object columns.
titanic.info(memory_usage="deep")

In [None]:
# Ask pandas for the smallest unsigned integer dtype that can hold the Pclass values.
# NOTE: this overwrites the column on the shared `titanic` frame, so the dtype/info
# cells above show stale output once this cell has run.
titanic["Pclass"] = pd.to_numeric(titanic["Pclass"], downcast="unsigned")

In [None]:
# Re-check dtypes: Pclass should now report a smaller unsigned integer type.
titanic.dtypes

In [None]:
# Memory footprint after downcasting Pclass; compare with the baseline figure.
titanic.info(memory_usage="deep")

In [None]:
# Apply the same unsigned downcast to the two remaining integer columns.
# Like the Pclass cell, this mutates the shared `titanic` frame in place.
titanic["SibSp"] = pd.to_numeric(titanic["SibSp"], downcast="unsigned")
titanic["Parch"] = pd.to_numeric(titanic["Parch"], downcast="unsigned")

In [None]:
# Memory footprint after downcasting Pclass, SibSp and Parch.
titanic.info(memory_usage="deep")

## How to read it with correct `dtype`s right away?

In [None]:
def efficient_read_df(filepath):
    """Read a Titanic-style CSV with compact dtypes instead of pandas' defaults.

    Parameters
    ----------
    filepath : str, path object or file-like
        Anything ``pandas.read_csv`` accepts. The data must contain a
        ``PassengerId`` column, which becomes the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``PassengerId`` in which
        * ``Pclass``, ``SibSp``, ``Parch`` are parsed as ``uint8``;
        * ``Survived``, ``Age``, ``Fare`` are parsed as ``float32``;
        * ``Sex`` is a boolean that is True for ``"female"``;
        * ``Embarked`` is a ``uint8`` code: 0 for ``"S"``, 1 for ``"C"``, 2 otherwise.
    """
    # NOTE(review): every value other than "S"/"C" maps to 2, which includes an
    # empty field -- missing ports become indistinguishable from the remaining
    # port. Confirm this is intended before relying on code 2.
    embarked_codes = {"S": 0, "C": 1}

    frame = pd.read_csv(
        filepath,
        index_col="PassengerId",
        dtype={"Pclass": np.uint8,
               "SibSp": np.uint8,
               "Parch": np.uint8,
               "Survived": np.float32,
               "Age": np.float32,
               "Fare": np.float32},
        converters={"Sex": lambda x: (x == "female"),  # boolean
                    "Embarked": lambda x: embarked_codes.get(x, 2)},
    )

    # A column cannot take both a converter and a dtype, and the converter's
    # Python ints are stored as int64. Shrink the codes (0..2) to one byte each.
    if "Embarked" in frame.columns:
        frame["Embarked"] = frame["Embarked"].astype(np.uint8)
    return frame

In [None]:
# Re-read both CSVs through efficient_read_df, stack them, and report the
# deep memory footprint of the combined frame.
titanic_train_trunc, titanic_test_trunc = map(efficient_read_df, ("train.csv", "test.csv"))
titanic_trunc = pd.concat([titanic_train_trunc, titanic_test_trunc], sort=False)
titanic_trunc.info(memory_usage="deep")