# Profiling

In [None]:
%time print('Hello, world.')

In [None]:
def count(n=100_000_000):
    """Run an empty loop ``n`` times, then print ``'Done.'``.

    The loop body does nothing, so the cost of a call is the loop
    overhead itself.

    Args:
        n: Number of iterations. Defaults to 100_000_000, the value that
            used to be hard-coded, so ``count()`` behaves as before.
    """
    for _ in range(n):
        pass
    print('Done.')

%time count()
%timeit count()

# tqdm

In [None]:
import time
from tqdm import tnrange, tqdm_notebook as tqdm

def sleep():
    """Sleep one second at a time for ten steps behind a tqdm bar, then print 'Done.'."""
    steps = range(10)
    for _ in tqdm(steps):
        time.sleep(1)
    print('Done.')
sleep()

# Some data

In [None]:
# pandas is imported here because this is the first cell that uses `pd`;
# without it, running the cells top to bottom fails with a NameError.
import pandas as pd

pd.read_csv('insurance.csv').head()

In [None]:
pd.read_csv('creditcard_short.csv').head()

In [None]:
import random
import string

import pandas as pd

def gen_id():
    """Return a random 4-character id made of uppercase ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=4)
    return ''.join(picked)

def gen_df(n=1_000_000):
    """Build a one-column DataFrame holding ``n`` ids produced by ``gen_id``.

    Args:
        n: Number of rows to generate.

    Returns:
        A DataFrame whose only column is named 'id'.
    """
    ids = [gen_id() for _ in range(n)]
    frame = pd.DataFrame(ids, columns=['id'])
    return frame

df = gen_df()
df.head()

# dtypes https://docs.scipy.org/doc/numpy/user/basics.types.html

In [None]:
df = pd.read_csv('insurance.csv')
df.info(verbose=False, memory_usage='deep')
df.head()

In [None]:
df['age'] = df['age'].astype('uint8')
df['children'] = df['children'].astype('uint8')
df.info(verbose=False, memory_usage='deep')

In [None]:
df['bmi'] = df['bmi'].astype('float32')
df['charges'] = df['charges'].astype('float32')
df.info(verbose=False, memory_usage='deep')

In [None]:
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1}).astype('uint8')
df.info(verbose=False, memory_usage='deep')

# Categoricals https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

In [None]:
df.select_dtypes(include=['object']).describe()

In [None]:
df['region'].astype('category').head()

In [None]:
for c in df.select_dtypes(include=[object]).columns:
    df[c] = df[c].astype('category')
df.info(verbose=False, memory_usage='deep')
df.dtypes

In [None]:
df['smoker'] = df['smoker'].map({0: 'no', 1: 'yes'}).astype('category')
df.info(verbose=False, memory_usage='deep')

In [None]:
df.head()

In [None]:
df.iloc[0]['smoker']

In [None]:
df['smoker'].nbytes

In [None]:
df['smoker'].map({'no': 0, 'yes': 1}).astype('uint8').nbytes

In [None]:
df['region'].dtype

In [None]:
# Work on a copy so the experiment below does not touch df['region'].
regions = df['region'].copy()
# 'east' is only registered as a category in the next cell, so this assignment
# looks intended to fail on a categorical column.
# NOTE(review): confirm that 'east' is not already a value in insurance.csv.
regions[0] = 'east'

In [None]:
regions = regions.cat.add_categories(['east'])
regions[0] = 'east'
regions.dtype

In [None]:
regions.apply(lambda s: s.upper()).head()

In [None]:
df = pd.read_csv('insurance.csv', dtype={
    'age': 'uint8',
    'sex': 'category',
    'bmi': 'float32',
    'children': 'uint8',
    'smoker': 'category',
    'region': 'category',
    'charges': 'float32'
})
df.info(verbose=False, memory_usage='deep')
df.head()

In [None]:
df = gen_df()
df.info(verbose=False, memory_usage='deep')
df['id'] = df['id'].astype('category')
df.info(verbose=False, memory_usage='deep')

In [None]:
df = pd.DataFrame(['Jul 31, 2009', '2010-01-10', None], columns=['dtm'])
df.info(verbose=False, memory_usage='deep')
df['dtm'] = pd.to_datetime(df['dtm'])
df.info(verbose=False, memory_usage='deep')

# Looping

In [None]:
cc_df = pd.read_csv('creditcard_short.csv')
cc_df.info(verbose=False, memory_usage='deep')

In [None]:
def naive(df):
    """Multiply every 'Amount' value by 100, one row at a time, in place.

    This is deliberately the slow, element-by-element approach. Rows are
    addressed by position (``iloc``) rather than by label, so the function
    also works when the index is not the default 0..n-1 range; using the
    loop counter as a ``.loc`` label raises KeyError on such frames.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' scaled by 100.
    """
    # Resolve the column's position once; it does not change inside the loop.
    amount_pos = df.columns.get_loc('Amount')
    for row_pos in range(len(df)):
        df.iloc[row_pos, amount_pos] = df.iloc[row_pos, amount_pos] * 100
    return df

df = cc_df.copy()
%time df = naive(df)
df['Amount'].sum()

In [None]:
def naive_assign(df):
    """Scale 'Amount' by 100 by reading each row by position, then assign once.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    row_count = len(df)
    scaled = [df.iloc[pos]['Amount'] * 100 for pos in range(0, row_count)]
    df['Amount'] = scaled
    return df

df = cc_df.copy()
%time df = naive_assign(df)
df['Amount'].sum()

In [None]:
def iterrows(df):
    """Scale 'Amount' by 100 by walking the rows with ``DataFrame.iterrows``.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    df['Amount'] = [record['Amount'] * 100 for _, record in df.iterrows()]
    return df
                   
df = cc_df.copy()
%time df = iterrows(df)
df['Amount'].sum()

In [None]:
def apply(df):
    """Scale 'Amount' by 100 using a row-wise ``DataFrame.apply``.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    def scale_row(row):
        return row['Amount'] * 100

    df['Amount'] = df.apply(scale_row, axis=1)
    return df

df = cc_df.copy()
%time df = apply(df)
df['Amount'].sum()

In [None]:
def apply_col(df):
    """Scale 'Amount' by 100 using ``Series.apply`` on that single column.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    def scale(amount):
        return amount * 100

    scaled = df['Amount'].apply(scale)
    df['Amount'] = scaled
    return df

df = cc_df.copy()
%time df = apply_col(df)
df['Amount'].sum()

In [None]:
def vectorize(df):
    """Scale 'Amount' by 100 with a single whole-column pandas multiplication.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    scaled = df['Amount'] * 100
    df['Amount'] = scaled
    return df
    
df = cc_df.copy()
%time df = vectorize(df)
df['Amount'].sum()

In [None]:
def vectorize_numpy(df):
    """Scale 'Amount' by 100 by multiplying the column's ``.values`` array.

    Args:
        df: DataFrame with an 'Amount' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Amount' replaced by the scaled values.
    """
    raw_amounts = df['Amount'].values
    df['Amount'] = raw_amounts * 100
    return df
    
df = cc_df.copy()
%time df = vectorize_numpy(df)
df['Amount'].sum()

# Parallelization

In [None]:
df = gen_df()
%time df['id'].str.contains('C').sum()

In [None]:
import regex
from tqdm import tqdm
tqdm.pandas()

def search(s):
    """Return True when ``regex.search`` finds the pattern in ``s``, else False."""
    found = regex.search('(?<!^C)*C(?!^C)*', s)
    return found is not None

In [None]:
%time df['id'].apply(search).sum()

## multiprocessing

In [None]:
import numpy as np

from multiprocessing import Pool as ProcessPool

def contains(df):
    """Count the rows of ``df`` whose 'id' value makes ``search`` return True."""
    hits = df['id'].apply(search)
    return hits.sum()

def parallelize():
    """Split the module-level ``df`` into 4 parts and count matches in 4 processes.

    Returns:
        The sum of the per-part results returned by ``contains``.
    """
    with ProcessPool(processes=4) as workers:
        chunks = np.array_split(df, 4)
        partial_counts = workers.map(contains, chunks)
    return sum(partial_counts)

%time parallelize()

In [None]:
from multiprocessing.dummy import Pool as ThreadPool

def contains(df):
    """Count the rows of ``df`` whose 'id' value makes ``search`` return True."""
    matched = df['id'].apply(search)
    return matched.sum()

def parallelize_threads():
    """Split the module-level ``df`` into 4 parts and count matches in 4 threads.

    Returns:
        The sum of the per-part results returned by ``contains``.
    """
    with ThreadPool(processes=4) as workers:
        chunks = np.array_split(df, 4)
        partial_counts = workers.map(contains, chunks)
    return sum(partial_counts)

%time parallelize_threads()

In [None]:
from concurrent.futures import ProcessPoolExecutor

def concurrent_futures():
    """Split the module-level ``df`` into 4 parts and count matches via a
    ``ProcessPoolExecutor`` with 4 workers.

    Returns:
        The sum of the per-part results returned by ``contains``.
    """
    with ProcessPoolExecutor(max_workers=4) as pool:
        chunks = np.array_split(df, 4)
        # Collect every result while the executor is still open.
        partial_counts = list(pool.map(contains, chunks))
    return sum(partial_counts)

%time concurrent_futures()

## pandarallel https://github.com/nalepae/pandarallel/blob/master/pandarallel/dataframe.py#L54

In [None]:
from pandarallel import pandarallel
pandarallel.initialize()

%time df['id'].parallel_apply(search).sum()

# Appendix

## Dask

In [None]:
import dask.dataframe as dd

%time dd.from_pandas(df, npartitions=4)['id'].apply(search, meta=pd.Series(dtype='bool', name='match')).sum().compute()

## Generators

In [None]:
import sys

print(sys.getsizeof(range(1000)))
print(sys.getsizeof(list(range(1000))))

In [None]:
def fib(n=1000):
    """Lazily yield the first ``n`` Fibonacci numbers, starting from 0.

    Args:
        n: How many numbers to produce.

    Yields:
        The next number of the sequence 0, 1, 1, 2, 3, 5, ...
    """
    pair = (0, 1)
    for _ in range(n):
        current, upcoming = pair
        yield current
        pair = (upcoming, current + upcoming)
fib()

In [None]:
print(sys.getsizeof(fib()))
print(sys.getsizeof(list(fib())))

## Asyncio

In [None]:
import requests

def get(url, timeout=10):
    """Send a GET request to ``url`` and discard the response.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled server can block the call (and
            the surrounding benchmark) indefinitely.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get``, including a timeout error when the server
            does not answer within ``timeout`` seconds.
    """
    requests.get(url, timeout=timeout)

%time for _ in range(20): get('https://google.com')

In [None]:
import multiprocessing

def processes():
    """Call ``get`` on 'https://google.com' 20 times using a pool of 4 processes."""
    targets = ['https://google.com'] * 20
    with multiprocessing.Pool(processes=4) as workers:
        workers.map(get, targets)

%time processes()

## Numba