In [None]:
import pandas as pd
import numpy as np
from math import *

### Read in the data

In [None]:
# Load the CSV (path is relative to the notebook's working directory).
# The file is decoded explicitly as cp1252 (Windows-1252) instead of pandas' default UTF-8.
df = pd.read_csv('../_data/new_york_hotels.csv', encoding='cp1252')

In [None]:
df.head()

## Benchmarking example

#### Define the normalization function

In [None]:
def normalize(df, pd_series):
    """Clip outliers to within two standard deviations of the mean, then take the log.

    Values below ``mean - 2*sd`` are collapsed onto the lower bound and values
    above ``mean + 2*sd`` onto the upper bound; every other value is kept
    unchanged. The clipped values are stored in ``df["cutoff_rate"]`` as a
    side effect (the column is fully rewritten on every call).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the ``"cutoff_rate"`` column; mutated in place.
    pd_series : pandas.Series
        Values to normalize; cast to float before use.

    Returns
    -------
    pandas.Series
        Natural log of the clipped values.
    """
    pd_series = pd_series.astype(float)

    # Find upper and lower bound for outliers
    avg = np.mean(pd_series)
    sd  = np.std(pd_series)
    lower_bound = avg - 2*sd
    upper_bound = avg + 2*sd

    # Seed the column with the unmodified values first. Without this, rows that
    # are NOT outliers are never assigned: on a fresh frame they are NaN, and on
    # a re-run they keep whatever a previous call left behind.
    df["cutoff_rate"] = pd_series

    # Collapse in the outliers
    df.loc[pd_series < lower_bound , "cutoff_rate" ] = lower_bound
    df.loc[pd_series > upper_bound , "cutoff_rate" ] = upper_bound

    # Finally, take the log
    normalized_price = np.log(df["cutoff_rate"].astype(float))

    return normalized_price

#### Timing the normalization function

In [None]:
%timeit df['high_rate_normalized'] = normalize(df, df['high_rate'])

#### Profiling the normalization function

In [None]:
# Uncomment to install the profiler into the running kernel's environment:
# %pip install line-profiler

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f normalize df['high_rate_normalized'] = normalize(df, df['high_rate'])

In [None]:
def normalize_2(df, pd_series):
    """Vectorized variant of ``normalize``: clip outliers with ``np.piecewise``, then log.

    Values below ``mean - 2*sd`` become the lower bound, values above
    ``mean + 2*sd`` become the upper bound, and everything else (including
    NaN) is passed through unchanged. The clipped values are stored in
    ``df["cutoff_rate"]`` as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the ``"cutoff_rate"`` column; mutated in place.
        The column is assigned positionally from a NumPy array, so
        ``pd_series`` must have the same length as ``df``.
    pd_series : pandas.Series
        Values to normalize; cast to float before use.

    Returns
    -------
    pandas.Series
        Natural log of the clipped values.
    """
    pd_series = pd_series.astype(float)

    # Find upper and lower bound for outliers
    avg = np.mean(pd_series)
    sd  = np.std(pd_series)
    lower_bound = avg - 2*sd
    upper_bound = avg + 2*sd

    # Collapse in the outliers.
    # Only the two outlier regions are spelled out as conditions; the third
    # entry of the function list is np.piecewise's default, applied wherever
    # no condition holds. Listing three explicit conditions instead would leave
    # NaN inputs uncovered, and piecewise fills uncovered slots with 0
    # (which then turns into -inf after the log).
    x = pd_series.values
    df["cutoff_rate"] = np.piecewise(
        x,
        [x < lower_bound, x > upper_bound],
        [lower_bound, upper_bound, lambda values: values],
    )

    # Finally, take the log
    normalized_price = np.log(df["cutoff_rate"].astype(float))

    return normalized_price

In [None]:
normalize_2(df, df['high_rate']);

In [None]:
%lprun -f normalize_2 df['high_rate_normalized'] = normalize_2(df, df['high_rate'])

## Haversine definition

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two points given in degrees.

    Each argument may be a scalar or an array-like (NumPy array, pandas
    Series); the NumPy ufuncs used here broadcast them against each other.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in miles, computed on a sphere of radius 3959.
    """
    earth_radius_mi = 3959

    # Work in radians from here on
    phi1 = np.deg2rad(lat1)
    lam1 = np.deg2rad(lon1)
    phi2 = np.deg2rad(lat2)
    lam2 = np.deg2rad(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle, then the angle itself
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return earth_radius_mi * central_angle

### Iterrows implementation of Haversine

In [None]:
%%timeit
# Haversine applied on rows via iteration.
# Baseline approach: walk the frame one row at a time with iterrows(), call
# haversine() with a fixed first point (40.671, -73.985) and the row's
# latitude/longitude, collect the results in a Python list, and assign the
# list as the new 'distance' column once the loop is done.
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

### Apply Haversine on rows

#### Timing "apply"

In [None]:
%timeit df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

#### Profiling "apply"

In [None]:
# Haversine applied on rows
%lprun -f haversine df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

### Pandas implementation of Haversine

#### Timing vectorized implementation

In [None]:
# Vectorized implementation of Haversine applied on Pandas series
%timeit df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

#### Profiling vectorized implementation

In [None]:
# Vectorized implementation profile
%lprun -f haversine haversine(40.671, -73.985, df['latitude'], df['longitude'])

### NumPy arrays implementation of Haversine

#### Timing vectorized implementation

In [None]:
# Vectorized implementation of Haversine applied on NumPy arrays
%timeit df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

In [None]:
%%timeit
# Convert the pandas Series to NumPy ndarrays.
# This cell times the .values conversion on its own, separately from the
# distance computation.
np_lat = df['latitude'].values
np_lon = df['longitude'].values

#### Profiling vectorized implementation

In [None]:
%lprun -f haversine df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

## Cythonize that loop

#### Load the cython extension

In [None]:
%load_ext cython

#### Run unaltered Haversine through Cython

In [None]:
%%cython -a

# Haversine cythonized (no other edits)
import numpy as np

# Same NumPy-based computation as the Python haversine(), compiled by Cython
# with no static C types declared on the arguments or locals.
# Arguments are two (latitude, longitude) points in degrees; the result is the
# great-circle distance between them in miles (sphere radius 3959).
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    # Convert all four coordinates from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    # Haversine of the central angle between the two points
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Central angle in radians
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

#### Timing Cythonized apply vs. plain-Python apply

In [None]:
%timeit df['distance'] = df.apply(lambda row: haversine_cy(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

In [None]:
%timeit df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

#### Redefine Haversine with data types and C libraries

In [None]:
%%cython -a
# Haversine cythonized
from libc.math cimport sin, cos, acos, asin, sqrt

# Degrees -> radians as a pure C function: typed argument AND typed return, so
# calling it never creates a Python object.
cdef double deg2rad_cy(double deg):
    # pi / 180 written out to full double precision
    return 0.017453292519943295 * deg


cpdef double haversine_cy_dtyped(double lat1, double lon1, double lat2, double lon2):
    """Great-circle distance in miles between two points given in degrees.

    All arithmetic is done on C doubles with libc math functions. C ``double``
    is what a Python float is; C ``float`` is 32-bit and would truncate the
    coordinates and every intermediate value.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance in miles, computed on a sphere of radius 3959.
    """
    cdef:
        double dlon
        double dlat
        double sin_half_dlat
        double sin_half_dlon
        double a
        double c

    # Direct C calls. Going through map() over a Python list would box every
    # value into a Python object and call the helper through the interpreter.
    lat1 = deg2rad_cy(lat1)
    lon1 = deg2rad_cy(lon1)
    lat2 = deg2rad_cy(lat2)
    lon2 = deg2rad_cy(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Squares are written as products so they stay plain C multiplications
    sin_half_dlat = sin(dlat / 2)
    sin_half_dlon = sin(dlon / 2)
    a = sin_half_dlat * sin_half_dlat + cos(lat1) * cos(lat2) * sin_half_dlon * sin_half_dlon
    c = 2 * asin(sqrt(a))
    return 3959 * c


#### Timing the Cythonized apply vs. the NumPy vectorized implementation

In [None]:
%timeit df['distance'] = df.apply(lambda row: haversine_cy_dtyped(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

In [None]:
# Vectorized implementation of Haversine applied on NumPy arrays
%timeit df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

#### Profiling the Cythonized apply vs. the NumPy vectorized implementation

In [None]:
%lprun -f haversine_cy_dtyped df.apply(lambda row: haversine_cy_dtyped(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

In [None]:
%lprun -f haversine df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)