# DataFrame Loop Optimization

In [None]:
# common imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Let's start by creating a DataFrame with 100,000 review scores, similar to the review scores we discussed in the Airbnb data.

In [None]:
# seed NumPy's global random generator so the sample data is identical on every run
np.random.seed(42)

# create sample DataFrame of 100,000 review scores on the full 0-100 scale;
# randint's upper bound is exclusive, so 101 is needed for a score of 100 to occur
df = pd.DataFrame(np.random.randint(0, 101, size=100000),
                  columns=['review_score'])

df

Next, let's say that we want to add a new column called `review_stars` that uses the following mapping:

- 0 - 20: 'One Star'

- 21 - 40: 'Two Stars'

- 41 - 60: 'Three Stars'

- 61 - 80: 'Four Stars'

- 81 - 100: 'Five Stars'

*Note: We will not worry about creating categorical data types for this example.*

## Basic Loops

Let's first use a basic Python loop to perform this task and calculate the total computation time.

In [None]:
def basic_loop(df):
    """Add a 'review_stars' column to ``df`` using a plain Python ``for`` loop.

    Each row's 'review_score' is mapped to a label:
    0-20 'One Star', 21-40 'Two Stars', 41-60 'Three Stars',
    61-80 'Four Stars', and anything else 'Five Stars'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'review_score' column. It is modified in place.

    Returns
    -------
    None

    Rows are read and written by position, so the result does not depend on
    the frame's index labels.
    """
    # create an empty column to use in the loop
    df['review_stars'] = ''

    # look up both column positions once so the loop can use scalar .iat access
    score_col = df.columns.get_loc('review_score')
    stars_col = df.columns.get_loc('review_stars')

    # loop through review_scores, creating new review_stars column
    for position in range(len(df)):
        score = df.iat[position, score_col]

        if score >= 0 and score <= 20:
            stars = 'One Star'
        elif score >= 21 and score <= 40:
            stars = 'Two Stars'
        elif score >= 41 and score <= 60:
            stars = 'Three Stars'
        elif score >= 61 and score <= 80:
            stars = 'Four Stars'
        else:
            stars = 'Five Stars'

        # write straight into the frame in one indexing step; a chained
        # df['review_stars'].iloc[...] = ... assigns to an intermediate object
        # and is not guaranteed to update df
        df.iat[position, stars_col] = stars

In [None]:
%%timeit
# time the plain-Python loop over the whole frame (basic_loop modifies df in place)
basic_loop(df)

In [None]:
# check DataFrame: the 'review_stars' column written by basic_loop should now be present
df

## Iterrows

From Pandas documentation:
- The pandas method `iterrows()` iterates over the DataFrame rows as (Index, Series) pairs.
- Because iterrows returns a Series for each row, it does not preserve dtypes across the rows (dtypes are preserved across columns for DataFrames)
- To preserve dtypes while iterating over the rows, it is better to use `itertuples()` which returns named tuples of the values and which is generally faster than iterrows
- You should never modify something you are iterating over

In [None]:
# Documentation example to show dtypes:
# a one-row frame with one integer-valued column and one float-valued column
df2 = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
df2

In [None]:
# check data types: info() lists every column together with its dtype
df2.info()

In [None]:
# take the first (index, Series) pair produced by iterrows() and keep only the Series
row = next(df2.iterrows())[1]
row

In [None]:
# check data types again: compare the dtype of the 'int' value taken from the
# iterrows() row with the dtype of the 'int' column in the original frame
print(row['int'].dtype)
print(df2['int'].dtype)

In [None]:
# Preview the first five (index, Series) pairs produced by iterrows().
for count, (i, s) in enumerate(df.iterrows(), start=1):
    print(i)
    print('----')
    # .iloc[0] is an explicit positional lookup of the row's first value;
    # plain s[0] on a string-labelled Series is deprecated in recent pandas
    print(s.iloc[0])
    print('--------')
    if count == 5:
        break

In [None]:
# create function to work with iterrows()
def iterrows_loop(i, s):
    """Return the star label for one ``(index, Series)`` pair from ``iterrows()``.

    Parameters
    ----------
    i : index label of the row (accepted for symmetry with iterrows(); unused).
    s : pandas.Series holding the row; the score is read from its first position.

    Returns
    -------
    str
        'One Star' (0-20), 'Two Stars' (21-40), 'Three Stars' (41-60),
        'Four Stars' (61-80), otherwise 'Five Stars'.
    """
    # explicit positional access: s[0] on a label-indexed Series is deprecated
    # in recent pandas and would look up the *label* 0 on an integer-labelled row
    score = s.iloc[0]

    if score >= 0 and score <= 20:
        return 'One Star'
    elif score >= 21 and score <= 40:
        return 'Two Stars'
    elif score >= 41 and score <= 60:
        return 'Three Stars'
    elif score >= 61 and score <= 80:
        return 'Four Stars'
    else:
        return 'Five Stars'

In [None]:
%%timeit

# classify every (index, Series) pair from iterrows() and
# store the resulting labels as a new column
df['review_stars2'] = [iterrows_loop(idx, row) for idx, row in df.iterrows()]

## Using Apply()

In [None]:
# create function to work with apply
def buckets(x):
    """Map a single review score to its star label.

    Inclusive ranges: 0-20 'One Star', 21-40 'Two Stars', 41-60 'Three Stars',
    61-80 'Four Stars'. Any value outside those ranges yields 'Five Stars'.
    """
    ranges = (
        (0, 20, 'One Star'),
        (21, 40, 'Two Stars'),
        (41, 60, 'Three Stars'),
        (61, 80, 'Four Stars'),
    )
    for low, high, label in ranges:
        if x >= low and x <= high:
            return label
    return 'Five Stars'

In [None]:
%%timeit
# axis=1 makes apply() call the lambda once per row; the lambda forwards only
# that row's 'review_score' value to buckets()
df['review_stars3'] = df.apply(lambda row: buckets(row['review_score']), axis=1)

## Itertuples()

Iterate over DataFrame rows as named tuples.

In [None]:
# Preview the value at position 1 of the first five tuples from itertuples()
# (use print(row) instead to see the whole named tuple).
for count, row in enumerate(df.itertuples(), start=1):
    print(row[1])
    print('----')
    if count == 5:
        break

In [None]:
# create function to work with itertuples()
def itertuples_loop(r):
    """Return the star label for one row tuple.

    The score is read from position 1 of ``r``. Inclusive ranges:
    0-20 'One Star', 21-40 'Two Stars', 41-60 'Three Stars',
    61-80 'Four Stars'; anything else is 'Five Stars'.
    """
    score = r[1]

    if score >= 0 and score <= 20:
        return 'One Star'
    if score >= 21 and score <= 40:
        return 'Two Stars'
    if score >= 41 and score <= 60:
        return 'Three Stars'
    if score >= 61 and score <= 80:
        return 'Four Stars'
    return 'Five Stars'

In [None]:
%%timeit

# run itertuples_loop over every row tuple and
# attach the collected labels as a new column
df['review_stars4'] = list(map(itertuples_loop, df.itertuples()))

## np.where()

In [None]:
%%timeit

# nested np.where: each level tests one score range and falls through to the
# next level when its condition is False; the innermost fallback is 'Five Stars'
scores = df['review_score']
df['review_stars5'] = np.where(
    (scores >= 0) & (scores <= 20), 'One Star',
    np.where(
        (scores >= 21) & (scores <= 40), 'Two Stars',
        np.where(
            (scores >= 41) & (scores <= 60), 'Three Stars',
            np.where(
                (scores >= 61) & (scores <= 80), 'Four Stars',
                'Five Stars',
            ),
        ),
    ),
)

## Pandas Vectorization

For background on what "vectorization" means, see: https://stackoverflow.com/questions/1422149/what-is-vectorization

In [None]:
def pd_vector(score, frame=None):
    """Label star ratings with vectorized boolean-mask assignments.

    Parameters
    ----------
    score : pandas.Series
        Review scores, one per row of the target frame.
    frame : pandas.DataFrame, optional
        Frame that receives the labels. Defaults to the notebook-level ``df``
        so existing calls of the form ``pd_vector(df['review_score'])`` keep
        working.

    Returns
    -------
    None
        The labels are written in place to the 'review_stars6' column, named
        to match the other ``review_stars*`` columns in this notebook.
        Scores below 0 match none of the masks and are not assigned a label.
    """
    target = df if frame is None else frame

    target.loc[(score >= 0) & (score <= 20), 'review_stars6'] = 'One Star'
    target.loc[(score >= 21) & (score <= 40), 'review_stars6'] = 'Two Stars'
    target.loc[(score >= 41) & (score <= 60), 'review_stars6'] = 'Three Stars'
    target.loc[(score >= 61) & (score <= 80), 'review_stars6'] = 'Four Stars'
    target.loc[(score >= 81), 'review_stars6'] = 'Five Stars'

In [None]:
%%timeit
# pass the review_score column to pd_vector as a pandas Series
pd_vector(df['review_score'])

## Numpy Vectorization

In [None]:
def pd_vector_array(score, frame=None):
    """Label star ratings with boolean-mask assignments built from a NumPy array.

    Parameters
    ----------
    score : numpy.ndarray
        Review scores, one per row of the target frame and in row order
        (for example ``df['review_score'].values``).
    frame : pandas.DataFrame, optional
        Frame that receives the labels. Defaults to the notebook-level ``df``
        so existing single-argument calls keep working.

    Returns
    -------
    None
        The labels are written in place to the 'review_stars7' column, named
        to match the other ``review_stars*`` columns in this notebook.
        Scores below 0 match none of the masks and are not assigned a label.
    """
    target = df if frame is None else frame

    target.loc[(score >= 0) & (score <= 20), 'review_stars7'] = 'One Star'
    target.loc[(score >= 21) & (score <= 40), 'review_stars7'] = 'Two Stars'
    target.loc[(score >= 41) & (score <= 60), 'review_stars7'] = 'Three Stars'
    target.loc[(score >= 61) & (score <= 80), 'review_stars7'] = 'Four Stars'
    target.loc[(score >= 81), 'review_stars7'] = 'Five Stars'

In [None]:
%%timeit
# .values hands the scores over as a NumPy array instead of a pandas Series
pd_vector_array(df['review_score'].values)

## Binning 

In [None]:
%%timeit
# pd.cut bins are open on the left by default, i.e. (0, 20], (20, 40], ...;
# include_lowest=True closes the first bin so a score of exactly 0 is labelled
# 'One Star' instead of becoming NaN
df['review_stars8'] = pd.cut(x=df['review_score'],
                             bins=[0, 20, 40, 60, 80, 100],
                             labels=['One Star', 'Two Stars', 'Three Stars', 'Four Stars', 'Five Stars'],
                             include_lowest=True)

## View Final DataFrame

In [None]:
# display the frame, including the label columns added in the sections above
df