In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# Load seaborn's "diamonds" example dataset; every later cell works on this DataFrame.
df = sns.load_dataset("diamonds")

### Explore data

In [20]:
# Preview the first rows.
# NOTE: the saved output includes `price_cat_loop`, a column that is only created by the
# for-loop benchmark further down, so this cell was last executed after that benchmark.
# On a fresh Restart & Run All only the original dataset columns will appear here.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_cat_loop
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,cheap
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap


In [6]:
# Column dtypes, non-null counts, index type and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [9]:
# Summary of the non-numeric columns only: count, number of unique values, most frequent
# value and its frequency.
df.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


### Task
- Create a new column that categorises each diamond by price
    - Categories: cheap, pricey, expensive, super_expensive
    - Ranges (lower bound inclusive): <500, 500 to <2500, 2500 to <10000, >=10000

### Define function

In [11]:
def classify_by_price(price):
    """Return the price-band label for a single price.

    Bands (lower bound inclusive, upper bound exclusive):
        price < 500            -> "cheap"
        500 <= price < 2500    -> "pricey"
        2500 <= price < 10000  -> "expensive"
        anything else          -> "super_expensive"

    Args:
        price: A single numeric price.

    Returns:
        One of "cheap", "pricey", "expensive" or "super_expensive".
    """
    # Thresholds are tested in ascending order, so each check only needs the
    # upper bound: the lower bound is implied by the previous check failing.
    if price < 500:
        return "cheap"
    if price < 2500:
        return "pricey"
    if price < 10000:
        return "expensive"
    # Reached for prices of 10000 and above, and for any value that compares
    # False against every threshold (e.g. NaN).
    return "super_expensive"

### Establish a baseline

#### Pure Python for loop
- **Runtime = 7170 ms**

In [15]:
%%timeit
# Baseline: walk the frame by integer position and classify one price at a time.
class_list_loop = []
for i in range(len(df)):
    # df.iloc[i] materialises the whole row as a Series before "price" is looked up in it.
    price = df.iloc[i]["price"]
    class_list_loop.append(classify_by_price(price))

# Store the labels (collected in row order) as a new column on df.
df["price_cat_loop"] = class_list_loop

7.17 s ± 267 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Pure Python for loop, replacing .iloc with .at
- **Runtime = 252 ms**

In [24]:
%%timeit
# Same row-by-row loop as the baseline, but each price is read as a single scalar via .at.
class_list_loop_at = []
for i in range(len(df)):
    # .at is label-based: the loop counter i is used as the row label, which matches the
    # row position here because df has the default RangeIndex (0..len(df)-1, see df.info()).
    price = df.at[i, "price"]
    class_list_loop_at.append(classify_by_price(price))

# Store the labels (collected in row order) as a new column on df.
df["price_cat_loop_at"] = class_list_loop_at

252 ms ± 6.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Use iterrows()
- **Runtime = 1960 ms**

In [26]:
%%timeit
# Iterate with iterrows(), which yields an (index label, row-as-Series) pair for every row.
class_list_iterrow = []
for _, row in df.iterrows():  # the index label is not needed, only the row
    price = row["price"]
    class_list_iterrow.append(classify_by_price(price))

# Store the labels (collected in row order) as a new column on df.
df["price_cat_iterrow"] = class_list_iterrow

1.96 s ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Use itertuples()
- **Runtime = 77.6 ms**

In [31]:
%%timeit
# Iterate with itertuples(), which yields one namedtuple per row; columns are read as attributes.
class_list_itertuple = []
for tup in df.itertuples():
    price = tup.price  # attribute access on the namedtuple for the "price" column
    class_list_itertuple.append(classify_by_price(price))

# Store the labels (collected in row order) as a new column on df.
df["price_cat_itertuple"] = class_list_itertuple

77.6 ms ± 3.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Use apply()
- **Runtime = 386 ms**

In [36]:
# Row-wise apply: axis=1 hands each row to the lambda, which reads "price" from it and classifies it.
%timeit df["price_cat_apply"] = df.apply(lambda row: classify_by_price(row["price"]), axis=1)

386 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Use Python list comprehension
- **Runtime = 14.2 ms**

In [39]:
# List comprehension over the "price" column only: no row objects are built, just the price values.
%timeit df["price_cat_list_comp"] = [classify_by_price(x) for x in df["price"]]

14.2 ms ± 251 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Use pandas map function
- **Runtime = 14.4 ms**

In [44]:
# Series.map calls classify_by_price on each value of the "price" column.
%timeit df["price_cat_map"] = df["price"].map(classify_by_price)

14.4 ms ± 595 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
# Spot-check: show 10 random rows so the price_cat_* columns can be compared side by side.
# No random_state is passed, so a re-run shows different rows than the saved output.
df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_cat_loop,price_cat_loop_at,price_cat_iterrow,price_cat_itertuple,price_cat_apply,price_cat_list_comp,price_cat_map
10660,1.2,Very Good,I,SI1,63.1,58.0,4838,6.72,6.66,4.22,expensive,expensive,expensive,expensive,expensive,expensive,expensive
15188,1.21,Premium,H,SI2,58.8,60.0,6098,6.99,6.95,4.1,expensive,expensive,expensive,expensive,expensive,expensive,expensive
28531,0.33,Ideal,I,IF,62.0,54.0,675,4.46,4.48,2.77,pricey,pricey,pricey,pricey,pricey,pricey,pricey
18912,1.1,Premium,F,VS1,60.9,61.0,7762,6.7,6.64,4.06,expensive,expensive,expensive,expensive,expensive,expensive,expensive
9682,1.0,Ideal,I,SI1,61.5,57.0,4649,6.39,6.43,3.94,expensive,expensive,expensive,expensive,expensive,expensive,expensive
49582,0.71,Very Good,F,SI2,59.2,60.0,2134,5.85,5.91,3.48,pricey,pricey,pricey,pricey,pricey,pricey,pricey
52325,0.7,Good,E,SI1,63.2,57.0,2496,5.61,5.63,3.55,pricey,pricey,pricey,pricey,pricey,pricey,pricey
27690,0.38,Ideal,I,VS2,60.5,56.0,648,4.71,4.74,2.85,pricey,pricey,pricey,pricey,pricey,pricey,pricey
8905,0.9,Ideal,I,VS1,60.9,60.0,4492,6.2,6.24,3.79,expensive,expensive,expensive,expensive,expensive,expensive,expensive
52255,0.7,Good,G,VS2,64.3,56.0,2488,5.51,5.58,3.57,pricey,pricey,pricey,pricey,pricey,pricey,pricey
