In [55]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [67]:
# Load the fines table that the previous exercise saved to disk.
df = pd.read_csv(filepath_or_buffer='data/fines.csv')

## calculate Fines / Refund * Year for each row, store the result in a new column, and measure the time of each approach with the %%timeit cell magic

In [47]:
%%timeit
new_column = []
for i in range(len(df)):
    row = df.iloc[i]
    new_column.append(row['Fines'] / row['Refund'] * row['Year'])
df['Fines/Refund*Year'] = new_column

134 ms ± 3.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [48]:
%%timeit
new_column = []
for index, row in df.iterrows():
    new_column.append(row['Fines'] / row['Refund'] * row['Year'])
df['Fines/Refund*Year'] = new_column

80.8 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
%%timeit
df['Fines/Refund*Year'] = df.apply(lambda x: x['Fines'] / x['Refund'] * x['Year'], axis=1)

15.6 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [50]:
%%timeit
df['Fines/Refund*Year'] = df['Fines'] / df['Refund'] * df['Year']

464 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [51]:
%%timeit
df['Fines/Refund*Year'] = df['Fines'].values / df['Refund'].values * df['Year'].values

185 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## indexing: measure the time using the magic command %%timeit in the cell

In [75]:
%%timeit
df.loc[df['CarNumber'] =='O136HO197RUS']

KeyError: 'CarNumber'

In [73]:
%%timeit
# Time (re)building the CarNumber index. Both calls mutate df in place, so df
# remains indexed by CarNumber after this cell finishes.
# reset_index first moves the current index back into the columns, which is what
# lets set_index('CarNumber') succeed again on every timeit repetition.
df.reset_index(inplace=True)
df.set_index('CarNumber', inplace=True)

835 µs ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [74]:
%%time
df.loc[df.index =='O136HO197RUS']

Wall time: 1e+03 µs


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
O136HO197RUS,2,7800.0,Toyota,Corolla,1989


## downcasting

In [77]:
# Memory footprint before any optimisation; 'deep' asks pandas to inspect the
# actual contents of object columns instead of estimating their size.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to C410X938RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  925 non-null    int64  
 1   Fines   925 non-null    float64
 2   Make    925 non-null    object 
 3   Model   914 non-null    object 
 4   Year    925 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 195.4 KB


In [78]:
# Work on an independent copy so the dtype changes never touch df itself.
optimized = df.copy(deep=True)

In [80]:
# Downcast each numeric column to the smallest float/integer dtype that can
# still represent its values; column order is unchanged.
optimized = optimized.assign(
    Fines=pd.to_numeric(optimized['Fines'], downcast='float'),
    Refund=pd.to_numeric(optimized['Refund'], downcast='integer'),
    Year=pd.to_numeric(optimized['Year'], downcast='integer'),
)

In [81]:
# Memory footprint of the copy after the numeric columns were downcast.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to C410X938RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  925 non-null    int8   
 1   Fines   925 non-null    float32
 2   Make    925 non-null    object 
 3   Model   914 non-null    object 
 4   Year    925 non-null    int16  
dtypes: float32(1), int16(1), int8(1), object(2)
memory usage: 180.0 KB


## categories

In [82]:
# Store the repeated Make/Model strings as categoricals.
optimized = optimized.astype({'Make': 'category', 'Model': 'category'})

In [83]:
# Memory footprint of the copy after Make and Model were converted to category dtype.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to C410X938RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Refund  925 non-null    int8    
 1   Fines   925 non-null    float32 
 2   Make    925 non-null    category
 3   Model   914 non-null    category
 4   Year    925 non-null    int16   
dtypes: category(2), float32(1), int16(1), int8(1)
memory usage: 71.9 KB


## memory clean

In [85]:
%reset_selective -f df