In [37]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [38]:
df = pd.read_csv("../data/fines.csv")
df.tail()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
925,R078TX178RUS,1,2345.0,McLaren,P1,1999
926,C718MC178RUS,2,5678.0,BMW,M760i,2021
927,K361KA178RUS,3,8901.0,Lotus,Esprit,1987
928,O432AB178RUS,4,1234.0,Porsche,911 Targa,2024
929,X023HA178RUS,5,2161.0,Volvo,S90,2017


## iterations: in all the following subtasks, you need to calculate fines/refund*year for each row and create a new column with the calculated data and measure the time using the magic command %%timeit in the cell
- loop: write a function that iterates through the dataframe using for i in range(0, len(df)), iloc and append() to a list, assign the result of the function to a new column in the dataframe

In [39]:
def iterations_loop(df):
    res = []
    for i in range(len(df)):
        res.append(df.iloc[i]['Fines'] / df.iloc[i]['Refund'] * df.iloc[i]['Year'])
    df['Calculations'] = res

In [40]:
%%timeit

iterations(df)

35.1 ms ± 212 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


- do it using iterrows()

In [41]:
def iterations_iterrows(df):
    res = []
    for row in df.iterrows():
        res.append(df.iloc[1]['Fines'] / df.iloc[1]['Refund'] * df.iloc[1]['Year'])
    df['Calculations'] = res

In [42]:
%%timeit

iterations_iterrows(df)

45.7 ms ± 103 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


- do it using apply() and lambda function

In [43]:
def iterations_lambda(df):
    df['Calculations'] = df.apply(lambda x: x['Fines'] / x['Refund'] * x['Year'], axis='columns')

In [44]:
%%timeit

iterations_lambda(df)

3.5 ms ± 14.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


- do it using Series objects from the dataframe

In [45]:
def iterations_series(df):
    df['Calculations'] = df['Fines'] / df ['Refund'] * df['Year']

In [46]:
%%timeit

iterations_series(df)

60.7 μs ± 2.79 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


- do it as in the previous subtask but with the method .values

In [47]:
def iterations_values(df):
    df['Calculations'] = df['Fines'].values / df ['Refund'].values * df['Year'].values

In [48]:
%%timeit

iterations_values(df)

32.8 μs ± 1.97 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## indexing: measure the time using the magic command %%timeit in the cell
- get a row for a specific CarNumber, for example, ’O136HO197RUS’

- set the index in your dataframe with CarNumber

- again, get a row for the same CarNumber

In [49]:
%%timeit

df[df['CarNumber'] == 'O136HO197RUS']

97.4 μs ± 1.88 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [50]:
df.set_index('CarNumber', inplace=True)
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0
X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0
92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
R078TX178RUS,1,2345.0,McLaren,P1,1999,4687655.0
C718MC178RUS,2,5678.0,BMW,M760i,2021,5737619.0
K361KA178RUS,3,8901.0,Lotus,Esprit,1987,5895429.0
O432AB178RUS,4,1234.0,Porsche,911 Targa,2024,624404.0


In [51]:
%%timeit

df[df.index == 'O136HO197RUS']

77.8 μs ± 3.31 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [52]:
%%timeit

df.loc['O136HO197RUS']

24.4 μs ± 594 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting:
- run df.info(memory_usage=’deep’), pay attention to the Dtype and the memory usage

- make a copy() of your initial dataframe into another dataframe optimized

- downcast from float64 to float32 for all the columns

- downcast from int64 to the smallest numerical dtype possible

- run info(memory_usage='deep') for your new dataframe, pay attention to the Dtype and the memory usage

In [53]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X023HA178RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    int64  
 1   Fines         930 non-null    float64
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    int64  
 5   Calculations  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 214.3 KB


In [54]:
copy = df.copy()

In [55]:
fcols = copy.select_dtypes(include='float64').columns
icols = copy.select_dtypes(include='int64').columns

copy[fcols] = copy[fcols].apply(lambda x: x.astype('float32'))
copy[icols] = copy[icols].apply(pd.to_numeric, downcast='integer')

In [57]:
copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X023HA178RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    int8   
 1   Fines         930 non-null    float32
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    int16  
 5   Calculations  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 195.2 KB


## categories:
- change the object type columns to the type category

- This time, check the memory usage, it probably has a decrease of 2–3 times compared to the initial dataframe

In [58]:
obj_cols = copy.select_dtypes(include=['object']).columns

copy[obj_cols] = copy[obj_cols].astype('category')

In [60]:
copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to X023HA178RUS
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Refund        930 non-null    int8    
 1   Fines         930 non-null    float32 
 2   Make          930 non-null    category
 3   Model         919 non-null    category
 4   Year          930 non-null    int16   
 5   Calculations  930 non-null    float32 
dtypes: category(2), float32(2), int16(1), int8(1)
memory usage: 101.5 KB


## memory clean
- using %reset_selective and the library gc clean the memory of your initial dataframe only

In [61]:
%reset_selective df

gc.collect()

284