# Ex 05 Pandas optimization

In [1]:
import pandas as pd

## load csv

In [47]:
data = pd.read_csv('data/fines.csv')
data

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,Y200B9122RUS,2,5400.5,BMW,M5,1990
926,X200B9124RUS,1,2450.5,Audi,A4,2000
927,D900B9152RUS,2,9200.5,Ford,Focus,2007
928,K220B9322RUS,1,2790.5,BMW,M5,1993


## iterations

In [24]:
def calculate_fines(fines, refund, year):
    return fines * refund / year

In [25]:
def calculate_fines_for(data):
    results = []
    for i in range (len(data)):
        fine = data.iloc[i]['Fines']
        refund = data.iloc[i]['Refund']
        year = data.iloc[i]['Year']
        result = calculate_fines(fine,refund,year)
        results.append(result)
    data['Result'] = results
    return data

In [26]:
%%timeit
calculate_fines_for(data)

70.4 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
def calculate_fines_iterrows(data):
    results = []
    for i, row in data.iterrows():
        fine = row['Fines']
        refund = row['Refund']
        year = row['Year']
        result = calculate_fines(fine,refund,year)
        results.append(result)
    data['Result'] = results
    return data

In [28]:
%%timeit
calculate_fines_iterrows(data)

24.1 ms ± 481 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%%timeit
data['Result'] = data.apply(lambda x: calculate_fines(x['Fines'], x['Refund'], x['Year']), axis = 1)

5.22 ms ± 123 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
%%timeit
data['Result'] = calculate_fines(data['Fines'], data['Refund'], data['Year'])

136 μs ± 2.11 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [31]:
%%timeit
data['Result'] = calculate_fines(data['Fines'].values, 
                                data['Refund'].values, 
                                data['Year'].values)

62.6 μs ± 1.61 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## index

In [48]:
%%timeit
data[data.CarNumber == 'X200B9124RUS']

190 μs ± 2.89 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [49]:
data = data.set_index('CarNumber')
data

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989
E432XX77RUS,1,6500.0,Toyota,Camry,1995
7184TT36RUS,1,2100.0,Ford,Focus,1984
X582HE161RUS,2,2000.0,Ford,Focus,2015
92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...
Y200B9122RUS,2,5400.5,BMW,M5,1990
X200B9124RUS,1,2450.5,Audi,A4,2000
D900B9152RUS,2,9200.5,Ford,Focus,2007
K220B9322RUS,1,2790.5,BMW,M5,1993


In [50]:
%%timeit
data[data.index == 'X200B9124RUS']

127 μs ± 2.16 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting

In [51]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to L321B9144RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  930 non-null    int64  
 1   Fines   930 non-null    float64
 2   Make    930 non-null    object 
 3   Model   919 non-null    object 
 4   Year    930 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 228.7 KB


In [52]:
optimized_df = data.copy()

In [57]:
optimized_df['Make'] = optimized_df['Make'].astype('category')
optimized_df['Model'] = optimized_df['Model'].astype('category')
optimized_df['Fines'] = optimized_df['Fines'].astype('int16')
optimized_df['Refund'] = optimized_df['Refund'].astype('float16')
optimized_df['Year'] = optimized_df['Year'].astype('int16')

In [58]:
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to L321B9144RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Refund  930 non-null    float16 
 1   Fines   930 non-null    int16   
 2   Make    930 non-null    category
 3   Model   919 non-null    category
 4   Year    930 non-null    int16   
dtypes: category(2), float16(1), int16(2)
memory usage: 103.8 KB


In [1]:
data

NameError: name 'data' is not defined