In [127]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [128]:
fines = pd.read_csv('../ex04/fines.csv')
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.000,3200.0,Ford,Focus,2018
1,E432XX77RUS,1.000,6500.0,Toyota,Camry,2008
2,7184TT36RUS,1.000,2100.0,Ford,Focus,1994
3,X582HE161RUS,2.000,2000.0,Ford,Focus,1987
4,92918M178RUS,1.000,5700.0,Ford,Focus,2000
...,...,...,...,...,...,...
925,a121345raaa,2.000,12345.0,Harry,Potter,1000
926,33XJudas,3.140,1337.0,Jesus,Christ,0
927,BigChungus42,69.000,420.0,Shrek,Swamp,2000
928,Poland,4.200,123.0,BOBR,Kurva,2007


## iterations: in all the following subtasks, you need to calculate `fines/refund*year` for each row and create a new column with the calculated data and measure the time using the magic command `%%timeit` in the cell


### loop: write a function that iterates through the dataframe using `for i in range(0, len(df))`, `iloc` and `append()` to a list, assign the result of the function to a new column in the dataframe


In [129]:
def loop_calc(df):
  calc = []
  for i in range(0, len(df)):
    calc.append(df.iloc[i]['Fines'] / df.iloc[i]
                ['Refund'] * df.iloc[i]['Year'])
  return calc

In [130]:
%%timeit 
fines['CalcLoop'] = loop_calc(fines)

79.6 ms ± 707 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### do it using `iterrows()`

In [131]:
def calc_iter(df):
  return [row['Fines'] / row['Refund'] * row['Year'] for _, row in df.iterrows()]

In [132]:
%%timeit
fines['CalcInter']  = calc_iter(fines)

27.3 ms ± 175 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### do it using `apply()` and lambda function

In [133]:
def calc_apply(df):
  return df.apply(
      lambda x:   x['Fines'] / x['Refund'] * x['Year'],
      axis=1,
  )

In [134]:
%%timeit
fines['CalcApply'] = calc_apply(fines)

6.73 ms ± 77.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### do it using `Series` objects from the dataframe

In [135]:
def calc_series(df):
  return df['Fines'] / df['Refund'] * df['Year']

In [136]:
%%timeit
fines['CalcSeries'] = calc_series(fines)

148 μs ± 2.84 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### do it as in the previous subtask but with the method `.values`

In [137]:
def calc_series_values(df):
  return df['Fines'].values / df['Refund'].values * df['Year'].values

In [138]:
%%timeit
fines['CalcSeriesValues'] = calc_series_values(fines)

70.3 μs ± 1.42 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### final df

In [139]:
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,CalcLoop,CalcInter,CalcApply,CalcSeries,CalcSeriesValues
0,Y163O8161RUS,2.000,3200.0,Ford,Focus,2018,3.228800e+06,3.228800e+06,3.228800e+06,3.228800e+06,3.228800e+06
1,E432XX77RUS,1.000,6500.0,Toyota,Camry,2008,1.305200e+07,1.305200e+07,1.305200e+07,1.305200e+07,1.305200e+07
2,7184TT36RUS,1.000,2100.0,Ford,Focus,1994,4.187400e+06,4.187400e+06,4.187400e+06,4.187400e+06,4.187400e+06
3,X582HE161RUS,2.000,2000.0,Ford,Focus,1987,1.987000e+06,1.987000e+06,1.987000e+06,1.987000e+06,1.987000e+06
4,92918M178RUS,1.000,5700.0,Ford,Focus,2000,1.140000e+07,1.140000e+07,1.140000e+07,1.140000e+07,1.140000e+07
...,...,...,...,...,...,...,...,...,...,...,...
925,a121345raaa,2.000,12345.0,Harry,Potter,1000,6.172500e+06,6.172500e+06,6.172500e+06,6.172500e+06,6.172500e+06
926,33XJudas,3.140,1337.0,Jesus,Christ,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
927,BigChungus42,69.000,420.0,Shrek,Swamp,2000,1.217391e+04,1.217391e+04,1.217391e+04,1.217391e+04,1.217391e+04
928,Poland,4.200,123.0,BOBR,Kurva,2007,5.877643e+04,5.877643e+04,5.877643e+04,5.877643e+04,5.877643e+04


## indexing: measure the time using the magic command `%%timeit` in the cell

### get a row for a specific `CarNumber`, for example, `’O136HO197RUS’`

In [140]:
%%timeit
filt = fines['CarNumber'] == 'O136HO197RUS'
fines[filt]

256 μs ± 10.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### set the index in your dataframe with CarNumber

In [141]:
fines.set_index('CarNumber', inplace=True)

### again, get a row for the same `CarNumber`

In [142]:
%%timeit
fines.loc['O136HO197RUS']

43.1 μs ± 1.11 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting:

### run `df.info(memory_usage=’deep’)`, pay attention to the `Dtype` and the memory usage

In [143]:
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to DogeCoinToTheMoon
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    float64
 1   Fines             930 non-null    float64
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int64  
 5   CalcLoop          930 non-null    float64
 6   CalcInter         930 non-null    float64
 7   CalcApply         930 non-null    float64
 8   CalcSeries        930 non-null    float64
 9   CalcSeriesValues  930 non-null    float64
dtypes: float64(7), int64(1), object(2)
memory usage: 243.4 KB


### make a `copy()` of your initial dataframe into another dataframe `optimized`

In [144]:
optimized = fines.copy()
optimized

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,CalcLoop,CalcInter,CalcApply,CalcSeries,CalcSeriesValues
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Y163O8161RUS,2.000,3200.0,Ford,Focus,2018,3.228800e+06,3.228800e+06,3.228800e+06,3.228800e+06,3.228800e+06
E432XX77RUS,1.000,6500.0,Toyota,Camry,2008,1.305200e+07,1.305200e+07,1.305200e+07,1.305200e+07,1.305200e+07
7184TT36RUS,1.000,2100.0,Ford,Focus,1994,4.187400e+06,4.187400e+06,4.187400e+06,4.187400e+06,4.187400e+06
X582HE161RUS,2.000,2000.0,Ford,Focus,1987,1.987000e+06,1.987000e+06,1.987000e+06,1.987000e+06,1.987000e+06
92918M178RUS,1.000,5700.0,Ford,Focus,2000,1.140000e+07,1.140000e+07,1.140000e+07,1.140000e+07,1.140000e+07
...,...,...,...,...,...,...,...,...,...,...
a121345raaa,2.000,12345.0,Harry,Potter,1000,6.172500e+06,6.172500e+06,6.172500e+06,6.172500e+06,6.172500e+06
33XJudas,3.140,1337.0,Jesus,Christ,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
BigChungus42,69.000,420.0,Shrek,Swamp,2000,1.217391e+04,1.217391e+04,1.217391e+04,1.217391e+04,1.217391e+04
Poland,4.200,123.0,BOBR,Kurva,2007,5.877643e+04,5.877643e+04,5.877643e+04,5.877643e+04,5.877643e+04


### downcast from `float64` to `float32` for all the columns

In [145]:
optimized = optimized.apply(lambda x:
                            pd.to_numeric(
                                x,
                                downcast='float',

                            ) if pd.api.types.is_float_dtype(x) else x,
                            )

### downcast from `int64` to the smallest numerical dtype possible

In [146]:
optimized = optimized.apply(lambda x:
                            pd.to_numeric(
                                x,
                                downcast='integer',

                            ) if pd.api.types.is_integer_dtype(x) else x,
                            )

### run `info(memory_usage='deep')` for your new dataframe, pay attention to the Dtype and the memory usage

In [147]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to DogeCoinToTheMoon
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    float32
 1   Fines             930 non-null    float32
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int16  
 5   CalcLoop          930 non-null    float64
 6   CalcInter         930 non-null    float64
 7   CalcApply         930 non-null    float64
 8   CalcSeries        930 non-null    float64
 9   CalcSeriesValues  930 non-null    float64
dtypes: float32(2), float64(5), int16(1), object(2)
memory usage: 230.7 KB


## categories:

### change the `object` type columns to the type `category`!

In [148]:
optimized = optimized.astype(
    {
        col: 'category'
        for col
        in optimized.select_dtypes(include='object').columns
    }
)

### This time, check the memory usage, it probably has a decrease of 2–3 times compared to the initial dataframe

In [149]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to DogeCoinToTheMoon
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Refund            930 non-null    float32 
 1   Fines             930 non-null    float32 
 2   Make              930 non-null    category
 3   Model             919 non-null    category
 4   Year              930 non-null    int16   
 5   CalcLoop          930 non-null    float64 
 6   CalcInter         930 non-null    float64 
 7   CalcApply         930 non-null    float64 
 8   CalcSeries        930 non-null    float64 
 9   CalcSeriesValues  930 non-null    float64 
dtypes: category(2), float32(2), float64(5), int16(1)
memory usage: 137.0 KB


## memory clean

### using `%reset_selective` and the library `gc` clean the memory of your initial dataframe only

In [150]:
%reset_selective fines

In [151]:
fines

NameError: name 'fines' is not defined