In [217]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [218]:
# Google Drive share link for the dataset; gdown saves it as fines.csv (see the output below).
dataset_link = "https://drive.google.com/file/d/1_UnNvF6WrvTj0Zl-gJj0YL1xv-Riin3K/view?usp=sharing"
# In a ".../d/<id>/view?..." link the file id is the second-to-last "/"-separated segment.
file_id = dataset_link.split("/")[-2]
# Download the file by id into the current working directory.
!gdown {file_id}
# List the directory to confirm the download landed.
!ls

Downloading...
From: https://drive.google.com/uc?id=1_UnNvF6WrvTj0Zl-gJj0YL1xv-Riin3K
To: /Users/dhawkgir/ds/day05/ex05/fines.csv
100%|██████████████████████████████████████| 38.7k/38.7k [00:00<00:00, 32.5MB/s]
fines.csv           optimizations.ipynb


In [219]:
# Name of the downloaded CSV; kept in a variable because the file is re-read later.
file_name = 'fines.csv'
df = pd.read_csv(file_name)
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## iterations

In [220]:
def loop_iloc(df: pd.DataFrame) -> list:
    """Compute ``Fines / Refund * Year`` for every row via positional ``iloc`` access.

    Each of the three fields is read through its own ``df.iloc[pos]`` call,
    so the row is fetched three times per iteration.

    Args:
        df: Frame containing ``Fines``, ``Refund`` and ``Year`` columns.

    Returns:
        A list with one computed value per row, in row order.
    """
    return [
        df.iloc[pos]['Fines'] / df.iloc[pos]['Refund'] * df.iloc[pos]['Year']
        for pos in range(len(df))
    ]

In [221]:
def loop_iterrows(df: pd.DataFrame) -> list:
    """Compute ``Fines / Refund * Year`` for every row using ``DataFrame.iterrows``.

    Args:
        df: Frame containing ``Fines``, ``Refund`` and ``Year`` columns.

    Returns:
        A list with one computed value per row, in row order.
    """
    # iterrows yields (index, row) pairs; the index label is not needed here.
    return [
        row['Fines'] / row['Refund'] * row['Year']
        for _, row in df.iterrows()
    ]

In [222]:
%%timeit
# Explicit Python loop with positional iloc lookups.
df['new_column'] = loop_iloc(df)

237 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [223]:
%%timeit
# Python loop over DataFrame.iterrows().
df['new_column1'] = loop_iterrows(df)

35.7 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [224]:
%%timeit
# Row-wise apply: with axis=1 the lambda receives each row as x.
df['new_lambda'] = df.apply(lambda x: x.Fines / x.Refund * x.Year, axis=1)

19.3 ms ± 95.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [225]:
%%timeit
# Arithmetic on whole pandas Series; no Python-level loop over rows.
df['new_Series'] = df.Fines / df.Refund * df.Year

242 µs ± 1.35 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [226]:
%%timeit
# Same arithmetic on the arrays returned by .values instead of on Series objects.
df['new_values'] = df.Fines.values / df.Refund.values * df.Year.values

111 µs ± 274 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [227]:
# Display the frame to compare the columns produced by the five approaches.
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,new_column,new_column1,new_lambda,new_Series,new_values
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0,3182400.0,3182400.0,3182400.0,3182400.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0,12967500.0,12967500.0,12967500.0,12967500.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0,4166400.0,4166400.0,4166400.0,4166400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0,2015000.0,2015000.0,2015000.0,2015000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0,11479800.0,11479800.0,11479800.0,11479800.0
...,...,...,...,...,...,...,...,...,...,...,...
925,X666XX666EU,3,4521.0,Kia,Ceed,2020,3044140.0,3044140.0,3044140.0,3044140.0,3044140.0
926,X222XX531EU,2,5321.0,Kia,Rio,2021,5376870.5,5376870.5,5376870.5,5376870.5,5376870.5
927,X111XX327EU,1,6321.0,BMW,X4,2022,12781062.0,12781062.0,12781062.0,12781062.0,12781062.0
928,X333XX36EU,1,321.0,BMW,X3,2009,644889.0,644889.0,644889.0,644889.0,644889.0


## indexing

In [228]:
# Take the CarNumber at row position 42 as the key for the lookup timings below.
number = df['CarNumber'].iloc[42]

In [229]:
%%timeit
# Boolean-mask lookup: the CarNumber column is compared against `number`.
df.loc[df.CarNumber == number]

265 µs ± 932 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [230]:
# Make CarNumber the index so rows can be selected by label.
df_number = df.set_index('CarNumber')

In [231]:
%%timeit
# Label lookup through the CarNumber index.
df_number.loc[number]

126 µs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting

In [262]:
# Re-read the CSV so the frame no longer carries the columns added in the iterations section.
df = pd.read_csv(file_name)

In [263]:
# memory_usage='deep' makes pandas inspect object columns to report their real memory use.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int64  
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 196.6 KB


In [264]:
# Work on a copy so the dtype changes below do not touch `df`.
df_copy = df.copy()

In [265]:
# Downcast each numeric column to the smallest dtype pandas finds able to hold its values.
df_copy['Fines'] = pd.to_numeric(df_copy['Fines'], downcast='float')
df_copy['Refund'] = pd.to_numeric(df_copy['Refund'], downcast='integer')
df_copy['Year'] = pd.to_numeric(df_copy['Year'], downcast='integer')

In [266]:
# Re-check dtypes and memory use after downcasting the numeric columns.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    int8   
 2   Fines      930 non-null    float32
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int16  
dtypes: float32(1), int16(1), int8(1), object(3)
memory usage: 181.1 KB


## categories

In [267]:
# Store the three string columns as pandas categoricals instead of Python objects.
df_copy['CarNumber'] = df_copy['CarNumber'].astype("category")
df_copy['Make'] = df_copy['Make'].astype("category")
df_copy['Model'] = df_copy['Model'].astype("category")

In [268]:
# Re-check dtypes and memory use after the category conversion.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
dtypes: category(3), float32(1), int16(1), int8(1)
memory usage: 64.5 KB


## memory clean

In [271]:
# Remove only the name `df`: the ^...$ anchors make the regex match that exact name,
# so `df_copy` stays defined. -f skips the confirmation prompt.
%reset_selective -f ^df$

In [272]:
# The optimized copy is still available after the selective reset.
df_copy

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,X666XX666EU,3,4521.0,Kia,Ceed,2020
926,X222XX531EU,2,5321.0,Kia,Rio,2021
927,X111XX327EU,1,6321.0,BMW,X4,2022
928,X333XX36EU,1,321.0,BMW,X3,2009


In [273]:
# `df` was removed by %reset_selective above. Catch the expected NameError and print it,
# so the check stays visible without aborting a "Restart & Run All" execution.
try:
    df
except NameError as err:
    print(f"NameError: {err}")

NameError: name 'df' is not defined

In [276]:
# List the names still defined in the interactive namespace.
%who_ls

['DataFrame',
 'MultiIndex',
 'calc',
 'dataset_link',
 'df_copy',
 'file_id',
 'file_name',
 'gc',
 'i',
 'loop',
 'loop_iloc',
 'loop_iterrows',
 'number',
 'pd',
 'print_columns',
 'pydev_jupyter_vars',
 'remove_imported_pydev_package',
 'sys',
 'tmp_list']