This notebook presents my interactive notes about pandas, nicely formatted, organised and translated into English.

In [1]:
import os

In [2]:
# List the entries of the current working directory to see what is available.
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Exploratory Data Analysis.ipynb',
 'kaggle',
 'Pandas.ipynb',
 'Pandas2.ipynb',
 'README.md',
 'StrataScratch Interview Questions.ipynb']

In [3]:
# List the entries of the 'kaggle' data directory.
os.listdir('kaggle')

['olist_customers_dataset.csv',
 'olist_geolocation_dataset.csv',
 'olist_orders_dataset.csv',
 'olist_order_items_dataset.csv',
 'olist_order_payments_dataset.csv',
 'olist_order_reviews_dataset.csv',
 'olist_products_dataset.csv',
 'olist_sellers_dataset.csv',
 'product_category_name_translation.csv']

In [13]:
import pandas as pd
import gc

In [47]:
# Load the Olist products table. The path is built with os.path.join so it works on
# every OS; the previous 'kaggle\olist_...' literal only resolved on Windows and
# relied on '\o' not being a recognised string escape sequence.
products = pd.read_csv(os.path.join('kaggle', 'olist_products_dataset.csv'))

## Overall tips

Using inplace=True is no faster than the regular reassignment df = df.(...).

inplace=False should be used for live debugging, to see what changes our code made to the data. (If the data size is significant, all code should be developed and tested on smaller samples.)

In [66]:
# .copy() returns an independent frame. A plain assignment would only create a second
# name for the same object, so a change made through one name would show up in the other.
indexed = products.copy(deep=True)
indexed.head(n=3)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0


In [67]:
# Derive the indexed frame straight from `products` so the cell can be re-run safely
# (applying set_index to a frame that is already indexed by product_id raises KeyError),
# and keep every row: a trailing .head(3) in the assignment would truncate `indexed`
# to three rows for all later cells.
indexed = products.set_index(['product_id'])
# In-place alternative (no faster, and not re-runnable):
# indexed.set_index(['product_id'], inplace=True)
indexed.head(3)

Unnamed: 0_level_0,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0


## .loc .iloc

It is better to use .loc[] or .iloc[] than plain []. In Python it is better to be explicit and not leave room for assumptions.

.loc[] is for labels (both index labels and column labels), .iloc[] is for integer positions.

In [71]:
# General form: .loc[row_selector, column_selector] and .iloc[row_selector, column_selector]

In [62]:
# Preview the first three rows of the frame that still has its default integer index.
products.head(3)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0


In [68]:
products.loc[0:10, :].head(3)   # works because the default index labels are integers; note that .loc slices include the end label

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0


In [70]:
# indexed.loc[0:10, :].head(3)  # doesn't work: the index labels are now product_id values, not integers
indexed.iloc[0:10, :].head(3)  # works because .iloc selects by integer position, not by index label

Unnamed: 0_level_0,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0


In [75]:
# Selects all rows and every column except the last one.
# Useful when the last column is the target and we want to split the data into X and y.
products.iloc[:, :-1].shape

(32951, 8)

In [86]:
# The selectors accept many things: labels, slices, boolean masks, or even a callable
# (e.g. a lambda) that returns a Series/DataFrame describing the rows/columns to select.
index_series = products['product_weight_g'].lt(250)
products.loc[index_series, ['product_id', 'product_weight_g']].head()

Unnamed: 0,product_id,product_weight_g
0,1e9e8ef04dbcff4541ed26657ea517e5,225.0
2,96bd76ec8810374ed1b65e291975717f,154.0
5,41d3672d4792049fa1779bb35283ed13,200.0
14,eb31436580a610f202c859463d8c7415,200.0
22,e3e020af31d4d89d2602272b315c3f6e,75.0


In [87]:
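# .ix (deprecated in pandas 0.20, removed in 1.0) allowed selecting by label or by position;
# on an integer index its slices were label-based and therefore inclusive of the end point,
# in contrast to .iloc. Use .loc / .iloc instead.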

## Multiindex

In [93]:
# Summary statistics of the weight column; the quartiles guide the category thresholds.
products['product_weight_g'].describe()

count    32949.000000
mean      2276.472488
std       4282.038731
min          0.000000
25%        300.000000
50%        700.000000
75%       1900.000000
max      40425.000000
Name: product_weight_g, dtype: float64

In [94]:
def categorise_weight(row, light_max=500, medium_max=1900):
    """Label a product row as 'light', 'medium' or 'heavy' by its weight.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the weight under the key 'product_weight_g'.
    light_max : number, default 500
        Largest weight (inclusive) still labelled 'light'.
    medium_max : number, default 1900
        Largest weight (inclusive) still labelled 'medium'.

    Returns
    -------
    str or None
        'light', 'medium' or 'heavy'; None when the weight is NaN, because
        every comparison with NaN is False.
    """
    weight = row['product_weight_g']
    if weight <= light_max:
        return 'light'
    if weight <= medium_max:
        return 'medium'
    # Explicit comparison (rather than a bare fallback) so NaN is not labelled 'heavy'.
    if weight > medium_max:
        return 'heavy'
    return None

In [98]:
# Label every row with its weight class; the function is passed directly,
# since wrapping it in a lambda adds nothing.
products['weight_category'] = products.apply(categorise_weight, axis=1)

In [110]:
# Build a two-level index (category, weight class) and sort the rows by it.
temp = (
    products
    .set_index(keys=['product_category_name', 'weight_category'])
    .sort_index()
)
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
product_category_name,weight_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
agro_industria_e_comercio,heavy,07f01b6fcacc1b187a71e5074199db2d,39.0,430.0,1.0,16400.0,63.0,66.0,56.0
agro_industria_e_comercio,heavy,51d1db0b0ed8fee9994a5bae549a6160,41.0,288.0,1.0,5047.0,58.0,44.0,28.0
agro_industria_e_comercio,heavy,0b2a1288e8ba64c797e7586c8df75602,38.0,397.0,1.0,3450.0,33.0,36.0,33.0
agro_industria_e_comercio,heavy,3e3f442db862cb6fe99389a41b7acb84,60.0,638.0,6.0,12800.0,67.0,57.0,30.0
agro_industria_e_comercio,heavy,423a6644f0aa529e8828ff1f91003690,54.0,2352.0,3.0,5800.0,30.0,30.0,20.0


An additional index level adds another dimension to the data. A Series with a two-level index can be thought of as a 2-D DataFrame.

To select rows from a MultiIndex using .loc[], we pass the level values as a tuple.

In [116]:
# TODO: read about the warning shown by the MultiIndex .loc lookup
# (presumably pandas' PerformanceWarning about indexing past lexsort depth; confirm).
# Check whether the index values are in non-decreasing order.
temp.index.is_monotonic_increasing

False

In [111]:
# Partial selection: a bare label matches the first index level and keeps any value of
# the second. Being not the last expression of the cell, its result is not displayed.
temp.loc['perfumaria', :]
# Full selection: a tuple supplies one label per index level
# ('perfumaria' on the first level and 'medium' on the second).
temp.loc[('perfumaria', 'medium'), :]


  temp.loc[('perfumaria', 'medium'), :]  # perfumeria from first index and medium from second


Unnamed: 0_level_0,Unnamed: 1_level_0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
product_category_name,weight_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
perfumaria,medium,828fe032935d7c1901682e5b6cc076c2,48.0,1063.0,2.0,600.0,20.0,20.0,20.0
perfumaria,medium,dc582e9ac5036846acfeeb3093b17aa7,50.0,749.0,1.0,1300.0,22.0,14.0,14.0
perfumaria,medium,3488d2ce36e718097c1509444289ef7f,40.0,1176.0,1.0,514.0,18.0,12.0,15.0
perfumaria,medium,ff7ac89ca5b77d0fb5f8a65262d73956,46.0,483.0,1.0,1000.0,20.0,14.0,13.0
perfumaria,medium,c51dc74a8b5018e82e9e44ef4b647227,60.0,418.0,1.0,650.0,16.0,10.0,11.0
perfumaria,...,...,...,...,...,...,...,...,...
perfumaria,medium,278f4a3ec9c377c2ed98f3256834690c,23.0,386.0,1.0,550.0,20.0,16.0,16.0
perfumaria,medium,6186a25dd2b885087bbf0863e5245d46,30.0,253.0,1.0,900.0,35.0,5.0,27.0
perfumaria,medium,8b13342c7e5cfac764027523312cdf74,41.0,225.0,1.0,900.0,16.0,16.0,11.0
perfumaria,medium,33bfc11487d18d2405f84661bc164f0f,39.0,387.0,1.0,545.0,21.0,14.0,17.0


##