# **Pandas - Sorting, Filtering, Group By**

In [None]:
import pandas as pd

In [None]:
# Load the cleaned Zara dataset (path is relative to the working directory) into a DataFrame
df = pd.read_csv('zara_cleaned.csv')

In [None]:
# Preview the first 3 rows to check the columns and their values
df.head(3)

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
0,179801,2023-10-21,Front of Store,Man,Jackets,311307255-800-2,Contrasting Patches Bomber Jacket,Yes,No,169.0,2608,440752.0,2023
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023


## **Filtering Data & Sorting Data**

In [None]:
# Selecting a single column with [] returns that column as a Series.
# This is column selection: every row is kept, only the Price column is shown.
df['Price']

Unnamed: 0,Price
0,169.0
1,169.0
2,69.9
3,109.0
4,59.9
...,...
249,89.9
250,89.9
251,89.9
252,139.0


##### Filter through **Numeric Condition**

In [None]:
# Keep only the rows whose Price is strictly greater than 200
df.loc[df['Price'].gt(200)]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
50,151925,2023-03-20,End-cap,Man,Jackets,320774184-800-97,Vintage Effect Leather Bomber Jacket,Yes,Yes,299.0,2179,651521.0,2023
108,151396,2024-03-17,Front of Store,Man,Jackets,323134418-800-3,Cropped Leather Jacket,Yes,Yes,439.0,729,320031.0,2024
121,161909,2023-10-04,Aisle,Man,Jackets,315529534-705-3,Leather Jacket,No,Yes,299.0,1290,385710.0,2023


In [None]:
# Compound condition: a row is kept only when BOTH tests are true
df.loc[df['Price'].gt(100) & df['Terms'].eq('T-Shirts')]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
3,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,Padded Denim Jacket,No,Yes,109.0,707,77063.0,2023
42,172015,2023-08-14,End-cap,Man,T-Shirts,313008692-514-2,Faux Leather Bomber Jacket,Yes,Yes,139.0,2622,364458.0,2023
54,195103,2023-02-14,Aisle,Man,T-Shirts,330290387-401-2,Bouclã Textured Jacket,No,Yes,109.0,2849,310541.0,2023
75,118985,2023-05-12,Aisle,Man,T-Shirts,281593208-015-2,Patch Bomber Jacket,Yes,No,139.0,694,96466.0,2023
88,123047,2024-03-28,Aisle,Man,T-Shirts,313027279-800-2,Suit Jacket In 100% Linen,Yes,No,109.0,2985,325365.0,2024
144,144651,2024-01-08,Aisle,Man,T-Shirts,312571093-710-2,Denim Bomber Jacket,No,Yes,139.0,1791,248949.0,2024
192,121348,2023-11-21,End-cap,Man,T-Shirts,323216360-406-2,Faux Leather Boxy Fit Jacket,Yes,No,169.0,1008,170352.0,2023
238,123077,2023-02-14,Front of Store,Man,T-Shirts,320671529-802-2,Jacquard Denim Jacket,No,No,109.0,1603,174727.0,2023


In [None]:
# & binds more tightly than |, so without extra parentheses the mask is read as
# (Price > 100 AND Terms == 'T-Shirts') OR Terms == 'Jeans'.
# The outer parentheses below spell out that grouping; move them to change the priority.
df[((df['Price'] > 100) & (df['Terms'] == 'T-Shirts')) | (df['Terms'] == 'Jeans')]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
3,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,Padded Denim Jacket,No,Yes,109.0,707,77063.0,2023
15,171776,2023-07-09,Aisle,Man,Jeans,275600408-400-2,Denim Shirt,No,Yes,12.99,1685,21888.15,2023
18,175584,2023-02-22,Front of Store,Man,Jeans,324908098-800-38,Baggy Fit Jeans Limited Edition,No,No,69.9,2084,145671.6,2023
42,172015,2023-08-14,End-cap,Man,T-Shirts,313008692-514-2,Faux Leather Bomber Jacket,Yes,Yes,139.0,2622,364458.0,2023
54,195103,2023-02-14,Aisle,Man,T-Shirts,330290387-401-2,Bouclã Textured Jacket,No,Yes,109.0,2849,310541.0,2023
62,147491,2023-11-29,End-cap,Man,Jeans,326540983-802-36,Baggy Belted Jeans,No,No,59.9,1563,93623.7,2023
75,118985,2023-05-12,Aisle,Man,T-Shirts,281593208-015-2,Patch Bomber Jacket,Yes,No,139.0,694,96466.0,2023
88,123047,2024-03-28,Aisle,Man,T-Shirts,313027279-800-2,Suit Jacket In 100% Linen,Yes,No,109.0,2985,325365.0,2024
89,198283,2022-12-31,End-cap,Man,Jeans,315836999-406-34,Baggy Fit Jeans,Yes,Yes,69.9,1558,108904.2,2022
98,156342,2024-03-06,Front of Store,Man,Jeans,311287318-400-38,Flared Fit Cargo Jeans,Yes,Yes,109.0,1466,159794.0,2024


##### Filter through **Date Condition**

In [None]:
# Filter df to show products released in previous years (i.e. before 2024).
# Release holds 'YYYY-MM-DD' strings, so a plain string comparison orders them
# chronologically; the cutoff is the first day of 2024.
df[df['Release'] < '2024-01-01']

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
0,179801,2023-10-21,Front of Store,Man,Jackets,311307255-800-2,Contrasting Patches Bomber Jacket,Yes,No,169.0,2608,440752.0,2023
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023
3,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,Padded Denim Jacket,No,Yes,109.0,707,77063.0,2023
4,166331,2023-07-09,Aisle,Woman,Sweaters,324186867-642-2,Asymmetrical Wool And Silk Blend Sweater,Yes,Yes,59.9,2877,172332.3,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,189349,2022-12-23,End-cap,Man,Jackets,317107014-707-2,Textured Weave Overshirt,No,Yes,89.9,1245,111925.5,2022
247,120228,2023-01-28,End-cap,Man,Jackets,327116625-505-2,100% Feather Fill Puffer Jacket,No,No,89.9,2347,210995.3,2023
248,194339,2023-05-07,End-cap,Man,Jackets,311302863-800-2,Faux Suede Jacket,No,No,169.0,1792,302848.0,2023
249,125409,2023-04-08,End-cap,Man,Jackets,330290360-427-3,Pocket Denim Jacket,No,Yes,89.9,2040,183396.0,2023


In [None]:
# Products released strictly between 2022-12-30 and 2024-01-01 (both endpoints excluded)
df[df['Release'].between('2022-12-30', '2024-01-01', inclusive='neither')]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
0,179801,2023-10-21,Front of Store,Man,Jackets,311307255-800-2,Contrasting Patches Bomber Jacket,Yes,No,169.0,2608,440752.0,2023
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023
3,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,Padded Denim Jacket,No,Yes,109.0,707,77063.0,2023
4,166331,2023-07-09,Aisle,Woman,Sweaters,324186867-642-2,Asymmetrical Wool And Silk Blend Sweater,Yes,Yes,59.9,2877,172332.3,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,126806,2023-04-04,End-cap,Man,Jackets,330290387-401-2,Pocket Overshirt,Yes,Yes,109.0,2404,262036.0,2023
247,120228,2023-01-28,End-cap,Man,Jackets,327116625-505-2,100% Feather Fill Puffer Jacket,No,No,89.9,2347,210995.3,2023
248,194339,2023-05-07,End-cap,Man,Jackets,311302863-800-2,Faux Suede Jacket,No,No,169.0,1792,302848.0,2023
249,125409,2023-04-08,End-cap,Man,Jackets,330290360-427-3,Pocket Denim Jacket,No,Yes,89.9,2040,183396.0,2023


##### Filter through **String Condition**

In [None]:
# Rows whose Name begins with 'A' (missing names count as non-matching)
df[df['Name'].str.startswith('A', na=False)]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
4,166331,2023-07-09,Aisle,Woman,Sweaters,324186867-642-2,Asymmetrical Wool And Silk Blend Sweater,Yes,Yes,59.9,2877,172332.3,2023
9,174708,2023-03-05,Front of Store,Man,Sweaters,317889153-700-3,Abstract Jacquard Sweater,Yes,No,59.9,529,31687.1,2023
17,115153,2024-02-28,Aisle,Woman,Sweaters,313890650-712-2,Alpaca Blend Open Knit Sweater,No,No,79.9,1736,138706.4,2024
96,160763,2023-12-19,Front of Store,Man,Shoes,311282212-800-39,Adherent Stripes Sneakers,Yes,Yes,45.9,991,45486.9,2023
101,188909,2023-07-18,Aisle,Woman,Sweaters,336449731-712-1,Asymmetric Cropped Knit Sweater,Yes,Yes,39.9,2863,114233.7,2023
124,130321,2022-12-18,End-cap,Man,T-Shirts,320071210-802-2,Abstract Print Knit T-Shirt,No,Yes,39.9,2093,83510.7,2022
139,118569,2023-01-07,Aisle,Woman,Sweaters,313890645-330-2,Alpaca And Wool Blend Tie Dye Knit Sweater,Yes,No,49.9,994,49600.6,2023
215,185154,2024-01-06,Aisle,Man,T-Shirts,311307261-712-2,Abstract Print T-Shirt,No,Yes,39.9,996,39740.4,2024
250,190238,2024-03-08,Front of Store,Man,Jackets,328279967-409-3,Acid Wash Denim Jacket,Yes,No,89.9,1917,172338.3,2024


In [None]:
# Returns data where the value of Name contains 'Suit'.
# regex=False matches 'Suit' as literal text; na=False treats missing names as non-matching.
df[df['Name'].str.contains('Suit', regex=False, na=False)]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023
37,182157,2023-06-07,Aisle,Man,Jackets,322972485-431-46,Suit Jacket In 100% Linen,No,No,159.0,2421,384939.0,2023
73,180176,2023-01-13,End-cap,Man,Jackets,335342680-800-44,Slim Fit Suit Jacket,Yes,Yes,129.0,2220,286380.0,2023
88,123047,2024-03-28,Aisle,Man,T-Shirts,313027279-800-2,Suit Jacket In 100% Linen,Yes,No,109.0,2985,325365.0,2024
111,141861,2023-07-15,Aisle,Man,Jackets,313854165-401-46,100% Wool Suit Jacket,Yes,Yes,169.0,1916,323804.0,2023
152,127478,2023-01-23,Front of Store,Man,Jackets,328594167-800-46,Straight Suit Jacket,Yes,No,129.0,2498,322242.0,2023
153,123824,2023-10-14,Aisle,Man,Jackets,322901350-800-46,Wool Blend Suit Jacket,No,Yes,169.0,2277,384813.0,2023
187,194410,2023-02-07,End-cap,Man,Jackets,322972473-052-46,Suit Jacket In 100% Linen,No,No,159.0,669,106371.0,2023
188,167981,2023-05-28,End-cap,Man,Jackets,329706743-401-46,Houndstooth Suit Jacket,No,No,139.0,730,101470.0,2023


In [None]:
# Exact match: rows whose Name is precisely 'Suit Jacket'
df[df['Name'].eq('Suit Jacket')]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023


In [None]:
# Returns data where the value of Name is either 'Suit Jacket' or 'Slim Fit Suit Jacket'.
# Each equality test is wrapped in () and the two masks are combined with | (OR).
df[(df['Name']=='Suit Jacket') | (df['Name']=='Slim Fit Suit Jacket')]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023
73,180176,2023-01-13,End-cap,Man,Jackets,335342680-800-44,Slim Fit Suit Jacket,Yes,Yes,129.0,2220,286380.0,2023
225,138779,2023-05-15,Front of Store,Man,Jackets,329300083-705-48,Slim Fit Suit Jacket,Yes,Yes,139.0,2801,389339.0,2023


##### **`.isin()`**

In [None]:
# Returns data where the value of Name is in the list.
# A shorter way to write several == tests joined with |.
df[df['Name'].isin(['Suit Jacket', 'Slim Fit Suit Jacket'])]

Unnamed: 0,Product ID,Release,Product Position,Section,Terms,SKU,Name,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
1,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,Suit Jacket,No,Yes,169.0,1524,257556.0,2023
2,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Suit Jacket,Yes,Yes,69.9,1764,123303.6,2023
73,180176,2023-01-13,End-cap,Man,Jackets,335342680-800-44,Slim Fit Suit Jacket,Yes,Yes,129.0,2220,286380.0,2023
225,138779,2023-05-15,Front of Store,Man,Jackets,329300083-705-48,Slim Fit Suit Jacket,Yes,Yes,139.0,2801,389339.0,2023


##### **`.set_index()`**

In [None]:
# Make Name the row index. Reassigning the result (rather than inplace=True) keeps the
# call chainable. From here on Name is no longer a regular column, so df['Name'] stops
# working and re-running this cell requires reloading df first.
df = df.set_index('Name')

##### **`.filter()`**

In [None]:
# Keep every row but only the listed columns
df.filter(items=['Total Sales', 'Promotion'], axis='columns')

Unnamed: 0_level_0,Total Sales,Promotion
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Contrasting Patches Bomber Jacket,440752.0,No
Suit Jacket,257556.0,Yes
Suit Jacket,123303.6,Yes
Padded Denim Jacket,77063.0,Yes
Asymmetrical Wool And Silk Blend Sweater,172332.3,Yes
...,...,...
Pocket Denim Jacket,183396.0,Yes
Acid Wash Denim Jacket,172338.3,No
Boxy Fit Denim Jacket,250910.9,No
Wool Blend Jacket,147479.0,No


In [None]:
# Same column selection using a list inside [].
# Unlike .filter(items=...), this raises a KeyError if a listed column does not exist.
df[['Total Sales', 'Promotion']]

Unnamed: 0_level_0,Total Sales,Promotion
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Contrasting Patches Bomber Jacket,440752.0,No
Suit Jacket,257556.0,Yes
Suit Jacket,123303.6,Yes
Padded Denim Jacket,77063.0,Yes
Asymmetrical Wool And Silk Blend Sweater,172332.3,Yes
...,...,...
Pocket Denim Jacket,183396.0,Yes
Acid Wash Denim Jacket,172338.3,No
Boxy Fit Denim Jacket,250910.9,No
Wool Blend Jacket,147479.0,No


In [None]:
# Rows whose index label (the product Name) contains the substring 'Suit'
df.filter(like='Suit', axis='index')

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Suit Jacket,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,No,Yes,169.0,1524,257556.0,2023
Suit Jacket,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Yes,Yes,69.9,1764,123303.6,2023
Suit Jacket In 100% Linen,182157,2023-06-07,Aisle,Man,Jackets,322972485-431-46,No,No,159.0,2421,384939.0,2023
Slim Fit Suit Jacket,180176,2023-01-13,End-cap,Man,Jackets,335342680-800-44,Yes,Yes,129.0,2220,286380.0,2023
Suit Jacket In 100% Linen,123047,2024-03-28,Aisle,Man,T-Shirts,313027279-800-2,Yes,No,109.0,2985,325365.0,2024
100% Wool Suit Jacket,141861,2023-07-15,Aisle,Man,Jackets,313854165-401-46,Yes,Yes,169.0,1916,323804.0,2023
Straight Suit Jacket,127478,2023-01-23,Front of Store,Man,Jackets,328594167-800-46,Yes,No,129.0,2498,322242.0,2023
Wool Blend Suit Jacket,123824,2023-10-14,Aisle,Man,Jackets,322901350-800-46,No,Yes,169.0,2277,384813.0,2023
Suit Jacket In 100% Linen,194410,2023-02-07,End-cap,Man,Jackets,322972473-052-46,No,No,159.0,669,106371.0,2023
Houndstooth Suit Jacket,167981,2023-05-28,End-cap,Man,Jackets,329706743-401-46,No,No,139.0,730,101470.0,2023


In [None]:
# Select the rows whose index label is exactly 'Suit Jacket'.
# The label occurs more than once in the index, so the result is a DataFrame with one row per match.
df.loc['Suit Jacket']

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Suit Jacket,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,No,Yes,169.0,1524,257556.0,2023
Suit Jacket,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Yes,Yes,69.9,1764,123303.6,2023


#### **Sorting Data**

In [None]:
# First five rows (head() defaults to n=5), now indexed by Name
df.head()

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Contrasting Patches Bomber Jacket,179801,2023-10-21,Front of Store,Man,Jackets,311307255-800-2,Yes,No,169.0,2608,440752.0,2023
Suit Jacket,183243,2023-07-01,Front of Store,Man,Jackets,328250627-251-46,No,Yes,169.0,1524,257556.0,2023
Suit Jacket,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Yes,Yes,69.9,1764,123303.6,2023
Padded Denim Jacket,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,No,Yes,109.0,707,77063.0,2023
Asymmetrical Wool And Silk Blend Sweater,166331,2023-07-09,Aisle,Woman,Sweaters,324186867-642-2,Yes,Yes,59.9,2877,172332.3,2023


In [None]:
# Five cheapest products: sort by Price from low to high, then take the top rows
df.sort_values('Price', ascending=True).head()

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
High Collar Knit Sweater,136738,2023-08-26,Front of Store,Woman,Sweaters,287414911-803-2,Yes,No,7.99,542,4330.58,2023
Basic 100% Wool Sweater,185307,2022-12-06,Front of Store,Woman,Sweaters,300250126-401-2,Yes,No,7.99,2743,21916.57,2022
Retro Sneakers,132007,2023-02-21,Aisle,Man,Shoes,276378605-515-39,Yes,No,9.99,2491,24885.09,2023
Retro High Top Sneakers,147998,2023-10-13,Aisle,Man,Shoes,277776476-700-39,No,No,9.99,2556,25534.44,2023
Denim Shirt,171776,2023-07-09,Aisle,Man,Jeans,275600408-400-2,No,Yes,12.99,1685,21888.15,2023


In [None]:
# Five most expensive products: sort by Price from high to low, then take the top rows
df.sort_values('Price', ascending=False).head()

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Cropped Leather Jacket,151396,2024-03-17,Front of Store,Man,Jackets,323134418-800-3,Yes,Yes,439.0,729,320031.0,2024
Vintage Effect Leather Bomber Jacket,151925,2023-03-20,End-cap,Man,Jackets,320774184-800-97,Yes,Yes,299.0,2179,651521.0,2023
Leather Jacket,161909,2023-10-04,Aisle,Man,Jackets,315529534-705-3,No,Yes,299.0,1290,385710.0,2023
Wool Blend Textured Jacket,154016,2023-01-11,End-cap,Man,Jackets,328244979-064-48,Yes,Yes,189.0,2366,447174.0,2023
Faux Suede Bomber Jacket,174412,2023-11-29,Aisle,Man,Jackets,311282759-806-2,Yes,No,169.0,1796,303524.0,2023


In [None]:
# Sort by two keys: Price ascending first; rows with the same Price are then
# ordered by Sales Volume descending (one ascending flag per sort key).
df.sort_values(by=['Price', 'Sales Volume'], ascending=[True, False]).head(5)

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Basic 100% Wool Sweater,185307,2022-12-06,Front of Store,Woman,Sweaters,300250126-401-2,Yes,No,7.99,2743,21916.57,2022
High Collar Knit Sweater,136738,2023-08-26,Front of Store,Woman,Sweaters,287414911-803-2,Yes,No,7.99,542,4330.58,2023
Retro High Top Sneakers,147998,2023-10-13,Aisle,Man,Shoes,277776476-700-39,No,No,9.99,2556,25534.44,2023
Retro Sneakers,132007,2023-02-21,Aisle,Man,Shoes,276378605-515-39,Yes,No,9.99,2491,24885.09,2023
Purl Knit Sweater,136272,2023-07-17,End-cap,Man,Sweaters,267195405-500-3,Yes,No,12.99,2887,37502.13,2023


In [None]:
# Keep only the Jackets rows, then order them from highest to lowest Sales Volume
(
    df.loc[df['Terms'].eq('Jackets')]
    .sort_values('Sales Volume', ascending=False)
)

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Plaid Tie Dye Overshirt,183825,2023-05-20,Aisle,Man,Jackets,281593208-015-2,Yes,Yes,19.99,2989,59750.11,2023
Rib Collar Jacket,113435,2023-12-08,Aisle,Man,Jackets,320278659-251-2,Yes,No,129.00,2973,383517.00,2023
Contrasting Collar Jacket,117590,2023-02-11,End-cap,Man,Jackets,320298385-807-2,No,No,79.90,2968,237143.20,2023
Double Faced Jacket,192936,2023-09-29,End-cap,Man,Jackets,312368260-800-2,Yes,No,139.00,2942,408938.00,2023
Faux Leather Bomber Jacket,172364,2023-10-24,Aisle,Man,Jackets,317782474-800-2,No,Yes,169.00,2931,495339.00,2023
...,...,...,...,...,...,...,...,...,...,...,...,...
100% Feather Fill Puffer Jacket,137121,2024-02-24,Aisle,Man,Jackets,312372602-800-2,Yes,No,169.00,656,110864.00,2024
Tuxedo Jacket,188771,2023-10-11,Aisle,Man,Jackets,324052738-800-46,No,No,169.00,654,110526.00,2023
Hooded Technical Jacket,187180,2023-03-09,Front of Store,Man,Jackets,312596416-800-2,Yes,No,109.00,647,70523.00,2023
Textured Pocket Jacket,116228,2023-03-04,End-cap,Man,Jackets,312363708-800-2,No,Yes,89.90,647,58165.30,2023


In [None]:
# Filter, sort and pick two columns in a single chained expression
(
    df[df['Terms'] == 'Jackets']
    .sort_values(by='Sales Volume', ascending=False)
    .loc[:, ['Price', 'Sales Volume']]
)

Unnamed: 0_level_0,Price,Sales Volume
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Plaid Tie Dye Overshirt,19.99,2989
Rib Collar Jacket,129.00,2973
Contrasting Collar Jacket,79.90,2968
Double Faced Jacket,139.00,2942
Faux Leather Bomber Jacket,169.00,2931
...,...,...
100% Feather Fill Puffer Jacket,169.00,656
Tuxedo Jacket,169.00,654
Hooded Technical Jacket,109.00,647
Textured Pocket Jacket,89.90,647


In [None]:
# Same result in two steps: store the filtered and sorted rows in a variable first,
# then select the columns of interest from it. The variable can be reused afterwards.
jackets = df[df['Terms'] == 'Jackets'].sort_values(by='Sales Volume', ascending=False)
jackets[['Price','Sales Volume']]

Unnamed: 0_level_0,Price,Sales Volume
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Plaid Tie Dye Overshirt,19.99,2989
Rib Collar Jacket,129.00,2973
Contrasting Collar Jacket,79.90,2968
Double Faced Jacket,139.00,2942
Faux Leather Bomber Jacket,169.00,2931
...,...,...
100% Feather Fill Puffer Jacket,169.00,656
Tuxedo Jacket,169.00,654
Hooded Technical Jacket,109.00,647
Textured Pocket Jacket,89.90,647


## **Grouping Data**

Grouping collects rows that share the same category value and applies a function to each group, which allows efficient analysis and aggregation of the data.

#### **The 3 Steps of a Groupby Process**

Any groupby process involves some combination of the following 3 steps:
1. **Splitting** the original object into groups by applying some conditions on the dataset.
2. **Applying** a function to each group.
      *   Aggregation - sum, mean, count, min, and max
      *   Transformation - standardize data (z-score), filling NAs within groups with a value derived from each group
      *   Filtration - discard data that belong to groups with only a few members or filter out data based on the group sum or mean.


3. **Combining** the results.




You may be familiar with the term GroupBy if you have seen an SQL query before. It looks like:
```
SELECT Column1, Column2, AVG(Column3), SUM(Column4)
FROM SomeTable
GROUP BY Column1, Column2
```





### **Splitting** the original object into groups.

* From the DataFrame (`df`), we get a GroupBy object by calling the `groupby()` method.

* `Name` holds mostly distinct values (only some product names, such as 'Suit Jacket', repeat), so there is little point in grouping by it.

* We look for a column where the same values occur in many different rows (e.g. `Terms` and `Product Position`).

In [None]:
# Group by passing the Terms column itself as the key.
# The result is a DataFrameGroupBy object; no aggregate is computed until a method is called on it.
df.groupby(df['Terms'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7a0a2bc00f10>

In [None]:
# Same grouping, written with the column label instead of the column itself
df.groupby('Terms')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7a0a2bd72fe0>

In [None]:
# Keep the GroupBy object in a variable so it can be inspected in the next cells
terms = df.groupby('Terms')

In [None]:
# Number of distinct groups (one per distinct Terms value)
terms.ngroups

5

In [None]:
# Number of rows in each group
terms.size()

Unnamed: 0_level_0,0
Terms,Unnamed: 1_level_1
Jackets,142
Jeans,8
Shoes,31
Sweaters,41
T-Shirts,32


In [None]:
# Retrieve the rows that belong to a single group, here Terms == 'T-Shirts'
terms.get_group('T-Shirts')

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,Terms,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Suit Jacket,177771,2023-01-22,End-cap,Man,T-Shirts,281883711-071-2,Yes,Yes,69.9,1764,123303.6,2023
Padded Denim Jacket,171860,2023-01-04,Aisle,,T-Shirts,323646471-802-2,No,Yes,109.0,707,77063.0,2023
Heart Print T-Shirt,129898,2023-12-22,End-cap,Man,T-Shirts,322677489-800-2,No,No,39.9,1866,74453.4,2023
Striped Jacquard T-Shirt,147690,2024-02-03,Aisle,Man,T-Shirts,322909502-819-2,No,Yes,39.9,881,35151.9,2024
Oversize Fit T-Shirt,182099,2023-11-04,Aisle,Man,T-Shirts,330478123-712-2,No,Yes,39.9,1978,78922.2,2023
Ribbed Knit T-Shirt,158958,2023-04-29,End-cap,Man,T-Shirts,320111404-428-2,No,No,45.9,903,41447.7,2023
Faux Leather Bomber Jacket,172015,2023-08-14,End-cap,Man,T-Shirts,313008692-514-2,Yes,Yes,139.0,2622,364458.0,2023
Printed Cropped Fit Shirt Limited Edition,154846,2023-04-05,End-cap,Man,T-Shirts,328232452-898-2,No,Yes,39.9,917,36588.3,2023
Lightweight Bomber Jacket,168516,2023-06-30,Front of Store,Man,T-Shirts,336446858-822-2,No,Yes,89.9,2168,194903.2,2023
Leather Jacket,149487,2024-01-12,Aisle,Man,T-Shirts,312978838-611-2,Yes,No,99.9,1258,125674.2,2024


### **Applying** a function to each group.

#### `.sum(), .mean(), .max()`

In [None]:
# Sum every column within each Terms group.
# Text columns are "summed" by concatenating their strings, which is rarely useful;
# select the columns to aggregate or pass numeric_only=True to avoid that.
df.groupby('Terms').sum()

Unnamed: 0_level_0,Product ID,Release,Product Position,Section,SKU,Seasonal,Promotion,Price,Sales Volume,Total Sales,Year Released
Terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Jackets,21910583,2023-10-212023-07-012024-03-082024-01-152023-0...,Front of StoreFront of StoreEnd-capAisleAisleF...,ManManManManManManManManManManManManManManManM...,311307255-800-2328250627-251-46315758723-401-2...,YesNoYesNoNoYesNoNoNoYesYesYesNoNoYesYesNoYesN...,NoYesNoYesYesNoYesYesNoNoNoYesNoNoNoNoNoYesYes...,16216.01,262367,28993706.87,287279
Jeans,1311535,2023-07-092023-02-222023-11-292022-12-312024-0...,AisleFront of StoreEnd-capEnd-capFront of Stor...,ManManManManManManManMan,275600408-400-2324908098-800-38326540983-802-3...,NoNoNoYesYesYesNoNo,YesNoNoYesYesNoNoNo,511.39,13320,864385.25,16185
Shoes,4654028,2023-02-032023-05-142023-02-212023-10-302023-0...,Front of StoreFront of StoreAisleEnd-capEnd-ca...,ManManManManManManManManManManManManManManManM...,314937773-800-39311287075-120-39276378605-515-...,YesYesYesYesNoYesNoNoYesNoYesYesNoNoNoYesNoNoN...,NoNoNoYesYesYesYesYesYesNoYesNoYesNoNoYesNoNoN...,2011.08,57906,3754837.63,62715
Sweaters,6191500,2023-07-092023-04-242023-11-072023-03-052023-0...,AisleEnd-capFront of StoreFront of StoreEnd-ca...,WomanWomanWomanManWomanManWomanManWomanWomanWo...,324186867-642-2324597432-800-2323175653-803-23...,YesNoNoYesNoYesNoYesNoYesNoYesYesYesYesNoYesNo...,YesYesYesNoYesNoNoNoNoYesYesNoNoNoYesNoNoYesYe...,2249.37,75242,4090631.48,82942
T-Shirts,4908176,2023-01-222023-01-042023-12-222024-02-032023-1...,End-capAisleEnd-capAisleAisleEnd-capEnd-capEnd...,ManManManManManManManManManManManManManManManM...,281883711-071-2323646471-802-2322677489-800-23...,YesNoNoNoNoNoYesNoNoYesNoNoYesNoYesYesYesNoNoN...,YesYesNoYesYesNoYesYesYesNoYesNoNoYesYesYesNoN...,2157.69,53637,3696806.25,64742


In [None]:
# Select one column after grouping, then aggregate it: total Sales Volume per Terms
df.groupby('Terms')['Sales Volume'].sum()

Unnamed: 0_level_0,Sales Volume
Terms,Unnamed: 1_level_1
Jackets,262367
Jeans,13320
Shoes,57906
Sweaters,75242
T-Shirts,53637


In [None]:
# Average Price per Terms; reset_index() moves Terms out of the index
# and back into a regular column, giving a plain DataFrame.
df.groupby('Terms')['Price'].mean().reset_index()

Unnamed: 0,Terms,Price
0,Jackets,114.197254
1,Jeans,63.92375
2,Shoes,64.873548
3,Sweaters,54.862683
4,T-Shirts,67.427813


In [None]:
# Highest Price within each Terms group
df.groupby('Terms')['Price'].max()

Unnamed: 0_level_0,Price
Terms,Unnamed: 1_level_1
Jackets,439.0
Jeans,109.0
Shoes,99.9
Sweaters,169.0
T-Shirts,169.0


In [None]:
# Sum of all numeric columns per Terms group.
# numeric_only=True leaves the text columns out of the result instead of concatenating them.
df.groupby('Terms').sum(numeric_only=True)

#### `.agg()`

In [None]:
# .agg() with a dict applies a different function to each column:
# total Sales Volume and average Price per Terms group.
df.groupby('Terms').agg({'Sales Volume':'sum', 'Price':'mean'})

In [None]:
# A list of functions per column computes several statistics at once;
# the result has two header levels: the column name, then the statistic.
df.groupby('Terms').agg({'Sales Volume': ['mean', 'min', 'max'], 'Price': ['mean', 'min', 'max']})

In [None]:
# groupby() on 2 or more columns: one group per (Terms, Product Position) combination.
# The result is indexed by both keys, with Terms as the outer level.
df.groupby(['Terms', 'Product Position'])['Sales Volume'].sum()

In [None]:
# Same totals with the key order swapped: Product Position becomes the outer level
df.groupby(['Product Position', 'Terms'])['Sales Volume'].sum()

#### `dropna=False`
By default, NA values are excluded from the group keys during a groupby operation. To include them as their own group, pass `dropna=False`.

In [None]:
# Preview a few rows instead of displaying the whole frame;
# the first five rows already include a product whose Section is missing.
df.head()

In [None]:
# Distinct values in Section; a missing value shows up as nan in the result
df['Section'].unique()

In [None]:
# Total Sales Volume per Section. With the default dropna=True,
# rows whose Section is missing do not appear in any group.
df.groupby('Section')['Sales Volume'].sum()

In [None]:
# Count how many rows have no Section value
df['Section'].isna().sum()

In [None]:
# Overall Sales Volume across every row, used as the reference total
df['Sales Volume'].sum()

In [None]:
# Adding up the per-Section totals: rows with a missing Section were dropped
# by groupby, so this total does not count their Sales Volume.
df.groupby('Section')['Sales Volume'].sum().sum()

454342

In [None]:
# dropna=False keeps the rows with a missing Section as their own NaN group
df.groupby('Section', dropna=False)['Sales Volume'].sum()

Unnamed: 0_level_0,Sales Volume
Section,Unnamed: 1_level_1
Man,390968
Woman,63374
,8130


In [None]:
# With the NaN group included, the per-Section totals add up to the total over all rows
df.groupby('Section', dropna=False)['Sales Volume'].sum().sum()

462472