In [1]:
import numpy as np, pandas as pd, os, sqlite3
from datetime import datetime as dt

In [2]:
conn = sqlite3.connect('datalake.db')
cursor = conn.cursor()

In [3]:
# Ensuring tables are present 
res = cursor.execute('''select name from sqlite_master''')
res.fetchall()

[('users',),
 ('receipts',),
 ('sqlite_autoindex_receipts_1',),
 ('brands',),
 ('sqlite_autoindex_brands_1',)]

## Data Quality Checks

### 1. Users table

In [4]:
users = pd.read_sql('''
select id as user_id
, active
, date(DATETIME(ROUND(createdDate / 1000), 'unixepoch')) as reg_date
, date(DATETIME(ROUND(lastLogin / 1000), 'unixepoch')) as last_login_date
, signUpSource
, state
from users''', conn, parse_dates=['reg_date', 'last_login_date'])

In [5]:
users.sample(10)

Unnamed: 0,user_id,active,reg_date,last_login_date,signUpSource,state
30,5ff36c8e135e7011bcb85da4,1,2021-01-04,2021-01-04,Email,WI
124,5ffc8ff9b3348b11c9338896,1,2021-01-11,2021-01-11,Email,WI
272,600f47f06fd0dc1768a34a12,1,2021-01-25,2021-01-25,Email,WI
223,6006f786fb296c7f688530f8,1,2021-01-19,NaT,Email,AL
95,5ff73b90eb7c7d31ca8a452b,1,2021-01-07,2021-01-07,Email,WI
320,6010be6673c60b12049040ef,1,2021-01-27,2021-01-27,Email,WI
363,60189c74c8b50e11d8454eff,1,2021-02-02,2021-02-02,Email,WI
232,60085b3dbe5fc90ee6a67b11,1,2021-01-20,2021-01-20,Email,WI
211,60074b996e64691717e8f11a,1,2021-01-19,2021-01-19,Email,WI
404,6024399defa60112282c0ac9,1,2021-02-10,2021-02-10,Email,WI


In [6]:
users.dtypes

user_id                    object
active                      int64
reg_date           datetime64[ns]
last_login_date    datetime64[ns]
signUpSource               object
state                      object
dtype: object

In [7]:
users.isnull().sum()

user_id             0
active              0
reg_date            0
last_login_date    62
signUpSource       48
state              56
dtype: int64

In [8]:
users.isnull().sum()/len(users)

user_id            0.000000
active             0.000000
reg_date           0.000000
last_login_date    0.125253
signUpSource       0.096970
state              0.113131
dtype: float64

In [9]:
#Values of Active Flag
users['active'].value_counts()

1    494
0      1
Name: active, dtype: int64

In [10]:
#Values of State
users['state'].value_counts()

WI    396
NH     20
AL     12
OH      5
IL      3
KY      1
CO      1
SC      1
Name: state, dtype: int64

In [11]:
#values of Signupsource
users['signUpSource'].value_counts()

Email     443
Google      4
Name: signUpSource, dtype: int64

In [12]:
# Checking whether lastlogindate is less than registration date
users.loc[users['last_login_date'] < users['reg_date']]

Unnamed: 0,user_id,active,reg_date,last_login_date,signUpSource,state


In [13]:
print('''
Total number of records in users table: {0} \n
Total number of unique users: {1}
      '''.format(users.shape[0], users['user_id'].nunique()))


Total number of records in users table: 495 

Total number of unique users: 212
      


The number of unique users in the users table is 212 only. However the users table has 495 records.
Ideally the table must be unique in terms of id column. 
Upon further inspection it was found out that theses are duplicated rows (other column values being the same). A unique key contraint would have helped to avoid that

### 2. Receipts table

In [14]:
receipts = pd.read_sql('''
select id as receipt_id
,bonusPointsEarned
,bonusPointsEarnedReason
,(DATETIME(ROUND(createDate / 1000), 'unixepoch')) as created_date
,(DATETIME(ROUND(dateScanned / 1000), 'unixepoch')) as scanned_date
,(DATETIME(ROUND(finishedDate / 1000), 'unixepoch')) as finished_date
,(DATETIME(ROUND(modifyDate / 1000), 'unixepoch')) as modify_date
,(DATETIME(ROUND(pointsAwardedDate / 1000), 'unixepoch')) as points_awarded_date
,(DATETIME(ROUND(purchaseDate / 1000), 'unixepoch')) as purchase_date

,pointsEarned
,totalSpent
,rewardsReceiptStatus
,userId as user_id
from receipts''', conn, parse_dates=['created_date', 'scanned_date', 'finished_date',
                                     'modify_date', 'points_awarded_date', 'purchase_date'])

In [15]:
receipts.sample(10)

Unnamed: 0,receipt_id,bonusPointsEarned,bonusPointsEarnedReason,created_date,scanned_date,finished_date,modify_date,points_awarded_date,purchase_date,pointsEarned,totalSpent,rewardsReceiptStatus,user_id
1099,603d40250a720fde10000459,25.0,COMPLETE_NONPARTNER_RECEIPT,2021-03-01 19:27:33,2021-03-01 19:27:33,NaT,2021-03-01 19:27:34,NaT,2020-08-17 00:00:00,25.0,34.96,REJECTED,5fc961c3b8cfca11a077dd33
32,5ff36c750a7214ada100058f,,,2021-01-04 19:28:53,2021-01-04 19:28:53,2021-01-04 19:28:54,2021-01-04 19:28:54,2021-01-04 19:28:54,2021-01-03 19:28:53,500.0,89.91,FINISHED,5ff36be7135e7011bcb856d3
403,6009885a0a7214ad89000129,45.0,COMPLETE_PARTNER_RECEIPT,2021-01-21 13:57:46,2021-01-21 13:57:46,2021-01-21 13:57:47,2021-01-21 13:57:47,2021-01-21 13:57:47,2021-01-20 13:57:46,50.0,1.0,FINISHED,54943462e4b07e684157a532
130,5ff74faf0a720f0523000613,750.0,"Receipt number 1 completed, bonus point schedu...",2021-01-07 18:15:11,2021-01-07 18:15:11,2021-01-07 18:15:13,2021-01-07 18:15:13,2021-01-07 18:15:13,2021-01-06 18:15:11,841.2,18.2,FINISHED,5ff74f3db3348b11c93361d1
718,601b14240a720f05f400021e,,,2021-02-03 21:22:44,2021-02-03 21:22:44,NaT,2021-02-03 21:22:44,NaT,NaT,,,SUBMITTED,5fc961c3b8cfca11a077dd33
326,6005b60f0a720f05f3000096,,,2021-01-18 16:23:41,2021-01-18 16:23:41,NaT,2021-01-18 16:23:41,NaT,NaT,,,SUBMITTED,5a43c08fe4b014fd6b6a0612
476,600fb21a0a720f053500004f,5.0,All-receipts receipt bonus,2021-01-26 06:09:30,2021-01-26 06:09:30,2021-01-26 06:09:30,2021-01-26 06:09:30,2021-01-26 06:09:30,2021-01-23 06:00:00,5.0,49.95,FINISHED,600fb1ac73c60b12049027bb
985,60268c7e0a7214d8e9000309,100.0,"Receipt number 6 completed, bonus point schedu...",2021-02-12 14:11:10,2021-02-12 14:11:10,2021-02-12 14:11:11,2021-02-12 14:11:16,2021-02-12 14:11:11,2021-02-12 00:00:00,100.0,29.0,FINISHED,60268c7bb545931ac63683af
380,600788cd0a7214ad8900003a,21.0,COMPLETE_PARTNER_RECEIPT,2021-01-20 01:35:09,2021-01-20 01:35:09,2021-01-20 01:35:30,2021-01-20 01:35:32,2021-01-20 01:35:30,2021-01-19 00:00:00,50.9,2.99,FINISHED,5cd06f1f2acf0a2667da628b
1074,603bdbe10a7217c72c00033e,25.0,COMPLETE_NONPARTNER_RECEIPT,2021-02-28 18:07:29,2021-02-28 18:07:29,NaT,2021-02-28 18:07:30,NaT,2020-08-17 00:00:00,25.0,34.96,REJECTED,5fc961c3b8cfca11a077dd33


In [16]:
receipts.dtypes

receipt_id                         object
bonusPointsEarned                 float64
bonusPointsEarnedReason            object
created_date               datetime64[ns]
scanned_date               datetime64[ns]
finished_date              datetime64[ns]
modify_date                datetime64[ns]
points_awarded_date        datetime64[ns]
purchase_date              datetime64[ns]
pointsEarned                      float64
totalSpent                        float64
rewardsReceiptStatus               object
user_id                            object
dtype: object

In [17]:
receipts.isnull().sum()

receipt_id                   0
bonusPointsEarned          575
bonusPointsEarnedReason    575
created_date                 0
scanned_date                 0
finished_date              551
modify_date                  0
points_awarded_date        582
purchase_date              448
pointsEarned               510
totalSpent                 435
rewardsReceiptStatus         0
user_id                      0
dtype: int64

In [18]:
receipts.isnull().sum()/len(receipts)

receipt_id                 0.000000
bonusPointsEarned          0.513852
bonusPointsEarnedReason    0.513852
created_date               0.000000
scanned_date               0.000000
finished_date              0.492404
modify_date                0.000000
points_awarded_date        0.520107
purchase_date              0.400357
pointsEarned               0.455764
totalSpent                 0.388740
rewardsReceiptStatus       0.000000
user_id                    0.000000
dtype: float64

In [19]:
receipts['rewardsReceiptStatus'].value_counts()

FINISHED     518
SUBMITTED    434
REJECTED      71
PENDING       50
FLAGGED       46
Name: rewardsReceiptStatus, dtype: int64

In [20]:
receipts['isPointsGiven'] = np.where(receipts['pointsEarned'].isnull(),0,1)

In [21]:
receipts.groupby(['rewardsReceiptStatus'])['isPointsGiven'].agg(['count', 'sum'])

Unnamed: 0_level_0,count,sum
rewardsReceiptStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
FINISHED,518,518
FLAGGED,46,33
PENDING,50,0
REJECTED,71,58
SUBMITTED,434,0


In [22]:
#Checking if points awarded date is less than scanned_date
receipts.loc[receipts['points_awarded_date']< receipts['scanned_date']]

Unnamed: 0,receipt_id,bonusPointsEarned,bonusPointsEarnedReason,created_date,scanned_date,finished_date,modify_date,points_awarded_date,purchase_date,pointsEarned,totalSpent,rewardsReceiptStatus,user_id,isPointsGiven


In [23]:
print("Earliest Scan Date: ", receipts['scanned_date'].min(), 
      ", Latest Scan Date: ", receipts['scanned_date'].max())

Earliest Scan Date:  2020-10-30 20:17:59 , Latest Scan Date:  2021-03-01 23:17:34


In [24]:
receipts.loc[~receipts['user_id'].isin(list(users['user_id'])), 'user_id'].nunique()

117

There are 117 users who are not present in the users table. Ideally, the userIds in receipts table should be a subset of all user ids present in the users table.
 

## Analyses

#### Analytical query for unnesting branads scanned

In [25]:
brands_scan = pd.read_sql('''
select rec.id, userId, coalesce(upper(b.name), upper(rec.description)) as brand_name, b.categoryCode, b.topBrand
,description, scanned_date, created_date, finished_date, rewardsReceiptStatus, pointsEarned, totalSpent
,purchasedItemCount
from (
select r.id, r.UserId 
,CAST(JSON_EXTRACT(j.value, '$.barcode') as VARCHAR) as barcode 
,CAST(JSON_EXTRACT(j.value, '$.rewardsProductPartnerId') as VARCHAR) as rewardsProductPartnerId 
,CAST(JSON_EXTRACT(j.value, '$.description') as VARCHAR) as description 
,DATETIME(ROUND(dateScanned / 1000), 'unixepoch') as scanned_date
,DATETIME(ROUND(createDate / 1000), 'unixepoch') as created_date
,DATETIME(ROUND(finishedDate / 1000), 'unixepoch') as finished_date
,rewardsReceiptStatus
,pointsEarned
,totalSpent
,purchasedItemCount
from receipts r, json_each(rewardsReceiptItemList) j
--where DATE(DATETIME(ROUND(dateScanned / 1000), 'unixepoch'), 'start of month') in (date('2021-01-01'),date('2021-02-01'))
) rec 
left join brands b
on rec.rewardsProductPartnerId = b.cpg_id''', conn, parse_dates=['reg_date', 'last_login_date'])

In [26]:
brands_scan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142700 entries, 0 to 142699
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    142700 non-null  object 
 1   UserId                142700 non-null  object 
 2   brand_name            142047 non-null  object 
 3   categoryCode          2348 non-null    object 
 4   topBrand              137460 non-null  float64
 5   description           139791 non-null  object 
 6   scanned_date          142700 non-null  object 
 7   created_date          142700 non-null  object 
 8   finished_date         136782 non-null  object 
 9   rewardsReceiptStatus  142700 non-null  object 
 10  pointsEarned          141572 non-null  float64
 11  totalSpent            142265 non-null  float64
 12  purchasedItemCount    142216 non-null  float64
dtypes: float64(4), object(9)
memory usage: 14.2+ MB


In [27]:
brands_scan.isnull().sum()

id                           0
UserId                       0
brand_name                 653
categoryCode            140352
topBrand                  5240
description               2909
scanned_date                 0
created_date                 0
finished_date             5918
rewardsReceiptStatus         0
pointsEarned              1128
totalSpent                 435
purchasedItemCount         484
dtype: int64

In [28]:
brands_scan['brand_name'].fillna('ITEM NOT FOUND', inplace=True)
brands_scan['scanned_month'] = pd.to_datetime(brands_scan['scanned_date']).dt.to_period('M').astype(str)

In [29]:
brands_scan.groupby(['scanned_month'])['id'].count()

scanned_month
2020-10       284
2020-11      1257
2021-01    138999
2021-02      2117
2021-03        43
Name: id, dtype: int64

We see that most of the receipts have been scanned in the month of Jan-2021

### 1. What are the top 5 brands by receipts scanned for most recent month?

The receipts scan dates lie in thrange of October 30,2020 to March 1, 2021. Since the month of March has not been completed, we should assume the most recent month to be Feb-2021.

In [30]:
brands_scan.loc[brands_scan['scanned_month']== '2021-02', 'brand_name'].value_counts().nlargest(6)

ITEM NOT FOUND                                                                                                                                                 388
FLIPBELT LEVEL TERRAIN WAIST POUCH, NEON YELLOW, LARGE/32-35                                                                                                    28
THINDUST SUMMER FACE MASK - SUN PROTECTION NECK GAITER FOR OUTDOORACTIVITIES                                                                                    27
MUELLER AUSTRIA HYPERGRIND PRECISION ELECTRIC SPICE/COFFEE GRINDER MILLWITH LARGE GRINDING CAPACITY AND HD MOTOR ALSO FOR SPICES, HERBS, NUTS,GRAINS, WHITE     27
DELIMEX                                                                                                                                                         11
GOOD SEASONS                                                                                                                                                    11
Name: brand_name, dtyp

### Answer 1
Removing the "ITEM NOT FOUND" entries, the top 5 brands scanned for the month of Feb-2021 are

1. FLIPBELT LEVEL TERRAIN WAIST POUCH, NEON YELLOW, LARGE/32-35
2. THINDUST SUMMER FACE MASK - SUN PROTECTION NECK GAITER FOR OUTDOORACTIVITIES 
3. MUELLER AUSTRIA HYPERGRIND PRECISION ELECTRIC SPICE/COFFEE GRINDER MILLWITH LARGE GRINDING CAPACITY AND HD MOTOR ALSO FOR SPICES, HERBS, NUTS,GRAINS, WHITE
4. DELIMEX           
5. GOOD SEASONS

### 2. How does the ranking of the top 5 brands by receipts scanned for the recent month compare to the ranking for the previous month?

Here we compare the same metrics but for scanned month of Jan-2021

In [31]:
brands_scan.loc[brands_scan['scanned_month']== '2021-01', 'brand_name'].value_counts().nlargest(10)

DIGIORNO CHEESE                      866
COOL WHIP                            864
GODIVA INSTANT PUDDING MIX           863
DEVOUR                               863
F. WHITLOCK & SONS BBQ SAUCE         863
FOOD NETWORK KITCHEN INSPIRATIONS    863
FAT RABBIT                           863
FRUIT LOVE SPOONABLE SMOOTHIES       863
A.1.                                 863
GEVALIA                              863
Name: brand_name, dtype: int64

### Answer 2
The top 5 brands scanned for the month of Jan-2021 are

1. DIGIORNO CHEESE
2. COOL WHIP
3. GODIVA INSTANT PUDDING MIX
4. DEVOUR           
5. F. WHITLOCK & SONS BBQ SAUCE


The more popular brands scanned in Jan-2021 lie in the Food and Dining brands, whereas Feb-2021 brands are mixed with retail and dining

### 3. When considering average spend from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?

### 4. When considering total number of items purchased from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?

Here we can use the receipts table directly, as we do not need to go on a brand level

In [32]:
receipts = pd.read_sql('''select * from receipts''', 
                       conn, 
                       parse_dates=['reg_date', 'last_login_date'])

In [33]:
receipts

Unnamed: 0,id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",1609687531000,1609687531000,1.609688e+12,1609687536000,1.609688e+12,500.0,1.609632e+12,5.0,"[{""barcode"": ""4011"", ""description"": ""ITEM NOT ...",FINISHED,26.00,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",1609687483000,1609687483000,1.609687e+12,1609687488000,1.609687e+12,150.0,1.609601e+12,2.0,"[{""barcode"": ""4011"", ""description"": ""ITEM NOT ...",FINISHED,11.00,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,1609687537000,1609687537000,,1609687542000,,5.0,1.609632e+12,1.0,"[{""needsFetchReview"": false, ""partnerItemId"": ...",REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,5.0,All-receipts receipt bonus,1609687534000,1609687534000,1.609688e+12,1609687539000,1.609688e+12,5.0,1.609632e+12,4.0,"[{""barcode"": ""4011"", ""description"": ""ITEM NOT ...",FINISHED,28.00,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,5.0,All-receipts receipt bonus,1609687506000,1609687506000,1.609688e+12,1609687511000,1.609688e+12,5.0,1.609601e+12,2.0,"[{""barcode"": ""4011"", ""description"": ""ITEM NOT ...",FINISHED,1.00,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,603cc0630a720fde100003e6,25.0,COMPLETE_NONPARTNER_RECEIPT,1614594147000,1614594147000,,1614594148000,,25.0,1.597622e+12,2.0,"[{""barcode"": ""B076FJ92M4"", ""description"": ""mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33
1115,603d0b710a720fde1000042a,,,1614613361873,1614613361873,,1614613361873,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1116,603cf5290a720fde10000413,,,1614607657664,1614607657664,,1614607657664,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1117,603ce7100a7217c72c000405,25.0,COMPLETE_NONPARTNER_RECEIPT,1614604048000,1614604048000,,1614604049000,,25.0,1.597622e+12,2.0,"[{""barcode"": ""B076FJ92M4"", ""description"": ""mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33


In [34]:
receipts.groupby(['rewardsReceiptStatus']).agg(avg_spends = ('totalSpent', np.mean),
                                               total_items_purchased = ('purchasedItemCount', sum))

Unnamed: 0_level_0,avg_spends,total_items_purchased
rewardsReceiptStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
FINISHED,80.854305,8184.0
FLAGGED,180.451739,1014.0
PENDING,28.032449,0.0
REJECTED,23.326056,173.0
SUBMITTED,,0.0


### Answer 3 
We can see that for rewardsReceiptStatus = 'FINISHED' (meaning ACCEPTED), the average spends (80.9) is higher
than average spends of rewardsReceiptStatus = 'REJECTED' (23.3)


### Answer 4
We can see that for rewardsReceiptStatus = 'FINISHED' (meaning ACCEPTED), the total number of items purchased (8184) is higher than total number of items purchased of rewardsReceiptStatus = 'REJECTED' (173)




### 5. Which brand has the most spend among users who were created within the past 6 months?
### 6. Which brand has the most transactions among users who were created within the past 6 months?

For this we use the brands scanned table to find item level transactions and pricing, but with a filter of users registered within the last 6 months (Assuming Mar 1, 2021 as the cutoff date)

In [35]:
brands_scan_users = pd.read_sql('''
select rec.id, userId, u.reg_date, coalesce(upper(b.name), upper(rec.description)) as brand_name
,b.categoryCode, b.topBrand, description, scanned_date, created_date, finished_date
,coalesce(finalPrice, userFlaggedPrice) as price
from (
select r.id, r.UserId 
,CAST(JSON_EXTRACT(j.value, '$.barcode') as VARCHAR) as barcode 
,CAST(JSON_EXTRACT(j.value, '$.rewardsProductPartnerId') as VARCHAR) as rewardsProductPartnerId 
,CAST(JSON_EXTRACT(j.value, '$.description') as VARCHAR) as description 
,CAST(JSON_EXTRACT(j.value, '$.finalPrice') as REAL) as finalPrice 
,CAST(JSON_EXTRACT(j.value, '$.userFlaggedPrice') as REAL) as userFlaggedPrice 
,DATETIME(ROUND(dateScanned / 1000), 'unixepoch') as scanned_date
,DATETIME(ROUND(createDate / 1000), 'unixepoch') as created_date
,DATETIME(ROUND(finishedDate / 1000), 'unixepoch') as finished_date
,rewardsReceiptStatus
,pointsEarned
,totalSpent
,purchasedItemCount
from receipts r, json_each(rewardsReceiptItemList) j
) rec 
left join brands b
on rec.rewardsProductPartnerId = b.cpg_id
left join (select distinct id, date(DATETIME(ROUND(createdDate / 1000), 'unixepoch')) as reg_date 
from users) u
on rec.userId = u.id
where u.reg_date between date('2020-09-01') and date('2021-03-01')
''', conn)

In [36]:
brands_scan_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101479 entries, 0 to 101478
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             101479 non-null  object 
 1   UserId         101479 non-null  object 
 2   reg_date       101479 non-null  object 
 3   brand_name     100989 non-null  object 
 4   categoryCode   1632 non-null    object 
 5   topBrand       98386 non-null   float64
 6   description    99208 non-null   object 
 7   scanned_date   101479 non-null  object 
 8   created_date   101479 non-null  object 
 9   finished_date  97160 non-null   object 
 10  price          101044 non-null  float64
dtypes: float64(2), object(9)
memory usage: 8.5+ MB


In [37]:
brands_scan_users.loc[brands_scan_users['price'].isnull(), 'brand_name'].value_counts()

Series([], Name: brand_name, dtype: int64)

In [38]:
brands_scan_users['reg_date'].min()

'2020-11-04'

In [39]:
brands_scan_users['reg_date'].max()

'2021-02-12'

In [40]:
brands = brands_scan_users.groupby(['brand_name']).agg(spends = ('price', sum),
                                              transactions = ('id', 'count')
                                             ).reset_index()

In [41]:
#Brand with most spends
brands.sort_values(by=['spends'],ascending=False).head(1)

Unnamed: 0,brand_name,spends,transactions
914,PULL-UPS,4647.66,350


In [42]:
#Brand with most transactions
brands.sort_values(by=['transactions'],ascending=False).head(1)

Unnamed: 0,brand_name,spends,transactions
286,DIGIORNO CHEESE,4634.17,642


### Answer 5
Brand with most spends: PULL-UPS (4647)

### Answer 6
Brand with most transactions: DIGIORNO CHEESE (642)
