In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
dirty_cafe = pd.read_csv('dirty_cafe_sales.csv')
dirty_cafe.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


## Data Wrangling

In [3]:
print("This dataset has {} samples and {} features.".format(dirty_cafe.shape[0], dirty_cafe.shape[1]))

This dataset has 10000 samples and 8 features.


In [4]:
# Check for missing values
missing_values = dirty_cafe.isnull().sum()
missing_values

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [5]:
# Check for duplicates
duplicates = dirty_cafe.duplicated().sum()
duplicates

np.int64(0)

In [7]:
# Check for features missing more than 50% of their values we will drop them as that will not be useful
missing_values[missing_values > dirty_cafe.shape[0]/2] # No features missing more than 50% of their values

# Check for features with only one unique value
unique_values = dirty_cafe.nunique()
unique_values[unique_values == 1]
print("There are {} features with only one unique value.".format(unique_values[unique_values == 1].shape[0]))

There are 0 features with only one unique value.


In [8]:
# Check missing values in the dataset
missing_values = dirty_cafe.isnull().sum()
missing_values

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [11]:
# Item column 
# Check for values in the item column
dirty_cafe['Item'].value_counts()


Item
Juice       1171
Coffee      1165
Salad       1148
Cake        1139
Sandwich    1131
Smoothie    1096
Cookie      1092
Tea         1089
UNKNOWN      344
ERROR        292
Name: count, dtype: int64

In [12]:
# Replace the UNKNOWN and ERROR values with a new item - burger and then the rest with the most frequent item
dirty_cafe['Item'] = dirty_cafe['Item'].replace(['UNKNOWN', 'ERROR'], 'burger')
dirty_cafe['Item'] = dirty_cafe['Item'].replace(np.nan, 'burger')

# Check for missing values in the item column
dirty_cafe['Item'].isnull().sum()

np.int64(0)

In [14]:
# Quantity column
# Convert Quantity to numeric type
dirty_cafe['Quantity'] = pd.to_numeric(dirty_cafe['Quantity'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Quantity'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [15]:
# Quantity column value counts
dirty_cafe['Quantity'].value_counts()

Quantity
5.0    2013
2.0    1974
4.0    1863
3.0    1849
1.0    1822
Name: count, dtype: int64

In [17]:
# Missing values in the Quantity column will be replaced with the median
dirty_cafe['Quantity'] = dirty_cafe['Quantity'].replace(np.nan, dirty_cafe['Quantity'].median())

# Check for missing values in the Quantity column
dirty_cafe['Quantity'].isnull().sum()

np.int64(0)

In [18]:
# Price Per Unit column
# Convert Price Per Unit to numeric type
dirty_cafe['Price Per Unit'] = pd.to_numeric(dirty_cafe['Price Per Unit'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Price Per Unit'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [19]:
# Just like the Quantity column, missing values in the Price Per Unit column will be replaced with the median
dirty_cafe['Price Per Unit'] = dirty_cafe['Price Per Unit'].replace(np.nan, dirty_cafe['Price Per Unit'].median())

# Check for missing values in the Price Per Unit column
dirty_cafe['Price Per Unit'].isnull().sum()

np.int64(0)

In [20]:
# Price Per Unit column value counts
dirty_cafe['Price Per Unit'].value_counts()

Price Per Unit
3.0    2962
4.0    2331
2.0    1227
5.0    1204
1.0    1143
1.5    1133
Name: count, dtype: int64

In [21]:
# Total Spent column
# Convert Total Spent to numeric type
dirty_cafe['Total Spent'] = pd.to_numeric(dirty_cafe['Total Spent'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Total Spent'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [22]:
# Total Spent column value counts
dirty_cafe['Total Spent'].value_counts()

Total Spent
6.0     979
12.0    939
3.0     930
4.0     923
20.0    746
15.0    734
8.0     677
10.0    524
2.0     497
9.0     479
5.0     468
16.0    444
25.0    259
7.5     237
1.0     232
4.5     225
1.5     205
Name: count, dtype: int64

In [23]:
# Check if there are any values with explicit UNKNOWN or ERROR
dirty_cafe['Total Spent'].value_counts().sort_index()

Total Spent
1.0     232
1.5     205
2.0     497
3.0     930
4.0     923
4.5     225
5.0     468
6.0     979
7.5     237
8.0     677
9.0     479
10.0    524
12.0    939
15.0    734
16.0    444
20.0    746
25.0    259
Name: count, dtype: int64

In [25]:
# Missing values in the Total Spent column will be replaced with the median
dirty_cafe['Total Spent'] = dirty_cafe['Total Spent'].replace(np.nan, dirty_cafe['Total Spent'].median())

# Check for missing values in the Total Spent column
dirty_cafe['Total Spent'].isnull().sum()

np.int64(0)

In [26]:
# Payment Method column
# Check for values in the Payment Method column
dirty_cafe['Payment Method'].value_counts()

Payment Method
Digital Wallet    2291
Credit Card       2273
Cash              2258
ERROR              306
UNKNOWN            293
Name: count, dtype: int64

In [27]:
# Replace the UNKNOWN and ERROR values with a new payment method - crypto and then the rest with the most frequent payment method
dirty_cafe['Payment Method'] = dirty_cafe['Payment Method'].replace(['UNKNOWN', 'ERROR'], 'Crypto')

# Let the rest of the missing values be filled with the most frequent payment method
dirty_cafe['Payment Method'] = dirty_cafe['Payment Method'].replace(np.nan, dirty_cafe['Payment Method'].mode()[0])

# Check for missing values in the Payment Method column
dirty_cafe['Payment Method'].isnull().sum()


np.int64(0)