# Installing & Importing 

In [188]:
# Importing package pandas (For Panel Data Analysis)
import pandas as pd

# Unfolding hidden features if the cardinality is high
pd.set_option('display.max_columns', None) 

# Unfolding the max feature width for better clearity
pd.set_option('display.max_colwidth', None)                        

# Import Pandas Profiling (To generate Univariate Analysis)
from ydata_profiling import ProfileReport

# Importing package numpys (For Numerical Python)
import numpy as np

# Importing pyplot interface to use matplotlib
import matplotlib.pyplot as plt

# Importing seaborn library for interactive visualization
import seaborn as sns                                               
%matplotlib inline

# Importing library for scientific calculations
import scipy as sp

# Importing warning to disable runtime warnings
import warnings

# Warnings will appear only once
warnings.filterwarnings("ignore")                                   


# Data Acquisition & Description

In [189]:
df = pd.read_csv(filepath_or_buffer = '20240929_kaggle_Coffee Sales Dataset.csv')
print('Data Shape:', df.shape)
df.head(5)

Data Shape: (1653, 6)


Unnamed: 0,date,datetime,cash_type,card,money,coffee_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.7,Latte
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.7,Hot Chocolate
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.7,Hot Chocolate
3,2024-03-01,2024-03-01 13:46:33.006,card,ANON-0000-0000-0003,28.9,Americano
4,2024-03-01,2024-03-01 13:48:14.626,card,ANON-0000-0000-0004,38.7,Latte


In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         1653 non-null   object 
 1   datetime     1653 non-null   object 
 2   cash_type    1653 non-null   object 
 3   card         1564 non-null   object 
 4   money        1653 non-null   float64
 5   coffee_name  1653 non-null   object 
dtypes: float64(1), object(5)
memory usage: 77.6+ KB


In [191]:
df.describe()

Unnamed: 0,money
count,1653.0
mean,31.581174
std,5.380129
min,18.12
25%,27.92
50%,32.82
75%,37.72
max,40.0


In [192]:
df.columns

Index(['date', 'datetime', 'cash_type', 'card', 'money', 'coffee_name'], dtype='object')

In [193]:
df.count()

date           1653
datetime       1653
cash_type      1653
card           1564
money          1653
coffee_name    1653
dtype: int64

In [194]:
df["card"].info

<bound method Series.info of 0       ANON-0000-0000-0001
1       ANON-0000-0000-0002
2       ANON-0000-0000-0002
3       ANON-0000-0000-0003
4       ANON-0000-0000-0004
               ...         
1648    ANON-0000-0000-0655
1649    ANON-0000-0000-0637
1650    ANON-0000-0000-0637
1651    ANON-0000-0000-0656
1652    ANON-0000-0000-0657
Name: card, Length: 1653, dtype: object>

In [195]:
df["coffee_name"].describe()

count                    1653
unique                      8
top       Americano with Milk
freq                      413
Name: coffee_name, dtype: object

In [196]:
df["coffee_name"].value_counts()

coffee_name
Americano with Milk    413
Latte                  370
Cappuccino             254
Americano              231
Cortado                169
Hot Chocolate           88
Espresso                74
Cocoa                   54
Name: count, dtype: int64

In [197]:
df["coffee_name"].nunique()

8

In [198]:
df["cash_type"].describe()

count     1653
unique       2
top       card
freq      1564
Name: cash_type, dtype: object

In [199]:
df["cash_type"].nunique()

2

In [200]:
df["cash_type"].value_counts()

cash_type
card    1564
cash      89
Name: count, dtype: int64

In [201]:
df.query('coffee_name=="Americano with Milk"')

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name
5,2024-03-01,2024-03-01 15:39:47.726,card,ANON-0000-0000-0005,33.80,Americano with Milk
7,2024-03-01,2024-03-01 18:39:03.580,card,ANON-0000-0000-0007,33.80,Americano with Milk
9,2024-03-01,2024-03-01 19:23:15.887,card,ANON-0000-0000-0008,33.80,Americano with Milk
10,2024-03-01,2024-03-01 19:29:17.391,card,ANON-0000-0000-0009,33.80,Americano with Milk
13,2024-03-02,2024-03-02 10:41:41.249,card,ANON-0000-0000-0011,33.80,Americano with Milk
...,...,...,...,...,...,...
1645,2024-09-22,2024-09-22 20:46:26.208,card,ANON-0000-0000-0455,27.92,Americano with Milk
1646,2024-09-22,2024-09-22 20:47:35.096,card,ANON-0000-0000-0455,27.92,Americano with Milk
1647,2024-09-22,2024-09-22 21:07:04.794,card,ANON-0000-0000-0655,27.92,Americano with Milk
1648,2024-09-22,2024-09-22 21:08:14.798,card,ANON-0000-0000-0655,27.92,Americano with Milk


In [202]:
# df.query('coffee_name=="Cocoa"')

## Observations

In [203]:
df["money"].describe()

count    1653.000000
mean       31.581174
std         5.380129
min        18.120000
25%        27.920000
50%        32.820000
75%        37.720000
max        40.000000
Name: money, dtype: float64

In [204]:
df["coffee_name"].value_counts()

coffee_name
Americano with Milk    413
Latte                  370
Cappuccino             254
Americano              231
Cortado                169
Hot Chocolate           88
Espresso                74
Cocoa                   54
Name: count, dtype: int64

### Feature Engineering for Time series exploration

In [205]:
copy_df=df

#### Split and Drop


In [206]:
# split_dt=pd.DataFrame(copy_df['datetime'])
# split_dt=copy_df['datetime'].str.split(' ', expand=True)
# split_dt = split_dt.reindex(columns=[0, 1]).fillna('')
# # split_dt.columns = ['split_date', 'split_time']
# split_dt.columns = ['date_s', 'time']
# copy_df = copy_df.join(split_dt)
# copy_df.drop(labels=['date_s'], axis=1, inplace=True)

In [207]:
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         1653 non-null   object 
 1   datetime     1653 non-null   object 
 2   cash_type    1653 non-null   object 
 3   card         1564 non-null   object 
 4   money        1653 non-null   float64
 5   coffee_name  1653 non-null   object 
dtypes: float64(1), object(5)
memory usage: 77.6+ KB


In [208]:
copy_df.columns

Index(['date', 'datetime', 'cash_type', 'card', 'money', 'coffee_name'], dtype='object')

In [209]:
copy_df["date"]=pd.to_datetime(arg=copy_df["date"], format="%Y-%m-%d")

In [210]:
copy_df["datetime"]=pd.to_datetime(arg=copy_df["datetime"], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')

In [211]:
copy_df.head(3)

Unnamed: 0,date,datetime,cash_type,card,money,coffee_name
0,2024-03-01,2024-03-01 10:15:50.520,card,ANON-0000-0000-0001,38.7,Latte
1,2024-03-01,2024-03-01 12:19:22.539,card,ANON-0000-0000-0002,38.7,Hot Chocolate
2,2024-03-01,2024-03-01 12:20:18.089,card,ANON-0000-0000-0002,38.7,Hot Chocolate


In [212]:
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         1653 non-null   datetime64[ns]
 1   datetime     1653 non-null   datetime64[ns]
 2   cash_type    1653 non-null   object        
 3   card         1564 non-null   object        
 4   money        1653 non-null   float64       
 5   coffee_name  1653 non-null   object        
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 77.6+ KB


In [33]:
# Library to generate basic statistics about data
# !pip install ydata-profiling

In [34]:
# Upgrading pandas profiling to the latest version
# !pip install -q --upgrade ydata-profiling 