# Muesli Data Analysis

## Pre-Setup

### Environment

In [157]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read in data

In [158]:
# Load every sheet of the raw workbook so the notebook also runs from a fresh kernel
# (previously these reads were commented out and the frames only existed as kernel state).
RAW_DATA_PATH = "./data/data/Muesli Project raw data.xlsx"

# header=1: the second row of the "Orders" sheet is used as the column header.
df_orders = pd.read_excel(RAW_DATA_PATH, sheet_name="Orders", header=1)
df_campaign = pd.read_excel(RAW_DATA_PATH, sheet_name="Campaign Data")
df_order_process = pd.read_excel(RAW_DATA_PATH, sheet_name="Order Process Data")
df_interndata = pd.read_excel(RAW_DATA_PATH, sheet_name="InternData Study")

### Raw dataframes

In [159]:
df_orders.head(2)

Unnamed: 0,index,order id,order date,ship mode,customer id,customer name,origin channel,country/region,city,state,postal code,region,category,sub-category,product id,sales,quantity,discount,profit
0,27,CA-2019-121755,2019-01-16,Second Class,EH-13945,Eric Hoffmann,Email,United States,Los Angeles,California,90049.0,West,Special Projects Muesil,Gluten Free,TEC-AC-10003027,90.57,3,0.0,11.7741
1,45,CA-2019-118255,2019-03-11,First Class,ON-18715,Odella Nelson,Sales,United States,Eagan,Minnesota,55122.0,Central,Special Projects Muesil,Gluten Free,TEC-AC-10000171,45.98,2,0.0,19.7714


In [160]:
df_campaign.head(2)

Unnamed: 0,order id,arrival scan date,customer name
0,CA-2019-109666,2019-05-03,Kunst Miller
1,CA-2019-138933,2019-05-03,Jack Lebron


In [161]:
df_order_process.head(2)

Unnamed: 0,row id,order id,order date,on truck scan date,ship mode
0,3074,CA-2019-125206,2019-01-03,2019-01-07,Express
1,4919,CA-2019-160304,2019-01-02,2019-01-09,Standard Processing


In [162]:
df_interndata.head(2)

Unnamed: 0,order id,ready to ship date,pickup date
0,CA-2019-116540,2019-09-02,2019-09-03
1,CA-2019-116540,2019-09-02,2019-09-03


### Data cleaning function

In [163]:
def data_cleaning(df, df_type):
    """Return a cleaned copy of one of the raw Muesli data frames.

    The caller's frame is never modified: all work happens on a copy, so the
    function can be called repeatedly on the same raw frame with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to clean. Column names are matched case-insensitively
        because they are lower-cased first.
    df_type : str
        Selects the cleaning rules:

        * ``"orders"``        - drop the product/sales/customer-name columns,
          then drop fully duplicated rows.
        * ``"campaign"``      - drop ``customer name``; no duplicate handling.
        * ``"order_process"`` - drop ``row id``, drop fully duplicated rows,
          keep the first row per ``order id``, then drop ``order date`` and
          ``ship mode``.
        * any other value (the notebook passes ``"intern"``) - drop fully
          duplicated rows, then drop ``pickup date``.

    Returns
    -------
    pandas.DataFrame
        New frame with lower-cased column names; surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If a column required by the selected rules is missing from ``df``.
    """
    # Work on a copy: assigning to ``df.columns`` on the argument itself would
    # rename the columns of the caller's raw frame as a hidden side effect.
    df = df.copy()
    df.columns = df.columns.str.lower()

    # orders data
    if df_type == "orders":
        df = df.drop(
            columns=[
                "index", "customer name", "origin channel", "category",
                "sub-category", "product id", "sales", "quantity",
                "discount", "profit",
            ]
        )
        # several product rows per order collapse to one row per order
        df = df.drop_duplicates()

    # campaign data
    elif df_type == "campaign":
        df = df.drop(columns="customer name")

    # order process data
    elif df_type == "order_process":
        df = df.drop(columns="row id")
        df = df.drop_duplicates()
        # dropping 1 duplicate row for id (scanned on truck twice)
        df = df.drop_duplicates(subset=["order id"], keep="first")
        # order date: 100% match with order date in orders data
        # ship mode: 100% match with ship mode in orders data
        # (assumption: second class shipping = standard)
        df = df.drop(columns=["order date", "ship mode"])

    # intern data (also the fallback for any other df_type)
    else:
        # de-duplicate while pickup date is still present, then drop it
        df = df.drop_duplicates()
        # pickup date: 100% match with on truck scan date
        df = df.drop(columns="pickup date")

    return df

### Checks for cleaning function

#### Orders data

In [164]:
# Apply the "orders" cleaning rules and display the result for a visual check.
df_orders_1 = data_cleaning(df_orders,"orders")
df_orders_1

Unnamed: 0,order id,order date,ship mode,customer id,country/region,city,state,postal code,region
0,CA-2019-121755,2019-01-16,Second Class,EH-13945,United States,Los Angeles,California,90049.0,West
1,CA-2019-118255,2019-03-11,First Class,ON-18715,United States,Eagan,Minnesota,55122.0,Central
2,CA-2019-169194,2019-06-20,Standard Class,LH-16900,United States,Dover,Delaware,19901.0,East
3,CA-2019-111682,2019-06-17,First Class,TB-21055,United States,Troy,New York,12180.0,East
4,CA-2018-135545,2018-11-24,Standard Class,KM-16720,United States,Los Angeles,California,90004.0,West
...,...,...,...,...,...,...,...,...,...
9979,CA-2019-146913,2019-10-31,Standard Class,SF-20965,United States,San Francisco,California,94109.0,West
9980,US-2017-114377,2017-11-05,First Class,BG-11035,United States,Hampton,Virginia,23666.0,South
9984,CA-2020-107209,2020-07-27,Second Class,JW-15955,United States,Raleigh,North Carolina,27604.0,South
9986,US-2020-152842,2020-07-16,Standard Class,NF-18385,United States,Charlotte,North Carolina,28205.0,South


In [165]:
# Count fully duplicated rows; only a `False` bucket should appear after cleaning.
df_orders_1.duplicated().value_counts()

False    5009
Name: count, dtype: int64

In [166]:
# Number of distinct order ids; equal to the row count when there is one row per order.
df_orders_1["order id"].nunique()

5009

In [167]:
df_orders_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5009 entries, 0 to 9988
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   order id        5009 non-null   object        
 1   order date      5009 non-null   datetime64[ns]
 2   ship mode       5009 non-null   object        
 3   customer id     5009 non-null   object        
 4   country/region  5009 non-null   object        
 5   city            5009 non-null   object        
 6   state           5009 non-null   object        
 7   postal code     5003 non-null   float64       
 8   region          5009 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 391.3+ KB


#### Campaign data

In [168]:
# Apply the "campaign" cleaning rules and display the result for a visual check.
df_campaign_1 = data_cleaning(df_campaign,"campaign")
df_campaign_1

Unnamed: 0,order id,arrival scan date
0,CA-2019-109666,2019-05-03
1,CA-2019-138933,2019-05-03
2,CA-2019-130001,2019-05-03
3,CA-2019-113061,2019-05-06
4,CA-2019-162138,2019-05-06
...,...,...
328,CA-2020-129707,2020-05-08
329,CA-2020-125381,2020-05-08
330,CA-2020-141733,2020-05-15
331,US-2020-104451,2020-05-15


In [169]:
# The campaign rules do not drop duplicates, so confirm here that no row is fully duplicated.
df_campaign_1.duplicated().value_counts()

False    333
Name: count, dtype: int64

In [170]:
# Number of distinct order ids; equal to the row count when there is one row per order.
df_campaign_1["order id"].nunique()

333

In [171]:
df_campaign_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   order id           333 non-null    object        
 1   arrival scan date  333 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 5.3+ KB


#### Order process data

In [172]:
# Apply the "order_process" cleaning rules and display the result for a visual check.
df_order_process_1 = data_cleaning(df_order_process,"order_process")
df_order_process_1

Unnamed: 0,order id,on truck scan date
0,CA-2019-125206,2019-01-07
1,CA-2019-160304,2019-01-09
3,US-2019-116365,2019-01-09
6,CA-2019-105207,2019-01-09
8,CA-2019-158211,2019-01-09
...,...,...
5890,CA-2020-130631,2021-01-06
5892,CA-2020-126221,2021-01-06
5893,CA-2020-143259,2021-01-06
5896,CA-2020-115427,2021-01-06


In [173]:
# Count fully duplicated rows; only a `False` bucket should appear after cleaning.
df_order_process_1.duplicated().value_counts()

False    3002
Name: count, dtype: int64

In [174]:
# Check that no order id repeats; only a `False` bucket should appear.
df_order_process_1["order id"].duplicated().value_counts()

order id
False    3002
Name: count, dtype: int64

In [175]:
# Display only (result is not assigned): de-duplicating on "order id" again should not remove any rows.
df_order_process_1.drop_duplicates("order id")


Unnamed: 0,order id,on truck scan date
0,CA-2019-125206,2019-01-07
1,CA-2019-160304,2019-01-09
3,US-2019-116365,2019-01-09
6,CA-2019-105207,2019-01-09
8,CA-2019-158211,2019-01-09
...,...,...
5890,CA-2020-130631,2021-01-06
5892,CA-2020-126221,2021-01-06
5893,CA-2020-143259,2021-01-06
5896,CA-2020-115427,2021-01-06


In [176]:
# Every row whose order id occurs more than once (keep=False marks all occurrences);
# an empty frame means each order id is unique.
duplicates = df_order_process_1.loc[
    lambda frame: frame["order id"].duplicated(keep=False)
]
duplicates

Unnamed: 0,order id,on truck scan date


In [177]:
df_order_process_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3002 entries, 0 to 5898
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order id            3002 non-null   object        
 1   on truck scan date  3002 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 70.4+ KB


#### Intern data

In [178]:
# Apply the intern-study cleaning rules and display the result for a visual check.
df_interndata_1 = data_cleaning(df_interndata,"intern")
df_interndata_1

Unnamed: 0,order id,ready to ship date
0,CA-2019-116540,2019-09-02
2,CA-2019-129847,2019-09-04
3,CA-2019-129630,2019-09-04
4,CA-2019-106278,2019-09-05
5,CA-2019-158099,2019-09-05
...,...,...
283,US-2020-165456,2020-12-03
284,US-2020-110576,2020-12-04
285,CA-2020-105333,2020-12-04
288,CA-2020-119305,2020-12-04


In [179]:
# Count fully duplicated rows; only a `False` bucket should appear after cleaning.
df_interndata_1.duplicated().value_counts()

False    204
Name: count, dtype: int64

In [180]:
# Check that no order id repeats; only a `False` bucket should appear.
df_interndata_1["order id"].duplicated().value_counts()

order id
False    204
Name: count, dtype: int64

In [181]:
df_interndata_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 204 entries, 0 to 289
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order id            204 non-null    object        
 1   ready to ship date  204 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 4.8+ KB


#### Truck scan vs intern scan

In [182]:
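# NOTE(review): kept for reference only. This cell reads "pickup date" from df_interndata_1,
# which data_cleaning drops, so it cannot run against the cleaned frame. Presumably this is
# the comparison behind the "100% match with on truck scan date" remark in data_cleaning.
#merged_truck = df_order_process_1.merge(df_interndata_1, on="order id", how="outer")
#merged_truck = merged_truck[["order id","on truck scan date","pickup date"]].dropna()
#merged_truck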

In [183]:
#merged_truck["diff"] = merged_truck["on truck scan date"] - merged_truck["pickup date"]
#merged_truck

#### Order dates & shipping methods

In [184]:
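# NOTE(review): kept for reference only. The commented-out cells in this section need
# "order date" and "ship mode" in df_order_process_1, which data_cleaning drops. Presumably
# this is the comparison behind the "100% match" remarks in data_cleaning.
#merged_op = df_orders_1.merge(df_order_process_1, on="order id", how="outer").dropna()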

In [185]:
#merged_op[["order id","order date_x","ship mode_x","order date_y","ship mode_y"]]
#merged_op["date_diff"] = merged_op["order date_x"]-merged_op["order date_y"]
#merged_op["date_diff"].value_counts()

In [186]:
#merged_op

In [187]:
#merged_op[["order id","ship mode_x","ship mode_y"]]

In [188]:
#merged_op["ship mode_x"].value_counts()

In [189]:
#merged_op["ship mode_y"].value_counts()

In [190]:
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace(" Class","")
#merged_op["ship mode_y"] = merged_op["ship mode_y"].str.replace(" Processing","")
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace("Second","Standard")
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace("First","Express")

In [191]:
#merged_op[["order id","ship mode_x","ship mode_y"]]

In [192]:
#merged_op['match'] = merged_op['ship mode_x'] == merged_op['ship mode_y']
#merged_op['match'].value_counts()

## Cleaned dataframes

In [193]:
# Build the final cleaned frames, one per raw sheet, in the same order as the checks above.
(
    df_orders_cleaned,
    df_campaign_cleaned,
    df_order_process_cleaned,
    df_interndata_cleaned,
) = (
    data_cleaning(raw_frame, frame_kind)
    for raw_frame, frame_kind in (
        (df_orders, "orders"),
        (df_campaign, "campaign"),
        (df_order_process, "order_process"),
        (df_interndata, "intern"),
    )
)

In [194]:
df_orders_cleaned.head(2)

Unnamed: 0,order id,order date,ship mode,customer id,country/region,city,state,postal code,region
0,CA-2019-121755,2019-01-16,Second Class,EH-13945,United States,Los Angeles,California,90049.0,West
1,CA-2019-118255,2019-03-11,First Class,ON-18715,United States,Eagan,Minnesota,55122.0,Central


In [195]:
df_campaign_cleaned.head(2)

Unnamed: 0,order id,arrival scan date
0,CA-2019-109666,2019-05-03
1,CA-2019-138933,2019-05-03


In [196]:
df_order_process_cleaned.head(2)

Unnamed: 0,order id,on truck scan date
0,CA-2019-125206,2019-01-07
1,CA-2019-160304,2019-01-09


In [197]:
df_interndata_cleaned.head(2)

Unnamed: 0,order id,ready to ship date
0,CA-2019-116540,2019-09-02
2,CA-2019-129847,2019-09-04


## Metrics

In [None]:
# Metrics to define
# o_date_2_processed: Jamil
# o_processed_2_truck: Janina
# o_truck_2_delivered: Jing
# o_date_2_delivered
# o_processed_2_delivered

# express_del = True/False
# weekday = 1-7
# weekday_list = [monday, tuesday, wednesday, thursday, friday, saturday, sunday]