# Cleaning: Recategorizing
---

In [1]:
# dependencies
from pathlib import Path

import pandas as pd

In [2]:
# load the raw Amazon sale report

# location of the source csv, relative to this notebook
file_path = "../Data/Amazon Sale Report.csv"

# columns left out on load: the 'index' column and the trailing 'Unnamed: 22' column
excluded_cols = ['index', 'Unnamed: 22']

# read the csv, keeping every column whose name is not in the exclusion list
amazon_sale_df = pd.read_csv(file_path, usecols=lambda column: column not in excluded_cols)

# preview the first rows
amazon_sale_df.head()

Unnamed: 0,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,Size,...,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,B2B,fulfilled-by
0,405-8078784-5731545,04-30-22,Cancelled,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,S,...,0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,,False,Easy Ship
1,171-9198151-1101146,04-30-22,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,3XL,...,1,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,False,Easy Ship
2,404-0687676-7273146,04-30-22,Shipped,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,XL,...,1,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,True,
3,403-9615377-8133951,04-30-22,Cancelled,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,L,...,0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,,False,Easy Ship
4,407-1069790-7240320,04-30-22,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,3XL,...,1,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,,False,


## Filter for only business to consumer sales

In [3]:
# flag the business-to-consumer orders: rows whose B2B value is False
is_consumer_sale = amazon_sale_df['B2B'].eq(False)

# keep only those rows, then drop the B2B column
consumer_sale_df = amazon_sale_df.loc[is_consumer_sale].drop(columns=['B2B'])

# display consumer df
consumer_sale_df

Unnamed: 0,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,Size,...,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,fulfilled-by
0,405-8078784-5731545,04-30-22,Cancelled,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,S,...,,0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,,Easy Ship
1,171-9198151-1101146,04-30-22,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,3XL,...,Shipped,1,INR,406.00,BENGALURU,KARNATAKA,560085.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,Easy Ship
3,403-9615377-8133951,04-30-22,Cancelled,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,L,...,,0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,,Easy Ship
4,407-1069790-7240320,04-30-22,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,3XL,...,Shipped,1,INR,574.00,CHENNAI,TAMIL NADU,600073.0,IN,,
5,404-1490984-4578765,04-30-22,Shipped,Amazon,Amazon.in,Expedited,SET264,SET264-KR-NP-XL,Set,XL,...,Shipped,1,INR,824.00,GHAZIABAD,UTTAR PRADESH,201102.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128970,406-6001380-7673107,05-31-22,Shipped,Amazon,Amazon.in,Expedited,JNE3697,JNE3697-KR-XL,kurta,XL,...,Shipped,1,INR,517.00,HYDERABAD,TELANGANA,500013.0,IN,,
128971,402-9551604-7544318,05-31-22,Shipped,Amazon,Amazon.in,Expedited,SET401,SET401-KR-NP-M,Set,M,...,Shipped,1,INR,999.00,GURUGRAM,HARYANA,122004.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,
128972,407-9547469-3152358,05-31-22,Shipped,Amazon,Amazon.in,Expedited,J0157,J0157-DR-XXL,Western Dress,XXL,...,Shipped,1,INR,690.00,HYDERABAD,TELANGANA,500049.0,IN,,
128973,402-6184140-0545956,05-31-22,Shipped,Amazon,Amazon.in,Expedited,J0012,J0012-SKD-XS,Set,XS,...,Shipped,1,INR,1199.00,Halol,Gujarat,389350.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,


## Recategorize *Status* column values and create new successful status column, *Status (Successful)*

#### main categories in Status --> Status (Successful)
- Cancelled --> False
- RTS (Return to Seller) --> False
- Delivered (Successful) --> True

In [4]:
# count how many orders carry each raw Status value (before recategorizing)
consumer_sale_df['Status'].value_counts()

Status
Shipped                          77273
Shipped - Delivered to Buyer     28520
Cancelled                        18259
Shipped - Returned to Seller      1944
Shipped - Picked Up                968
Pending                            654
Pending - Waiting for Pick Up      281
Shipped - Returning to Seller      145
Shipped - Out for Delivery          35
Shipped - Rejected by Buyer         11
Shipping                             8
Shipped - Lost in Transit            5
Shipped - Damaged                    1
Name: count, dtype: int64

In [5]:
# collapse the raw Status values into three categories and drop every status that doesn't fit
    # Cancelled
    # RTS (Return to Seller): "Shipped - Returned to Seller", "Shipped - Returning to Seller", "Shipped - Rejected by Buyer"
    # Delivered: "Shipped - Delivered to Buyer", "Shipped - Picked Up"
    # TODO: decide whether "Pending - Waiting for Pick Up" should also count as Delivered
status_categories = {
    # raw status -> category
    "Cancelled": "Cancelled",
    "Shipped - Delivered to Buyer": "Delivered",
    "Shipped - Picked Up": "Delivered",
    "Shipped - Returned to Seller": "RTS (Return to Seller)",
    "Shipped - Returning to Seller": "RTS (Return to Seller)",
    "Shipped - Rejected by Buyer": "RTS (Return to Seller)",
    # final category names map to themselves so re-running this cell is a no-op
    "Delivered": "Delivered",
    "RTS (Return to Seller)": "RTS (Return to Seller)",
}

# .map() turns every status that is NOT listed above into NaN, so in-progress or
# ambiguous statuses ("Shipped", "Pending", "Shipping", "Shipped - Lost in Transit", ...)
# and any status not anticipated here are removed by the dropna below instead of
# slipping through as an extra category
consumer_sale_df = (
    consumer_sale_df
    .assign(Status=consumer_sale_df['Status'].map(status_categories))
    .dropna(subset=['Status'])
)

# display new count values for Status column
consumer_sale_df['Status'].value_counts()

Status
Delivered                 29488
Cancelled                 18259
RTS (Return to Seller)     2100
Name: count, dtype: int64

In [6]:
# create a new boolean column based on delivery success (only 'Delivered' is successful)
# and place it right after the Status column
successful_col = 'Status (Successful)'

# drop a flag left over from a previous run so re-running this cell does not raise
# "cannot insert Status (Successful), already exists"; drop() also returns a new frame,
# so the in-place insert below does not touch the frame displayed by earlier cells
consumer_sale_df = consumer_sale_df.drop(columns=successful_col, errors='ignore')

# locate Status by name instead of relying on a hard-coded column position
status_pos = consumer_sale_df.columns.get_loc('Status')
consumer_sale_df.insert(status_pos + 1, successful_col, consumer_sale_df['Status'] == 'Delivered')

# display new df
consumer_sale_df

Unnamed: 0,Order ID,Date,Status,Status (Successful),Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,...,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,fulfilled-by
0,405-8078784-5731545,04-30-22,Cancelled,False,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,...,,0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,,Easy Ship
1,171-9198151-1101146,04-30-22,Delivered,True,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,...,Shipped,1,INR,406.00,BENGALURU,KARNATAKA,560085.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,Easy Ship
3,403-9615377-8133951,04-30-22,Cancelled,False,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,...,,0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,,Easy Ship
7,406-7807733-3785945,04-30-22,Delivered,True,Merchant,Amazon.in,Standard,JNE3405,JNE3405-KR-S,kurta,...,Shipped,1,INR,399.00,HYDERABAD,TELANGANA,500032.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,Easy Ship
8,407-5443024-5233168,04-30-22,Cancelled,False,Amazon,Amazon.in,Expedited,SET200,SET200-KR-NP-A-XXXL,Set,...,Cancelled,0,,,HYDERABAD,TELANGANA,500008.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128961,402-0082204-6323568,05-31-22,Cancelled,False,Amazon,Amazon.in,Expedited,JNE3797,JNE3797-KR-A-XL,Western Dress,...,Unshipped,1,INR,771.00,Junagadh,Gujarat,362001.0,IN,,
128962,408-9803724-6565965,05-31-22,Cancelled,False,Amazon,Amazon.in,Expedited,MEN5024,MEN5024-KR-L,kurta,...,Unshipped,1,INR,665.00,MUMBAI,MAHARASHTRA,400056.0,IN,,
128965,408-5154281-4593912,05-31-22,Cancelled,False,Amazon,Amazon.in,Expedited,J0119,J0119-TP-XXXL,Top,...,Unshipped,1,INR,574.00,Prayagraj (ALLAHABAD),UTTAR PRADESH,211007.0,IN,,
128967,404-5182288-1653947,05-31-22,Cancelled,False,Amazon,Amazon.in,Expedited,JNE3638,JNE3638-KR-XS,kurta,...,Cancelled,0,,,Kolkata,WEST BENGAL,700040.0,IN,,


## Recategorize *Category* column values

### updated categories:
- Set
- Kurta
- Western Dress
- Non-Western Dress
- Top
- Bottom
- Saree

In [7]:
# count orders per raw Category value to see which labels need renaming or merging
consumer_sale_df['Category'].value_counts()

Category
Set              19101
kurta            18632
Western Dress     7781
Top               3384
Ethnic Dress       393
Blouse             298
Bottom             213
Saree               45
Name: count, dtype: int64

In [8]:
# raw category label -> updated category name; labels not listed stay as they are
category_renames = {
    "kurta": "Kurta",
    "Ethnic Dress": "Non-Western Dress",
    "Blouse": "Top",
}

# apply the renames and write the result back into the Category column
renamed_categories = consumer_sale_df['Category'].replace(category_renames)
consumer_sale_df.loc[:, 'Category'] = renamed_categories

# confirm the new category distribution
consumer_sale_df['Category'].value_counts()

Category
Set                  19101
Kurta                18632
Western Dress         7781
Top                   3682
Non-Western Dress      393
Bottom                 213
Saree                   45
Name: count, dtype: int64

# Cleaning: Dropping Repetitive Columns
---

In [9]:
# keep rows where the sales channel is Amazon (i.e. drop "Non-Amazon" orders)
# look the column up by name rather than by position, so the filter does not depend on
# how many columns were inserted or dropped earlier; the header appears to carry a
# trailing space ('Sales Channel '), so match on the stripped name
sales_channel_col = next(col for col in consumer_sale_df.columns if col.strip() == 'Sales Channel')
consumer_sale_df = consumer_sale_df[consumer_sale_df[sales_channel_col] != "Non-Amazon"]

In [10]:
# count the distinct values in each column; a count of 1 marks a column that carries no information
consumer_sale_df.nunique()

Order ID               46367
Date                      91
Status                     3
Status (Successful)        2
Fulfilment                 2
Sales Channel              1
ship-service-level         2
Style                   1185
SKU                     5595
Category                   7
Size                      11
ASIN                    5594
Courier Status             3
Qty                        6
currency                   1
Amount                  1280
ship-city               5502
ship-state                60
ship-postal-code        7014
ship-country               1
promotion-ids           5561
fulfilled-by               1
dtype: int64

In [11]:
# count distinct values per column (nulls are not counted) and collect the columns with exactly one
unique_counts = consumer_sale_df.nunique()
repetitive_cols = unique_counts[unique_counts == 1].index.tolist()

# drop repetitive columns: 'Sales Channel ', 'currency', 'ship-country', 'fulfilled-by'
consumer_sale_df = consumer_sale_df.drop(columns=repetitive_cols)

In [12]:
# remove 'Courier Status'; the recategorized 'Status' column is the more useful of the two
consumer_sale_df = consumer_sale_df.drop(columns=['Courier Status'])

In [13]:
# display the frame after dropping the single-value columns and 'Courier Status'
consumer_sale_df

Unnamed: 0,Order ID,Date,Status,Status (Successful),Fulfilment,ship-service-level,Style,SKU,Category,Size,ASIN,Qty,Amount,ship-city,ship-state,ship-postal-code,promotion-ids
0,405-8078784-5731545,04-30-22,Cancelled,False,Merchant,Standard,SET389,SET389-KR-NP-S,Set,S,B09KXVBD7Z,0,647.62,MUMBAI,MAHARASHTRA,400081.0,
1,171-9198151-1101146,04-30-22,Delivered,True,Merchant,Standard,JNE3781,JNE3781-KR-XXXL,Kurta,3XL,B09K3WFS32,1,406.00,BENGALURU,KARNATAKA,560085.0,Amazon PLCC Free-Financing Universal Merchant ...
3,403-9615377-8133951,04-30-22,Cancelled,False,Merchant,Standard,J0341,J0341-DR-L,Western Dress,L,B099NRCT7B,0,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,
7,406-7807733-3785945,04-30-22,Delivered,True,Merchant,Standard,JNE3405,JNE3405-KR-S,Kurta,S,B081WX4G4Q,1,399.00,HYDERABAD,TELANGANA,500032.0,Amazon PLCC Free-Financing Universal Merchant ...
8,407-5443024-5233168,04-30-22,Cancelled,False,Amazon,Expedited,SET200,SET200-KR-NP-A-XXXL,Set,3XL,B08L91ZZXN,0,,HYDERABAD,TELANGANA,500008.0,IN Core Free Shipping 2015/04/08 23-48-5-108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128961,402-0082204-6323568,05-31-22,Cancelled,False,Amazon,Expedited,JNE3797,JNE3797-KR-A-XL,Western Dress,XL,B09TH3H2DP,1,771.00,Junagadh,Gujarat,362001.0,
128962,408-9803724-6565965,05-31-22,Cancelled,False,Amazon,Expedited,MEN5024,MEN5024-KR-L,Kurta,L,B08YYSQJTC,1,665.00,MUMBAI,MAHARASHTRA,400056.0,
128965,408-5154281-4593912,05-31-22,Cancelled,False,Amazon,Expedited,J0119,J0119-TP-XXXL,Top,3XL,B08RYPRVPV,1,574.00,Prayagraj (ALLAHABAD),UTTAR PRADESH,211007.0,
128967,404-5182288-1653947,05-31-22,Cancelled,False,Amazon,Expedited,JNE3638,JNE3638-KR-XS,Kurta,XS,B09814Q3QH,0,,Kolkata,WEST BENGAL,700040.0,


## Updating Data Format
---

In [14]:
# check each column's data type before converting Date and ship-postal-code
consumer_sale_df.dtypes

Order ID                object
Date                    object
Status                  object
Status (Successful)       bool
Fulfilment              object
ship-service-level      object
Style                   object
SKU                     object
Category                object
Size                    object
ASIN                    object
Qty                      int64
Amount                 float64
ship-city               object
ship-state              object
ship-postal-code       float64
promotion-ids           object
dtype: object

#### Updating Date column to datetime

In [15]:
# parse the month-day-year strings (e.g. 04-30-22) into datetime values
parsed_dates = pd.to_datetime(consumer_sale_df['Date'], format='%m-%d-%y')
consumer_sale_df['Date'] = parsed_dates

#### Updating postal code to integer

In [16]:
# cast postal codes from float to pandas' nullable integer dtype ('Int64')
postal_codes = consumer_sale_df['ship-postal-code']
consumer_sale_df['ship-postal-code'] = postal_codes.astype('Int64')

In [17]:
# confirm the conversions: Date should now be datetime and ship-postal-code Int64
consumer_sale_df.dtypes

Order ID                       object
Date                   datetime64[ns]
Status                         object
Status (Successful)              bool
Fulfilment                     object
ship-service-level             object
Style                          object
SKU                            object
Category                       object
Size                           object
ASIN                           object
Qty                             int64
Amount                        float64
ship-city                      object
ship-state                     object
ship-postal-code                Int64
promotion-ids                  object
dtype: object

#### Update Ship State

In [18]:
# count orders per ship-state value as currently spelled (before normalizing case and whitespace)
consumer_sale_df['ship-state'].value_counts()

ship-state
MAHARASHTRA               8234
KARNATAKA                 6147
UTTAR PRADESH             4315
TELANGANA                 4266
TAMIL NADU                4228
KERALA                    3010
DELHI                     2551
WEST BENGAL               2255
ANDHRA PRADESH            2218
Gujarat                   1762
HARYANA                   1614
RAJASTHAN                 1115
MADHYA PRADESH            1062
BIHAR                      885
ODISHA                     846
PUNJAB                     733
ASSAM                      703
UTTARAKHAND                666
JHARKHAND                  616
GOA                        408
CHHATTISGARH               385
HIMACHAL PRADESH           370
JAMMU & KASHMIR            309
CHANDIGARH                 129
MANIPUR                    129
PUDUCHERRY                 126
ANDAMAN & NICOBAR          114
MEGHALAYA                   87
SIKKIM                      81
NAGALAND                    77
ARUNACHAL PRADESH           67
TRIPURA                     

In [19]:
# normalize state names: trim surrounding whitespace, then convert to upper case
normalized_states = consumer_sale_df['ship-state'].str.strip().str.upper()
consumer_sale_df.loc[:, 'ship-state'] = normalized_states

In [20]:
# re-count orders per state to confirm that differently-cased spellings were merged
consumer_sale_df['ship-state'].value_counts()

ship-state
MAHARASHTRA               8234
KARNATAKA                 6147
UTTAR PRADESH             4315
TELANGANA                 4266
TAMIL NADU                4228
KERALA                    3010
DELHI                     2614
WEST BENGAL               2255
ANDHRA PRADESH            2218
GUJARAT                   1762
HARYANA                   1614
RAJASTHAN                 1135
MADHYA PRADESH            1062
BIHAR                      893
ODISHA                     855
PUNJAB                     755
ASSAM                      703
UTTARAKHAND                666
JHARKHAND                  616
GOA                        419
CHHATTISGARH               385
HIMACHAL PRADESH           370
JAMMU & KASHMIR            309
CHANDIGARH                 133
MANIPUR                    131
PUDUCHERRY                 126
ANDAMAN & NICOBAR          114
MEGHALAYA                   90
SIKKIM                      82
NAGALAND                    77
ARUNACHAL PRADESH           70
TRIPURA                     

#### Update Ship City

In [21]:
# number of distinct ship-city spellings before normalizing case and whitespace
consumer_sale_df['ship-city'].nunique()

5502

In [22]:
# normalize city names: trim surrounding whitespace, then convert to upper case
normalized_cities = consumer_sale_df['ship-city'].str.strip().str.upper()
consumer_sale_df.loc[:, 'ship-city'] = normalized_cities

In [23]:
# number of distinct ship-city names after normalizing (should be lower than before)
consumer_sale_df['ship-city'].nunique()

4565

## Cleaning: Dropping Irrelevant Dates
---

In [24]:
# print the earliest and latest order dates to see which period the data covers
print(f"Min Date: {consumer_sale_df['Date'].min()}")
print(f"Max Date: {consumer_sale_df['Date'].max()}")

Min Date: 2022-03-31 00:00:00
Max Date: 2022-06-29 00:00:00


In [25]:
# the only March date is its last day, with just a few rows; treat that day as an outlier and drop it
march_outlier_day = pd.Timestamp("2022-03-31")
consumer_sale_df = consumer_sale_df.loc[consumer_sale_df['Date'] != march_outlier_day]

# re-check the date range after the drop
print(f"Min Date: {consumer_sale_df['Date'].min()}")
print(f"Max Date: {consumer_sale_df['Date'].max()}")

Min Date: 2022-04-01 00:00:00
Max Date: 2022-06-29 00:00:00


## Export updated Amazon Sale Report.csv
---

In [26]:
# display the cleaned frame as it stands right before export
consumer_sale_df

Unnamed: 0,Order ID,Date,Status,Status (Successful),Fulfilment,ship-service-level,Style,SKU,Category,Size,ASIN,Qty,Amount,ship-city,ship-state,ship-postal-code,promotion-ids
0,405-8078784-5731545,2022-04-30,Cancelled,False,Merchant,Standard,SET389,SET389-KR-NP-S,Set,S,B09KXVBD7Z,0,647.62,MUMBAI,MAHARASHTRA,400081,
1,171-9198151-1101146,2022-04-30,Delivered,True,Merchant,Standard,JNE3781,JNE3781-KR-XXXL,Kurta,3XL,B09K3WFS32,1,406.00,BENGALURU,KARNATAKA,560085,Amazon PLCC Free-Financing Universal Merchant ...
3,403-9615377-8133951,2022-04-30,Cancelled,False,Merchant,Standard,J0341,J0341-DR-L,Western Dress,L,B099NRCT7B,0,753.33,PUDUCHERRY,PUDUCHERRY,605008,
7,406-7807733-3785945,2022-04-30,Delivered,True,Merchant,Standard,JNE3405,JNE3405-KR-S,Kurta,S,B081WX4G4Q,1,399.00,HYDERABAD,TELANGANA,500032,Amazon PLCC Free-Financing Universal Merchant ...
8,407-5443024-5233168,2022-04-30,Cancelled,False,Amazon,Expedited,SET200,SET200-KR-NP-A-XXXL,Set,3XL,B08L91ZZXN,0,,HYDERABAD,TELANGANA,500008,IN Core Free Shipping 2015/04/08 23-48-5-108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128961,402-0082204-6323568,2022-05-31,Cancelled,False,Amazon,Expedited,JNE3797,JNE3797-KR-A-XL,Western Dress,XL,B09TH3H2DP,1,771.00,JUNAGADH,GUJARAT,362001,
128962,408-9803724-6565965,2022-05-31,Cancelled,False,Amazon,Expedited,MEN5024,MEN5024-KR-L,Kurta,L,B08YYSQJTC,1,665.00,MUMBAI,MAHARASHTRA,400056,
128965,408-5154281-4593912,2022-05-31,Cancelled,False,Amazon,Expedited,J0119,J0119-TP-XXXL,Top,3XL,B08RYPRVPV,1,574.00,PRAYAGRAJ (ALLAHABAD),UTTAR PRADESH,211007,
128967,404-5182288-1653947,2022-05-31,Cancelled,False,Amazon,Expedited,JNE3638,JNE3638-KR-XS,Kurta,XS,B09814Q3QH,0,,KOLKATA,WEST BENGAL,700040,


In [27]:
# export updated csv to output folder
output_path = Path("../output/cleaned_Amazon Sale Report.csv")

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise the export fails on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the written file
consumer_sale_df.to_csv(output_path, index=False)