## 2 -> Data Cleaning

### Reading Data

In [20]:
import pandas as pd 

# Read the raw workbook; `df` is left untouched so it can serve as the
# "before" baseline in the size report at the end of this section.
df = pd.read_excel(io='../data/raw/online_retail.xlsx')

# Every cleaning step below works on this independent copy.
cleaned_df = df.copy(deep=True)

### Handling Invalid StockCode & InvoiceNo

In [21]:
# Cast to text first: the digit-only check below operates on strings.
cleaned_df['InvoiceNo'] = cleaned_df['InvoiceNo'].astype('str')

# An accepted invoice number is exactly six digits; every other row is dropped.
mask = cleaned_df['InvoiceNo'].str.match('^\\d{6}$', na=False)

cleaned_df = cleaned_df.loc[mask]

In [22]:
# Cast stock codes to text via `assign`, which returns a new frame instead of
# writing a column into the boolean-filtered frame from the previous cell
# (that write pattern can raise SettingWithCopyWarning on pandas < 3).
cleaned_df = cleaned_df.assign(StockCode=cleaned_df['StockCode'].astype('str'))

# Accepted codes: five digits, five digits followed by one or more letters,
# or the literal code PADS. A single anchored pattern replaces the three
# separate regex passes; `na=False` treats any non-match/missing as rejected.
mask = cleaned_df['StockCode'].str.match(r'^(?:\d{5}[a-zA-Z]*|PADS)$', na=False)

cleaned_df = cleaned_df[mask]

### Dropping Tuples Without CustomerID

In [23]:
# Keep only the rows that carry a CustomerID value.
has_customer_id = cleaned_df['CustomerID'].notna()
cleaned_df = cleaned_df[has_customer_id]

### Removing Cancelled Orders & Non-Positive Quantities & Prices

In [24]:
#remove cancelled orders
# Remove cancelled orders, identified by an invoice number starting with 'C'.
is_cancelled = cleaned_df['InvoiceNo'].str.startswith('C')
cleaned_df = cleaned_df.loc[~is_cancelled]

# Remove items with a zero or negative quantity or price.
has_positive_quantity = cleaned_df['Quantity'].gt(0)
has_positive_price = cleaned_df['UnitPrice'].gt(0)
cleaned_df = cleaned_df.loc[has_positive_quantity & has_positive_price]

### Removing 0 Prices / Free Products

In [25]:
# Keep strictly positive unit prices only. The combined quantity/price filter
# in the previous cell applies the same condition, so this is a defensive re-check.
cleaned_df = cleaned_df.loc[cleaned_df['UnitPrice'].gt(0)]

In [26]:
# Sanity check: the smallest remaining unit price should be above zero.
cleaned_df.loc[:, 'UnitPrice'].min()

0.001

In [27]:
# Report how much data survived cleaning.
# NOTE: the two "size" figures use DataFrame.size, i.e. the number of cells
# (rows x columns), whereas the percentage is computed from row counts (len).
print(f"""
      Before cleaning size -> {df.size}
      After cleaning size -> {cleaned_df.size} \n
      Percent data left -> {(len(cleaned_df)/len(df))*100}
      """)


      Before cleaning size -> 4335272
      After cleaning size -> 3170720 

      Percent data left -> 73.13774083840644
      


### Exporting Data

In [28]:
# Persist the cleaned frame as CSV; the index is not written to the file.
cleaned_df.to_csv(
    path_or_buf='../data/processed/cleaned_data.csv',
    index=False,
)