In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the raw input CSV, relative to the notebook's working directory.
data_path: str = '../data/data.csv'

In [3]:
# Read the raw CSV. The two identifier columns are forced to text (str) and
# InvoiceDate is converted from its on-disk representation to datetimes.
id_dtypes = {'CustomerID': str, 'InvoiceNo': str}
df = pd.read_csv(data_path, encoding="ISO-8859-1", dtype=id_dtypes)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Everything reported below describes the frame as loaded, i.e. BEFORE any
# rows are removed at the end of this cell.
n_rows, n_cols = df.shape
print(" SHAPE ".center(70,'-'))
print('Rows: {}'.format(n_rows))
print('Columns: {}'.format(n_cols))

print(" COLUMN NAMES & DATA TYPES ".center(70,'-'))
print(df.dtypes)

# Per-column share of null cells, as a percentage rounded to 2 decimals.
missing_pct = (df.isnull().sum() / len(df) * 100).round(2)
print(" MISSING VALUE PERCENTAGE ".center(70,'-'))
print(missing_pct)

# Rows that exactly repeat an earlier row (all columns equal).
n_duplicates = df.duplicated().sum()
print(" DUPLICATE VALUE PERCENTAGE ".center(70,'-'))
print("Number of duplicates:", n_duplicates)
print('Percentage of duplicates:', round(n_duplicates / len(df) * 100, 2), '%')

# Cleaning: drop every row with a null in ANY column, then drop exact
# duplicate rows. Because dropna() looks at all columns, rows lacking a
# CustomerID (the largest source of nulls in the report above) are removed.
df = df.dropna().drop_duplicates()

------------------------------- SHAPE --------------------------------
Rows: 541909
Columns: 8
--------------------- COLUMN NAMES & DATA TYPES ----------------------
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object
---------------------- MISSING VALUE PERCENTAGE ----------------------
InvoiceNo       0.00
StockCode       0.00
Description     0.27
Quantity        0.00
InvoiceDate     0.00
UnitPrice       0.00
CustomerID     24.93
Country         0.00
dtype: float64
--------------------- DUPLICATE VALUE PERCENTAGE ---------------------
Number of duplicates: 5268
Percentage of duplicates: 0.97 %


In [4]:
# Persist the cleaned frame as CSV; index=False keeps the row index out of
# the written file.
processed_path = '../data/processed_data.csv'
df.to_csv(processed_path, index=False)