In [None]:
# Loading the saved CSV files from data.ipynb into pandas dataframes
import pandas as pd
raw_data_path = 'data/raw_data.csv'
incremental_data_path = 'data/incremental_data.csv'
df_raw = pd.read_csv(raw_data_path, encoding='ISO-8859-1')
df_incremental = pd.read_csv(incremental_data_path, encoding='ISO-8859-1')
df_raw.head(), df_incremental.head()

# Display head, info and describe of the raw_data and incremental_data dataframes
df_raw.info(), df_raw.describe(), df_incremental.info(), df_incremental.describe()

# Identifying and discussine various quality data issues in raw_data and incremental_data dataframes
# 1. Missing Values
missing_values_raw = df_raw.isnull().sum()
missing_values_incremental = df_incremental.isnull().sum()
print("Missing Values in Raw Data:\n", missing_values_raw)
print("Missing Values in Incremental Data:\n", missing_values_incremental)

# 2. Duplicates
duplicates_raw = df_raw.duplicated().sum()
duplicates_incremental = df_incremental.duplicated().sum()
print("Duplicates in Raw Data:", duplicates_raw)
print("Duplicates in Incremental Data:", duplicates_incremental)

# 3. Inconsistent Data Types
data_types_raw = df_raw.dtypes
data_types_incremental = df_incremental.dtypes
print("Data Types in Raw Data:\n", data_types_raw)
print("Data Types in Incremental Data:\n", data_types_incremental)

# 4. Outliers
import numpy as np
numeric_cols_raw = df_raw.select_dtypes(include=[np.number]).columns
outliers_raw = {col: df_raw[(df_raw[col] < df_raw[col].quantile(0.01)) | (df_raw[col] > df_raw[col].quantile(0.99))] for col in numeric_cols_raw}
numeric_cols_incremental = df_incremental.select_dtypes(include=[np.number]).columns
outliers_incremental = {col: df_incremental[(df_incremental[col] < df_incremental[col].quantile(0.01)) | (df_incremental[col] > df_incremental[col].quantile(0.99))] for col in numeric_cols_incremental}
print("Outliers in Raw Data:\n", outliers_raw)
print("Outliers in Incremental Data:\n", outliers_incremental)

# Summarizing the findings
print("Summary of Data Quality Issues:")
print("Raw Data - Missing Values:\n", missing_values_raw[missing_values_raw > 0])
print("Raw Data - Duplicates:", duplicates_raw)
print("Incremental Data - Missing Values:\n", missing_values_incremental[missing_values_incremental > 0])
print("Incremental Data - Duplicates:", duplicates_incremental)

# Cleaning the raw_data and incremental_data dataframes based on the identified quality issues
# 1. Handling Missing Values - Filling with mean for numeric columns and mode for categorical columns
for col in df_raw.columns:
    if df_raw[col].dtype in [np.float64, np.int64]:
        df_raw[col].fillna(df_raw[col].mean(), inplace=True)
    else:
        df_raw[col].fillna(df_raw[col].mode()[0], inplace=True)
for col in df_incremental.columns:
    if df_incremental[col].dtype in [np.float64, np.int64]:
        df_incremental[col].fillna(df_incremental[col].mean(), inplace=True)
    else:
        df_incremental[col].fillna(df_incremental[col].mode()[0], inplace=True)
# 2. Removing Duplicates
df_raw.drop_duplicates(inplace=True)
df_incremental.drop_duplicates(inplace=True)
# 3. Ensuring Consistent Data Types
for col in df_raw.columns:
    if col in df_incremental.columns:
        df_incremental[col] = df_incremental[col].astype(df_raw[col].dtype)
# 4. Handling Outliers - Capping at 1st and 99th percentiles
for col in numeric_cols_raw:
    lower_bound = df_raw[col].quantile(0.01)
    upper_bound = df_raw[col].quantile(0.99)
    df_raw[col] = np.where(df_raw[col] < lower_bound, lower_bound, df_raw[col])
    df_raw[col] = np.where(df_raw[col] > upper_bound, upper_bound, df_raw[col])
for col in numeric_cols_incremental:
    lower_bound = df_incremental[col].quantile(0.01)
    upper_bound = df_incremental[col].quantile(0.99)
    df_incremental[col] = np.where(df_incremental[col] < lower_bound, lower_bound, df_incremental[col])
    df_incremental[col] = np.where(df_incremental[col] > upper_bound, upper_bound, df_incremental[col])
# Verifying the cleaning process
df_raw.info(), df_incremental.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89884 entries, 0 to 89883
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    89884 non-null  object 
 1   StockCode    89884 non-null  object 
 2   Description  89803 non-null  object 
 3   Quantity     89884 non-null  int64  
 4   InvoiceDate  89884 non-null  obj

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_raw[col].fillna(df_raw[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_raw[col].fillna(df_raw[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are

<class 'pandas.core.frame.DataFrame'>
Index: 536641 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    536641 non-null  object 
 1   StockCode    536641 non-null  object 
 2   Description  536641 non-null  object 
 3   Quantity     536641 non-null  float64
 4   InvoiceDate  536641 non-null  object 
 5   UnitPrice    536641 non-null  float64
 6   CustomerID   536641 non-null  float64
 7   Country      536641 non-null  object 
dtypes: float64(3), object(5)
memory usage: 36.8+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 88567 entries, 0 to 89883
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    88567 non-null  object 
 1   StockCode    88567 non-null  object 
 2   Description  88567 non-null  object 
 3   Quantity     88567 non-null  float64
 4   InvoiceDate  88567 non-null  object 
 5   UnitPrice 

(None, None)

In [13]:
# Merging the cleaned raw_data and cleaned_incremental_data dataframes into a single dataframe if relevant (e.g., append new records from incremental)
df_merged = pd.concat([df_raw, df_incremental], ignore_index=True)
df_merged.info()
# Saving the cleaned and merged dataframe to ET_Exam_Allan_095/data directory
merged_data_path = 'data/merged_cleaned_data.csv'
df_merged.to_csv(merged_data_path, index=False, encoding='ISO-8859-1')
print("Merged cleaned dataframe saved to:", merged_data_path)

# Creating cleaned folder for the cleaned dataframes and saving it as cleaned_full and cleaned_incremental in ET_Exam_Allan_095/data directory
import os  
cleaned_data_path = 'data/cleaned'
os.makedirs(cleaned_data_path, exist_ok=True)
cleaned_full_path = os.path.join(cleaned_data_path, 'cleaned_full.csv')
cleaned_incremental_path = os.path.join(cleaned_data_path, 'cleaned_incremental.csv')
df_raw.to_csv(cleaned_full_path, index=False, encoding='ISO-8859-1')
df_incremental.to_csv(cleaned_incremental_path, index=False, encoding='ISO-8859-1')
print("Cleaned dataframes saved to:", cleaned_full_path, "and", cleaned_incremental_path)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625208 entries, 0 to 625207
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    625208 non-null  object 
 1   StockCode    625208 non-null  object 
 2   Description  625208 non-null  object 
 3   Quantity     625208 non-null  float64
 4   InvoiceDate  625208 non-null  object 
 5   UnitPrice    625208 non-null  float64
 6   CustomerID   625208 non-null  float64
 7   Country      625208 non-null  object 
dtypes: float64(3), object(5)
memory usage: 38.2+ MB
Merged cleaned dataframe saved to: data/merged_cleaned_data.csv
Cleaned dataframes saved to: data/cleaned\cleaned_full.csv and data/cleaned\cleaned_incremental.csv
