# Imports

In [107]:
from datetime import datetime
from pathlib import Path

import pandas as pd

# Parameters
The following cell contains the variables that drive execution and make the notebook easier to reuse with future data sets.

In [108]:
# Source file: directory first, then the file name inside it
strFilePath = "../Data/"
strFileName = "Project_1_Data_File.csv"

# Columns whose values are parsed and standardized as dates
lstDateColNames = ["Date"]

# Columns whose values are cleaned and converted to doubles
lstDoubleColNames = ["Transaction Amount"]

# Per-column lookup tables: raw value -> standardized value.
# Values that are not listed for a column are left as they are.
dictColMappingData = {
    "Payment Method": {
        "Cash": "Cash",
        "CC": "Credit Card",
        "PayPal": "PayPal",
        "PP": "PayPal",
        "Credit Card": "Credit Card",
    },
}

In [109]:
# Echo the configured value mappings so they are visible in the notebook output
print(dictColMappingData)

{'Payment Method': {'Cash': 'Cash', 'CC': 'Credit Card', 'PayPal': 'PayPal', 'PP': 'PayPal', 'Credit Card': 'Credit Card'}}


# Step 1
Data Ingestion
- Choose the correct source for the provided sample dataset
- Write code to fetch and load the raw data into a data structure

In [110]:
# Join the directory and file name with pathlib so the result is correct whether or
# not strFilePath ends with a separator. Unlike manual slash handling, an empty
# strFilePath yields a path relative to the working directory instead of "/<file>".
strFullFilePath = str(Path(strFilePath) / strFileName)

# Load the raw CSV using pyarrow-backed dtypes
dfImport = pd.read_csv(strFullFilePath, dtype_backend="pyarrow")

# Step 2
Data Assessment and Exploration
- Examine the dataset to identify missing values, outliers, and data quality issues
- To understand the data, generate summary statistics (mean, median, mode, etc)

In [160]:
# Inspect the raw import: the column dtypes first, then the frame itself
# (pandas truncates the middle rows in the rendered output), and finally
# summary statistics for every column regardless of dtype.
print(dfImport.dtypes)
display(dfImport)
dfImport.describe(include="all")

Name                  string[pyarrow]
Email                 string[pyarrow]
Date                  string[pyarrow]
Transaction Amount    string[pyarrow]
Payment Method        string[pyarrow]
Address               string[pyarrow]
Product Name          string[pyarrow]
dtype: object


Unnamed: 0,Name,Email,Date,Transaction Amount,Payment Method,Address,Product Name
0,Amy Frye,fusce.diam@hotmail.net,11/25/2022,17.52,Credit Card,"373-5327 Vulputate, Av.",Wiberg Cure
1,Travis Tyler,phasellus.libero.mauris@outlook.net,05/17/2023,16.54,Cash,"P.O. Box 699, 2987 Lacus. St.",Lettuce - Sea / Sea Asparagus
2,Linda Herrera,pede@hotmail.com,08/25/2017,16.6,Credit Card,Ap #930-3501 Nulla. St.,Mangostein
3,Cody Dotson,ac@google.ca,12/15/2018,13.48,Credit Card,"P.O. Box 713, 5080 Lacinia Rd.",Bulgar
4,Aretha Wilkerson,sit@icloud.net,04/24/2024,14.39,Cash,Ap #839-4960 Ornare Rd.,Cocoa Powder - Dutched
...,...,...,...,...,...,...,...
995,Eliana Thornton,nullam.suscipit.est@outlook.edu,07/31/2018,12.24,Credit Card,5365 Augue St.,Veal - Osso Bucco
996,Drake Luna,gravida.mauris@google.com,02/19/2022,15.18,Cash,642-5311 Nunc Road,Muffin Hinge Container 6
997,Hilda Hall,aliquam.erat.volutpat@google.com,04/28/2022,20.12,Credit Card,652-9803 Velit Av.,Chutney Sauce
998,September Townsend,vitae.sodales.at@hotmail.org,06/07/2017,15.74,Cash,"918-6561 Ultrices, Rd.","Jam - Blackberry, 20 Ml Jar"


Unnamed: 0,Name,Email,Date,Transaction Amount,Payment Method,Address,Product Name
count,1000,1000,995,995.0,1000,1000,1000
unique,1000,974,857,608.0,3,1000,829
top,Amy Frye,Not Provided,04/04/2022,13.84,Credit Card,"373-5327 Vulputate, Av.",Arizona - Green Tea
freq,1,18,4,6.0,716,1,4


At this point, our ability to analyze the data is hampered because the bad values cause the Date and Transaction Amount columns to be imported as strings. We'll proceed to Step 3 (cleansing) first, and then return to the summary statistics (mean, median, mode).

# Step 3
Data Cleaning
- Handle missing values: replace or remove them based on the context
- Address outliers or incorrect values
- Standardize data types for consistency
- Clean and format text or categorical data as necessary

### Bad Data Checks
Reviewing the file in Excel found the following:
- 5 rows with blank dates (those rows will need to be excluded from any date-sensitive calculations)
- multiple date formats (standardize these)
- 7 rows with a value of "Missing" or blank in the Transaction Amount column (exclude these rows entirely)

### Reusable functions
The next cell defines reusable functions for cleaning columns of a given type. The columns each function is applied to are listed in the Parameters cell at the top.

In [146]:
# Define reusable functions that can be used on a column.
def clean_date(strInputDate, lstValidFormats=("%m/%d/%Y", "%m-%d-%Y", "%Y-%m-%d")):
    """Standardize a date string to the %m/%d/%Y format.

    Parameters:
        strInputDate: the raw value to parse. Anything that is not a str
            (None, a missing-value marker, a number, ...) is treated as
            unparseable.
        lstValidFormats: strptime patterns to try, in order. The first one
            that matches wins.

    Returns:
        The date formatted as %m/%d/%Y, or None when the value is not a
        string or matches none of the patterns.
    """
    # Missing values arrive as non-strings; there is nothing to parse
    if not isinstance(strInputDate, str):
        return None

    # Surrounding whitespace would otherwise make every pattern fail
    strCandidate = strInputDate.strip()

    for fmt in lstValidFormats:
        try:
            cleandate = datetime.strptime(strCandidate, fmt)
        except ValueError:
            # This pattern did not match; try the next one
            continue
        # If we were able to parse the date, return the standardized %m/%d/%Y format
        return cleandate.strftime("%m/%d/%Y")

    # If we've made it here, none of the date patterns matched our data
    return None

def clean_double(strInputDouble):
    """Reduce a raw numeric string to the characters that can form a float.

    Every character other than digits, '.', 'e', 'E', '-' and '+' is removed
    (currency symbols, thousands separators, spaces, letters, ...). The
    remainder is then checked with float() so that leftover fragments such as
    "ee" (from "Free") or "" (from "Missing" or a blank) are reported as
    missing instead of being passed on as if they were numbers.

    Parameters:
        strInputDouble: the raw value to clean. Anything that is not a str is
            treated as missing.

    Returns:
        The cleaned numeric string, or None when the input is not a string or
        the cleaned text is not a valid float.
    """
    if not isinstance(strInputDouble, str):
        return None

    strAllowedChars = "0123456789.eE-+"
    strCleaned = "".join(ch for ch in strInputDouble if ch in strAllowedChars)

    try:
        # Validation only; the string form is returned so the caller controls the final dtype
        float(strCleaned)
    except ValueError:
        return None

    return strCleaned

def standardize_string_columns(strColName, strInputValue, dictMapping=None):
    """Map a raw categorical value to its standardized form.

    Parameters:
        strColName: name of the column the value belongs to; selects which
            lookup table is used.
        strInputValue: the raw value to standardize.
        dictMapping: optional {column name: {raw value: standard value}}
            dictionary. When omitted, the notebook-level dictColMappingData
            is used.

    Returns:
        The mapped value, or strInputValue unchanged when the column has no
        lookup table, the value is not listed in it, or the value cannot be
        used as a dictionary key.
    """
    if dictMapping is None:
        dictMapping = dictColMappingData

    try:
        return dictMapping[strColName][strInputValue]
    except (KeyError, TypeError):
        # KeyError: unknown column or unlisted value; TypeError: unhashable value
        return strInputValue

### Apply the cleansing functions defined above

In [147]:
# Apply the data cleansing functions one whole column at a time with Series.map
# rather than reading and writing each cell through .loc. The mapped result is cast
# back to the column's original dtype so later cells see the same column types.
for c in lstDateColNames:
    dfImport[c] = dfImport[c].map(clean_date).astype(dfImport[c].dtype)

for c in lstDoubleColNames:
    dfImport[c] = dfImport[c].map(clean_double).astype(dfImport[c].dtype)

for c in dictColMappingData:
    # Bind the column name as a default argument so each lambda uses its own column
    dfImport[c] = (
        dfImport[c]
        .map(lambda v, strColName=c: standardize_string_columns(strColName, v))
        .astype(dfImport[c].dtype)
    )

In [151]:
# Create a new dataframe that excludes our major data anomalies that we can't correct.
# The row filter is driven by the parameter lists (lstDateColNames / lstDoubleColNames)
# instead of hard-coded column names, so it stays in step with the columns that are
# converted below.
maskBadRows = pd.Series(False, index=dfImport.index)
for d in lstDateColNames:
    # clean_date returns None for blank or unrecognized dates
    maskBadRows = maskBadRows | dfImport[d].isna()
for d in lstDoubleColNames:
    # Missing, blank, or literally "Missing" amounts cannot be converted to a double
    maskBadRows = (
        maskBadRows
        | dfImport[d].isna()
        | (dfImport[d] == "")
        | (dfImport[d].str.upper() == "MISSING")
    )

# .copy() so the dtype conversions below do not write back into dfImport
dfClean = dfImport[~maskBadRows].copy()

print("Before:\n", dfClean.dtypes)

# Now we can apply the proper datatypes to the dataframe
for d in lstDateColNames:
    dfClean[d] = dfClean[d].astype("date32[pyarrow]")
for d in lstDoubleColNames:
    dfClean[d] = dfClean[d].astype("double[pyarrow]")

print("\nAfter:\n", dfClean.dtypes)

display(dfClean)

Before:
 Name                  string[pyarrow]
Email                 string[pyarrow]
Date                  string[pyarrow]
Transaction Amount    string[pyarrow]
Payment Method        string[pyarrow]
Address               string[pyarrow]
Product Name          string[pyarrow]
dtype: object

After:
 Name                       string[pyarrow]
Email                      string[pyarrow]
Date                  date32[day][pyarrow]
Transaction Amount         double[pyarrow]
Payment Method             string[pyarrow]
Address                    string[pyarrow]
Product Name               string[pyarrow]
dtype: object


Unnamed: 0,Name,Email,Date,Transaction Amount,Payment Method,Address,Product Name
0,Amy Frye,fusce.diam@hotmail.net,2022-11-25,17.52,Credit Card,"373-5327 Vulputate, Av.",Wiberg Cure
1,Travis Tyler,phasellus.libero.mauris@outlook.net,2023-05-17,16.54,Cash,"P.O. Box 699, 2987 Lacus. St.",Lettuce - Sea / Sea Asparagus
2,Linda Herrera,pede@hotmail.com,2017-08-25,16.6,Credit Card,Ap #930-3501 Nulla. St.,Mangostein
3,Cody Dotson,ac@google.ca,2018-12-15,13.48,Credit Card,"P.O. Box 713, 5080 Lacinia Rd.",Bulgar
4,Aretha Wilkerson,sit@icloud.net,2024-04-24,14.39,Cash,Ap #839-4960 Ornare Rd.,Cocoa Powder - Dutched
...,...,...,...,...,...,...,...
995,Eliana Thornton,nullam.suscipit.est@outlook.edu,2018-07-31,12.24,Credit Card,5365 Augue St.,Veal - Osso Bucco
996,Drake Luna,gravida.mauris@google.com,2022-02-19,15.18,Cash,642-5311 Nunc Road,Muffin Hinge Container 6
997,Hilda Hall,aliquam.erat.volutpat@google.com,2022-04-28,20.12,Credit Card,652-9803 Velit Av.,Chutney Sauce
998,September Townsend,vitae.sodales.at@hotmail.org,2017-06-07,15.74,Cash,"918-6561 Ultrices, Rd.","Jam - Blackberry, 20 Ml Jar"


In [158]:
# Summary statistics on the cleaned, typed data: the date and amount columns now
# report mean, min, quartiles and max instead of string counts.
# NOTE(review): the recorded output shows a minimum Transaction Amount of -12.0 and a
# maximum of 79.6 - confirm whether these are outliers that Step 3 should address.
dfClean.describe(include="all")

Unnamed: 0,Name,Email,Date,Transaction Amount,Payment Method,Address,Product Name
count,989,989,989,989.0,989,989,989
unique,989,963,,,3,989,822
top,Amy Frye,Not Provided,,,Credit Card,"373-5327 Vulputate, Av.",Arizona - Green Tea
freq,1,18,,,707,1,4
mean,,,2020-03-29,15.308372,,,
min,,,2015-09-04,-12.0,,,
25%,,,2018-02-03,13.56,,,
50%,,,2020-03-30,15.1,,,
75%,,,2022-06-09,16.78,,,
max,,,2024-08-19,79.6,,,
