# Imports

In [17]:
import pandas as pd
from datetime import datetime

# Parameters
The following cell contains variables that are used to drive execution and improve reusability with future data sets

In [36]:
# File Location
strFilePath = "../Data/"
strFileName = "Project_1_Data_File.csv"

# Columns to treat as dates when importing
lstDateColNames = ["Date"]

# Step 1
Data Ingestion
- Choose the correct source for the provided sample dataset
- Write code to fetch and load the raw data into a data structure

In [37]:
# Append the file path and file name into a single variable, ensuring we handle the case where the user did not finish the path with a slash
strFullFilePath = strFilePath + ("" if strFilePath[-1:] == "/" else "/") + strFileName

dfImport = pd.read_csv(strFullFilePath, dtype_backend="pyarrow")

# Step 2
Data Assessment and Exploration
- Examine the dataset to identify missing values, outliers, and data quality issues
- To understand the data, generate summary statistics (mean, median, mode, etc)

In [38]:
# Ensure we can look at the full dataset
#pd.set_option('display.max_rows', 10)
print(dfImport.dtypes)
dfImport.head(50)

Name                  string[pyarrow]
Email                 string[pyarrow]
Date                  string[pyarrow]
Transaction Amount    string[pyarrow]
Payment Method        string[pyarrow]
Address               string[pyarrow]
Product Name          string[pyarrow]
dtype: object


Unnamed: 0,Name,Email,Date,Transaction Amount,Payment Method,Address,Product Name
0,Amy Frye,fusce.diam@hotmail.net,11/25/2022,17.52,Credit Card,"373-5327 Vulputate, Av.",Wiberg Cure
1,Travis Tyler,phasellus.libero.mauris@outlook.net,5/17/2023,16.54,Cash,"P.O. Box 699, 2987 Lacus. St.",Lettuce - Sea / Sea Asparagus
2,Linda Herrera,pede@hotmail.com,8/25/2017,16.6,Credit Card,Ap #930-3501 Nulla. St.,Mangostein
3,Cody Dotson,ac@google.ca,12/15/2018,13.48,CC,"P.O. Box 713, 5080 Lacinia Rd.",Bulgar
4,Aretha Wilkerson,sit@icloud.net,4/24/2024,14.39,Cash,Ap #839-4960 Ornare Rd.,Cocoa Powder - Dutched
...,...,...,...,...,...,...,...
45,Lee Faulkner,id.sapien.cras@hotmail.ca,9/25/2019,13.96,Credit Card,Ap #727-3228 Est St.,Table Cloth 144x90 White
46,Vance Chang,nulla@hotmail.couk,4/11/2016,13.42,Credit Card,172-2326 At Avenue,Dates
47,Garrett Williams,quis.turpis@aol.org,6/27/2017,13.63,PayPal,902-2208 Venenatis St.,Soup - Cream Of Broccoli
48,Bernard Lott,et.magnis@aol.couk,5/29/2018,15.28,Credit Card,924-4446 Sem St.,Ham Black Forest


# Step 3
Data Cleaning
- Handle missing values: replace or remove them based on the context
- Address outliers or incorrect values
- Standardize data types for consistency
- Clean and format text or categorical data as necessary

In [32]:
# Define reusable functions that can be used on a column.
def clean_date(strInputDate):
    lstValidFormats = ["%m/%d/%y", "%m-%d-%y", "%Y-%d-%m"]

    for fmt in lstValidFormats:
        try:
            cleandate = datetime.strftime(strInputDate, fmt)
            # If we were able to parse the date, return the standardized %m/%d/%Y format
            return cleandate.strftime("%m/%d/%Y")
        except:
            return strInputDate + "***"

### Apply the cleansing functions defined above

In [40]:
for c in lstDateColNames:
    print(type(c))
    strC = c + "Clean"
    dfImport[strC] = dfImport[c].apply(clean_date)

<class 'str'>


In [42]:
display(dfImport[dfImport["Date"].str != dfImport["DateClean"].str])

KeyError: True