# Cleansing customer data

In [1]:
import csv

import json

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows in a pandas dataframe.

    The query is executed on the module-level ``connection`` object.

    Parameters
    ----------
    query : str
        SQL text passed unchanged to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool
        When true, ``connection.rollback()`` is called after the query runs.

    Returns
    -------
    pandas.DataFrame
        The query result. Every ``float64`` column whose non-null values are
        all whole numbers that fit in a signed 64-bit integer is converted to
        the nullable ``Int64`` dtype (a column holding only nulls is converted
        as well). All other columns are returned as pandas produced them.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    # Half-open range [-2**63, 2**63) of values representable as int64.
    # Infinite values fall outside it, so they are never converted.
    int64_lower = -(2.0 ** 63)
    int64_upper = 2.0 ** 63

    for column in df:

        if df[column].dtype != "float64":
            continue

        # Nulls are ignored by the check; they become <NA> after conversion.
        values = df[column].dropna().to_numpy()

        is_whole = values == np.floor(values)
        fits_int64 = (values >= int64_lower) & (values < int64_upper)

        # np.all of an empty array is True, so an all-null column converts too.
        if np.all(is_whole & fits_int64):
            df[column] = df[column].astype("Int64")

    return df
    

In [3]:
# Open the PostgreSQL connection that the query helper in this notebook
# reads as a module-level name. No password argument is supplied here.
connection = psycopg2.connect(
    host="postgres",
    port="5432",
    database="postgres",
    user="postgres",
)

In [4]:
# Open a cursor on the shared connection.
# NOTE(review): the query cells visible in this section go through
# pd.read_sql_query and do not reference this cursor — confirm it is used elsewhere.
cursor = connection.cursor()

# Validate the city, state, and zip for stage_1_peak_customers against the zip_codes table

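AGM does not want to give its customer list to third-party sales channels, including Peak Delivery.  For that reason, we can expect some variation in customer first and last names, and in the street.  However, the city, state, and zip should be validated by Peak's system, so we do not anticipate any issues.  The query below lists the stage_id of every record whose (city, state, zip) combination is not found in the zip_codes table; an empty result means every record passed validation.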

In [6]:
# List every staged Peak customer whose (city, state, zip) combination has no
# exact match in zip_codes. An empty result means every record validated.
rollback_before_flag = True
rollback_after_flag = True

# NOT EXISTS is used instead of NOT IN: if the subquery of a NOT IN returns a
# row containing NULL, the predicate evaluates to NULL rather than true, and
# invalid records would silently drop out of this validation result.
query = """
select s.stage_id
from stage_1_peak_customers s
where not exists (
    select 1
    from zip_codes z
    where z.city = s.city
      and z.state = s.state
      and z.zip = s.zip
)
order by s.stage_id
"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id


# Find all customer records in stage_1_peak_customers where any of first_name, last_name, and/or street do not match a customer in the customers table

AGM does not want to give its customer list to 3rd party sales channels, including Peak Delivery.  For that reason, we can expect some variation in customer first and last names, and in the street.

In [11]:
# List every staged Peak customer record for which no row in customers has the
# same first_name, last_name and street.
rollback_before_flag = True
rollback_after_flag = True

# NOT EXISTS is used instead of NOT IN: if the subquery of a NOT IN returns a
# row containing NULL, the predicate evaluates to NULL rather than true, and
# unmatched records would silently drop out of the result.
query = """
select s.*
from stage_1_peak_customers s
where not exists (
    select 1
    from customers c
    where c.first_name = s.first_name
      and c.last_name = s.last_name
      and c.street = s.street
)
order by s.stage_id
"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,stage_id,sale_id,customer_id,first_name,last_name,street,city,state,zip
0,10,5763728768,3729016,Hyrum,Knuckles,86668 Spenser Terrace,Oakland,CA,94618
1,20,5763728877,3728936,Roseann,Coyish,11707 American Ash Ter,Orinda,CA,94563
2,24,5763728428,3729287,Hali,Ducker,8 Orion Pass,El Cerrito,CA,94530
3,26,5763728393,3728674,Melantha,Golborn,6140 North Field Alley,Orinda,CA,94563
4,36,5763729212,3729191,Eleni,Jansen,66 Bartelt Hill,Oakland,CA,94607
5,40,5763729129,3728856,Clyve,Humonds,22 Brent Wood Hill,Berkeley,CA,94709
6,51,5763728864,3729178,Rutledge,Hellwing,606 Gulf Plz,El Cerrito,CA,94530
7,60,5763729313,3728402,Kalli,Kemel,18373 Golf View Pass,Berkeley,CA,94702
8,72,5763728980,3729213,Honina,Philson,28 Clarendon Plaza,Berkeley,CA,94702
9,73,5763728921,3729194,Nicky,Haley,88424 Warrior Lane,Oakland,CA,94602


# Find the percentage of Peak's customer records that do not match to AGM's customers table

Write a query to find the percentage of Peak's customer records that do not match AGM's customers table. The percentage is the number of customer records in stage_1_peak_customers that do not match, divided by the total number of customer records in stage_1_peak_customers, multiplied by 100.

In [78]:
# Compute how many staged Peak customer records match a row in customers on
# (first_name, last_name, street), how many do not, and the non-matching share
# as a percentage rounded to one decimal place.
rollback_before_flag = True
rollback_after_flag = True

# tc and tm each return exactly one row, so the cross join yields one row.
# nullif(..., 0) makes the percentage NULL instead of raising a
# division-by-zero error when stage_1_peak_customers is empty.
query = """
with tc as (
    select count(*) as total_peak_customers
    from stage_1_peak_customers
),
tm as (
    select count(*) as total_matching_customers
    from stage_1_peak_customers
    where (first_name, last_name, street)
        in (select first_name, last_name, street from customers)
)
select
    total_peak_customers,
    total_matching_customers,
    total_peak_customers - total_matching_customers as total_not_matching_customers,
    round(
        (total_peak_customers - total_matching_customers) * 100.0
            / nullif(total_peak_customers, 0),
        1
    ) as percent_not_matching_customers
from tc
cross join tm
"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,total_peak_customers,total_matching_customers,total_not_matching_customers,percent_not_matching_customers
0,97,84,13,13.4
