In [1]:
import numpy as np
import pandas as pd
import matplotlib
import math
import requests
import datetime
import regex as re


### Methods for loading the excel sheet into a DataFrame

In [47]:
# Excel workbook that every load_* helper in this notebook reads from (relative path).
PATH = "KPMG_VI_New_raw_data_update_final.xlsx"

def load_address():
    """Read the "CustomerAddress" sheet of the workbook at PATH into a DataFrame.

    The sheet is parsed with row 1 (0-indexed) as the header row, and the
    resulting columns are relabelled with the names listed below.
    """
    address_columns = [
        "Customer ID", "Address", "Postcode",
        "State", "Country", "Property Valuation",
    ]
    sheet = pd.read_excel(
        PATH,
        sheet_name="CustomerAddress",
        header=1,
        names=address_columns,
    )
    return sheet

def load_demograph():
    """Read the "CustomerDemographic" sheet of the workbook at PATH into a DataFrame.

    The sheet is parsed with row 2 (0-indexed) as the header row, and the
    resulting columns are relabelled with the names listed below.
    """
    demographic_columns = [
        "Customer ID", "First Name", "Last Name", "Gender",
        '3YR Bike Related Purchases', "Date of Birth",
        "Job Title", "Job Industry", "Wealth Segament",
        "Deceased", "Default", "Owns Car", "Tenure",
    ]
    sheet = pd.read_excel(
        PATH,
        sheet_name="CustomerDemographic",
        header=2,
        names=demographic_columns,
    )
    return sheet

def load_new_cust_list():
    """Read the "NewCustomerList" sheet of the workbook at PATH into a DataFrame.

    Columns whose header begins with "Unnamed" are dropped, and the remaining
    columns are renamed positionally to the names listed below.
    """
    target_names = [
        "First Name", "Last Name", "Gender",
        "3YR Bike Related Purchases", "Date of Birth",
        "Job Title", "Job Industry", "Wealth Segament",
        "Deceased", "Owns Car", "Tenure",
        "Address", "Postcode", "State", "Country",
        "Property Valuation", "Rank", "Value",
    ]
    raw = pd.read_excel(PATH, sheet_name="NewCustomerList", header=1, index_col=False)

    # Headers matching "Unnamed" at their start are removed in a single drop.
    unnamed_headers = raw.loc[:, raw.columns.str.match("Unnamed")].columns
    trimmed = raw.drop(columns=list(unnamed_headers))

    # Old and new names are paired by position; zip stops at the shorter sequence.
    name_map = dict(zip(trimmed.columns, target_names))
    return trimmed.rename(columns=name_map)
    


def load_transactions():
    """Read the "transactions" sheet of the workbook at PATH into a DataFrame.

    The sheet is parsed with row 1 (0-indexed) as the header row, and the
    resulting columns are relabelled with the names listed below.
    """
    transaction_columns = [
        "Transaction ID", "Product ID", "Customer ID",
        "Transaction Date", "Online Order", "Order Status",
        "Brand", "Product Line", "Product Size",
        "List Price", "Standard Cost", "Product First Sold Date",
    ]
    sheet = pd.read_excel(
        PATH,
        sheet_name="transactions",
        header=1,
        names=transaction_columns,
    )
    return sheet

In [48]:
# Load the CustomerAddress sheet and replace the spelled-out state names with
# their abbreviations.
address = load_address().replace(
    to_replace=['New South Wales', 'Victoria'],
    value=['NSW', 'VIC'],
)

# Per column: does it contain any null value?
address.isnull().any()

Customer ID           False
Address               False
Postcode              False
State                 False
Country               False
Property Valuation    False
dtype: bool

## Methods for checking Customer Address

### Checking Customer ID 

In [49]:
def check_CID_duplicates(df):
    """Report Customer IDs that occur more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Customer ID' column.

    Returns
    -------
    list
        The 'Customer ID' value of every row that repeats an earlier row's ID
        (the first occurrence of each ID is not listed). Empty when every ID
        is unique.
    """
    print("Checking duplicates")
    # duplicated() marks the second and later occurrences of each ID.
    duplicate_mask = df.duplicated(subset='Customer ID')
    # Select through the boolean mask so the result does not depend on the
    # frame having a default 0..n-1 index.
    retList = df.loc[duplicate_mask, 'Customer ID'].tolist()

    if not retList:
        print("No Duplicates Found")
    for CID in retList:
        print("Duplicates found - ", CID)
    return retList

#Checking for:
    # - Customer ID < 0
    # - NaN values
def check_invalid_ID(df):
    """Collect the 'Customer ID' values that are negative or NaN.

    Each offending value is printed as it is found; if none are found a
    single confirmation line is printed instead.

    Returns
    -------
    list
        The invalid 'Customer ID' values, in row order.
    """
    print("Checking invalidID")
    flagged = []
    for row_label in df.index:
        customer_id = df.loc[row_label, 'Customer ID']
        if not (customer_id < 0 or math.isnan(customer_id)):
            continue
        print("Invalid Customer ID found - ", customer_id)
        flagged.append(customer_id)

    if len(flagged) == 0:
        print("No invalid IDs found")

    return flagged

### Checking Address

In [50]:
#Checking to see if all the addresses are in Australia
def check_country(df):
    """Collect the Customer IDs whose 'Country' value is not 'Australia'.

    When every distinct country value equals 'Australia' the row scan is
    skipped entirely and an empty list is returned.

    Returns
    -------
    list
        'Customer ID' values of the rows located outside Australia.
    """
    print("Checking country - Assuming all addresses should be in Australia")
    offenders = []
    distinct_countries = df['Country'].unique().tolist()

    if all(name == 'Australia' for name in distinct_countries):
        print("All addresses in Australia")
        return offenders

    for row in range(len(df)):
        row_country = df.loc[row, 'Country']
        customer_id = df.loc[row, 'Customer ID']
        if row_country == 'Australia':
            continue
        print("Address not in Australia | Index:", customer_id)
        offenders.append(customer_id)
    return offenders



def alt_check_address(df):
    """Flag rows whose 'Address' is missing or contains a forbidden symbol.

    An address is invalid when it is not a string (e.g. NaN/None) or when it
    contains any of the characters ``@_!#$%^&*()<>?/|}{~:``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Address' column.

    Returns
    -------
    list
        Index labels of the invalid rows, in row order.
    """
    # Raw string: the pattern contains a backslash that is not a valid Python
    # string escape, so a plain literal triggers an invalid-escape warning.
    forbidden = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    retList = []
    print("Checking Addresses")
    # Iterate label/value pairs so frames without a 0..n-1 index also work.
    for label, address in df['Address'].items():
        if not isinstance(address, str):
            print("No Address found | Index:", label)
            retList.append(label)
        elif forbidden.search(address) is not None:
            # The row label is printed here; the message previously ended
            # at "Index:" with no value.
            print("Invalid values found in address | Index:", label)
            retList.append(label)
    if not retList:
        print("All Addresses Valid")
    return retList
    


def post_code_range_check(df):
    """Check every row's 'Postcode' against the ranges allowed for its 'State'.

    Ranges are defined for 'NSW', 'QLD' and 'VIC' only. A row with any other
    state value (including a missing one) is reported and flagged rather than
    raising KeyError, so the scan always reaches the end of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Customer ID', 'State' and 'Postcode' columns.

    Returns
    -------
    list
        'Customer ID' of every row whose postcode is outside its state's
        ranges or whose state has no ranges defined.
    """
    # Inclusive (low, high) postcode bounds per state; built once, not per row.
    state_ranges = {
        'NSW': ((1000, 1999), (2000, 2599), (2619, 2899), (2921, 2999)),
        'QLD': ((4000, 4999), (9000, 9999)),
        'VIC': ((3000, 3999), (8000, 8999)),
    }
    retList = []
    print("Checking Post Codes")
    for CID, state, post_code in zip(df['Customer ID'], df['State'], df['Postcode']):
        bounds = state_ranges.get(state)
        if bounds is None:
            print("Unknown state", repr(state), "| Customer ID:", CID)
            retList.append(CID)
            continue
        # A NaN postcode fails every comparison and is therefore flagged too.
        if not any(low <= post_code <= high for low, high in bounds):
            print("Postcode not in range | Customer ID:", CID)
            retList.append(CID)
    if not retList:
        print("All Postcodes are in range")
    return retList

# Inconsistencies found in CustomerAddress:

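- `State` has the following values: ['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'].
    - 'NSW' and 'New South Wales' both refer to the same state.
    - 'VIC' and 'Victoria' both refer to the same state.


### Implications

Any grouping or filtering by `State` treats the full name and the abbreviation as two different states, so per-state results are split. `post_code_range_check` also only has ranges keyed by the abbreviations 'NSW', 'QLD' and 'VIC'.

### Fix

Two possible fixes:

- Replace the full names with their abbreviations ('New South Wales' → 'NSW', 'Victoria' → 'VIC'). This is what the notebook does right after loading the sheet.
- Replace the abbreviations with the full state names. (TODO: confirm this is the second option that was intended.)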

## Methods for checking New Customer List

### Loading the DF

In [51]:
# Load the NewCustomerList sheet; the bare name on the last line makes the
# notebook render the frame as this cell's output.
new_cust = load_new_cust_list()
new_cust

  df = pd.read_excel(PATH, sheet_name="NewCustomerList", header = 1, index_col= False)


Unnamed: 0,First Name,Last Name,Gender,3YR Bike Related Purchases,Date of Birth,Job Title,Job Industry,Wealth Segament,Deceased,Owns Car,Tenure,Address,Postcode,State,Country,Property Valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.718750
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.718750
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.718750
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ferdinand,Romanetti,Male,60,1959-10-07,Paralegal,Financial Services,Affluent Customer,N,No,9,2 Sloan Way,2200,NSW,Australia,7,996,0.374000
996,Burk,Wortley,Male,22,2001-10-17,Senior Sales Associate,Health,Mass Customer,N,No,6,04 Union Crossing,2196,NSW,Australia,10,997,0.357000
997,Melloney,Temby,Female,17,1954-10-05,Budget/Accounting Analyst IV,Financial Services,Affluent Customer,N,Yes,15,33475 Fair Oaks Junction,4702,QLD,Australia,2,997,0.357000
998,Dickie,Cubbini,Male,30,1952-12-17,Financial Advisor,Financial Services,Mass Customer,N,Yes,19,57666 Victoria Way,4215,QLD,Australia,2,997,0.357000


### Check DOB

In [53]:
# Max Age - 82: The average life expectancy in Australia is 82.9, so it is prudent to slightly underestimate it. That way we can confirm that people above that age are still alive
# before we create a marketing plan and start spending resources, especially if the cost per lead is significantly high.
# Min Age - 18
def DOB_check(df, max_age, min_age):
    """Flag customers whose age is outside [min_age, max_age] or whose DOB is unusable.

    Age is computed as the current calendar year minus the birth year. The
    age limits apply only to rows whose 'Deceased' value is 'N'. A row whose
    'Date of Birth' has no usable year (NaT/None, or a value without a
    ``.year`` attribute) is reported as invalid and flagged as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Date of Birth' and 'Deceased' columns.
    max_age : int
        Oldest age, in years, accepted without being flagged.
    min_age : int
        Youngest age, in years, accepted without being flagged.

    Returns
    -------
    list
        Index labels of the flagged rows, in row order.
    """
    print('Checking DOB')
    retList = []
    # The current year does not change during the scan; look it up once.
    curr_year = datetime.date.today().year

    for label, dob, deceased in zip(df.index, df['Date of Birth'], df['Deceased']):
        # NaT exposes .year as NaN; None and plain strings have no .year at all.
        year = getattr(dob, 'year', None)
        if year is None or math.isnan(year):
            print("Invalid year | Index:", label)
            retList.append(label)
            continue

        difference = curr_year - year
        if deceased != 'N':
            continue
        if difference > max_age:
            print(f'Over {max_age} | Current Age:', difference, '| Index:', label)
            retList.append(label)
        elif difference < min_age:
            print(f'Under {min_age} | Current Age:', difference, '| Index:', label)
            retList.append(label)

    if not retList:
        print("All DOBs are valid")
    return retList

def bike_purchase_check(df):
    """Flag rows whose '3YR Bike Related Purchases' value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a '3YR Bike Related Purchases' column.

    Returns
    -------
    list
        Index labels of the rows with a missing (NaN/None) purchase count.
    """
    print("Checking Bike Purchase Hist")
    purchases = df['3YR Bike Related Purchases']
    # Boolean-mask selection works for any index, not only 0..n-1.
    retList = purchases.index[purchases.isna()].tolist()
    for label in retList:
        # Print the row label; the offending value itself is always NaN.
        print("Invalid Purchase Found | Index: ", label)
    if not retList:
        print("Purchases are valid")
    return retList

def gender_check(df):
    """Print the distinct values found in the 'Gender' column.

    Nothing is returned; the list is only printed for manual inspection.
    """
    distinct_genders = df['Gender'].unique().tolist()
    print("All Unique Gender: ", distinct_genders)


def name_check(df):
    """Flag rows whose first and/or last name is missing, blank or has a forbidden symbol.

    A name is valid when it is a string, is not empty after stripping
    whitespace, and contains none of the characters ``@_!#$%^&*()<>?/|}{~:``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'First Name' and 'Last Name' columns.

    Returns
    -------
    list
        Index labels of the rows with at least one invalid name, in row order.
    """
    # Raw string: the pattern contains a backslash that is not a valid Python
    # string escape.
    forbidden = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')

    def _is_valid(value):
        # Non-strings (NaN/None) and whitespace-only strings count as invalid.
        return (
            isinstance(value, str)
            and bool(value.strip())
            and forbidden.search(value) is None
        )

    retList = []
    print("Checking Names")
    # Read from the frame that was passed in, not from a notebook-level variable.
    for label, first, last in zip(df.index, df['First Name'], df['Last Name']):
        first_valid = _is_valid(first)
        last_valid = _is_valid(last)
        if first_valid and last_valid:
            continue
        if not first_valid and not last_valid:
            print("Name invalid | Index: ", label)
        elif not first_valid:
            print("First Name Invalid | Index: ", label)
        else:
            print("Last Name Invalid | Index:", label)
        retList.append(label)

    # Any flagged row, including single-name failures, suppresses the all-clear.
    if not retList:
        print("All Names are Valid")
    return retList

def car_check(df):
    """Flag rows whose 'Owns Car' value is anything other than 'Yes' or 'No'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Owns Car' column.

    Returns
    -------
    list
        Index labels of the rows holding an invalid (or missing) value.
    """
    print("Checking Car Values")
    owns_car = df['Owns Car']
    # isin() is False for NaN/None and for every other value, so one mask
    # covers both missing and unexpected entries. A column holding only 'Yes'
    # (or only 'No') is valid.
    invalid_mask = ~owns_car.isin(['Yes', 'No'])
    retList = owns_car.index[invalid_mask].tolist()

    if not retList:
        print("All values are valid")
    for label in retList:
        print("Invalid Entry | Index: ", label)
    return retList

def check_addr_duplicates(df):
    """Summarise every address that appears on more than one row.

    For each duplicated address, the postcodes, states and property
    valuations of the matching rows are collected, together with flags that
    tell whether all those rows share one postcode and one state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Address', 'Postcode', 'State' and 'Property Valuation'
        columns.

    Returns
    -------
    dict
        Maps each duplicated address (in order of first appearance) to a dict
        with keys 'indices', 'postcodes', 'states', 'valuations',
        'same_postcode' and 'same_state'. Empty when no address repeats.
    """
    addresses = df['Address']
    # Missing addresses are excluded: they cannot be matched with == below.
    dupe_mask = addresses.notna() & addresses.duplicated(keep=False)
    dupe_list = addresses[dupe_mask].unique().tolist()

    report = {}
    for item in dupe_list:
        # Row-wise comparison; the bare expression `'Address' == item` is just
        # the constant False and cannot be used to select rows.
        matches = df[addresses == item]
        list_post_codes = matches['Postcode'].tolist()
        list_states = matches['State'].tolist()
        report[item] = {
            'indices': matches.index.tolist(),
            'postcodes': list_post_codes,
            'states': list_states,
            'valuations': matches['Property Valuation'].tolist(),
            'same_postcode': list_post_codes.count(list_post_codes[0]) == len(list_post_codes),
            'same_state': list_states.count(list_states[0]) == len(list_states),
        }
    return report


# NOTE(review): this binds the DataFrame's `.loc` indexer object itself, not a
# selection of rows. It looks like an unfinished scratch line; confirm or remove.
dupe = new_cust.loc



## Cross Check

In [None]:
def cross_check_address(df1, df2):
    """Return the addresses that appear in both frames.

    NOTE(review): the original body was an unfinished stub (its loop did
    nothing and nothing was returned). It has been completed on the assumption
    that the goal is to find addresses shared between the two frames. Confirm
    this is the intended cross-check.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that each have an 'Address' column.

    Returns
    -------
    list
        Distinct non-missing addresses present in both frames, in the order
        they first appear in ``df1``.
    """
    df2_addresses = set(df2['Address'].dropna().tolist())
    shared = []
    already_reported = set()
    for addr in df1['Address'].dropna().tolist():
        if addr in df2_addresses and addr not in already_reported:
            already_reported.add(addr)
            shared.append(addr)
    return shared
    

#df_diff = pd.concat([address['Address'], new_cust['Address']]).duplicated(keep=False)

# Addresses from the new-customer list as a plain Python list (not referenced
# again in the cells shown here).
df1_list = new_cust['Address'].tolist()
# Despite the name, this is a boolean Series: True for every row of `address`
# whose Address value occurs more than once (keep=False marks all occurrences).
df2_list = address['Address'].duplicated(keep = False)
# Show the rows of `address` that share their Address with at least one other row.
address[df2_list]


Unnamed: 0,Customer ID,Address,Postcode,State,Country,Property Valuation
732,737,3 Talisman Place,4811,QLD,Australia,2
2315,2320,64 Macpherson Junction,2208,NSW,Australia,11
2328,2333,3 Mariners Cove Terrace,3108,VIC,Australia,10
2470,2475,3 Talisman Place,4017,QLD,Australia,5
2980,2985,3 Mariners Cove Terrace,2216,NSW,Australia,10
3535,3540,64 Macpherson Junction,4061,QLD,Australia,8
