# Testing script

Script for testing the classes created for the Multinational Retail Data Centralisation project.

**Note** Update docstrings for classes and methods accordingly as functionality develops. 


## Load in modules of classes

In [1]:
import re

import numpy as np
import pandas as pd

import data_cleaning
import data_extraction
import database_utils

In [2]:
# Create one instance of each project helper class; the cells below reuse these
# three objects (`connector`, `extractor`, `cleaning`) throughout the notebook.
connector = database_utils.DatabaseConnector()
extractor = data_extraction.DataExtractor()
cleaning = data_cleaning.DataCleaning()

Utilise class methods to connect to AWS RDS database and retrieve list of table names from postgres database.

In [None]:
# Read the database credentials, create the engine and list the table names.
# NOTE(review): `creds` is not passed to init_db_engine() and is not used again in
# this notebook - presumably the connector loads the credentials itself; confirm.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

Extract the table data into a pandas DataFrame.

In [None]:
# Read the 'legacy_users' table through the engine created above, then preview
# the first rows as the cell output.
rds_df = extractor.read_rds_table('legacy_users', engine)
rds_df.head()

## Cleaning user data testing

These are test scripts for cleaning up the `phone_number` column.

### Phone number cleaning

This is a bit more complicated than expected

In [None]:
# Replace entries that start with a letter with NaN, since they are not treated as
# phone numbers. Series.str.startswith() compares against its argument literally, so
# 'A-Z|a-z' never matched anything; Series.str.match() applies a regex anchored at the
# start of each string. na=False keeps already-missing values out of the mask, so they
# are carried over unchanged.
rds_df['phone_number'] = np.where(rds_df['phone_number'].str.match(r'[A-Za-z]', na=False), np.nan, rds_df['phone_number'])

# ISD (international dialling) prefix for each country code handled below.
isd_code_map = { "GB": "+44", "DE": "+49", "US": "+1" }

def correct_phone_number(row):
  """Return the row's phone number normalised to start with its country's ISD code.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) providing the keys
      "phone_number" and "country_code".

  Returns:
    The cleaned number as a string. A phone number that is not a string
    (NaN, None, ...) is returned unchanged. When the country code has no
    entry in `isd_code_map`, the number is returned with the unwanted
    characters stripped but without any ISD prefix.
  """
  raw_number = row["phone_number"]

  # Missing values cannot be cleaned; pass them through instead of letting
  # re.sub raise a TypeError and abort the whole DataFrame.apply call.
  if not isinstance(raw_number, str):
    return raw_number

  # Remove special chars other than digits, `+` and letters used for extension
  # e.g. `x`, `ext` (the following keeps all alphabets).
  result = re.sub(r"[^A-Za-z\d\+]", "", raw_number)

  # A country code outside the map has no known prefix; stop after stripping.
  isd_code = isd_code_map.get(row["country_code"])
  if isd_code is None:
    return result

  # Prefix ISD code by matching country code.
  if not result.startswith(isd_code):
    result = isd_code + result

  # Remove the `0` that follows the ISD code. Only the leading occurrence is
  # rewritten, so the same character sequence later in the number is kept.
  trunk_prefix = isd_code + "0"
  if result.startswith(trunk_prefix):
    result = isd_code + result[len(trunk_prefix):]
  return result

# Run correct_phone_number on every row (axis=1) and store the result in a new
# column, leaving the original 'phone_number' column as it was.
rds_df["Corrected Phone Number"] = rds_df.apply(correct_phone_number, axis=1)


In [None]:
#regex_expression = '^(?:(?:\(?(?:0(?:0|11)\)?[\s-]?\(?|\+)44\)?[\s-]?(?:\(?0\)?[\s-]?)?)|(?:\(?0))(?:(?:\d{5}\)?[\s-]?\d{4,5})|(?:\d{4}\)?[\s-]?(?:\d{5}|\d{3}[\s-]?\d{3}))|(?:\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4})|(?:\d{2}\)?[\s-]?\d{4}[\s-]?\d{4}))(?:[\s-]?(?:x|ext\.?|\#)\d{3,4})?$' #Our regular expression to match
#rds_df['phone_number'] = np.where(rds_df['country'] == 'United Kingdom', rds_df['phone_number'].str.match(regex_expression), np.nan)

# Preview of the masking step (the result is displayed only, not assigned): entries
# that start with a letter become NaN. Series.str.startswith() compares literally, so
# the regex-based Series.str.match() is used; na=False leaves missing values as they are.
np.where(rds_df['phone_number'].str.match(r'[A-Za-z]', na=False), np.nan, rds_df['phone_number'])

Cleaning up of User data using pandas. 
These test methods will be finalised and entered into the data_cleaning class methods. 

Check address formatting issues - there seem to be problems with newline characters (`\n`), which may need a regex to fix.



In [None]:
# Clean the user data with the DataCleaning helper.
clean_rds_df = cleaning.clean_user_data(rds_df)

# Inspect a summary and a short preview rather than printing every row: the full
# user table makes the cell output unbounded and writes personal data into the
# saved notebook.
clean_rds_df.info()
clean_rds_df.head()

Upload cleaned up data to sales_database

In [None]:
# Upload the cleaned user data to the 'dim_users' table.
connector.upload_to_db(clean_rds_df, 'dim_users')

## Card Details data

In [None]:
# Extract the card details data. retrieve_pdf_data() is called without arguments,
# so the source location is decided inside data_extraction - presumably a PDF,
# going by the method name; confirm there.
card_df = extractor.retrieve_pdf_data()

In [None]:
# Clean the card details data and display the result as the cell output.
# NOTE(review): `cleaning` was already created near the top of the notebook; this
# line replaces it with a new DataCleaning instance - confirm whether that is needed.
cleaning = data_cleaning.DataCleaning()
clean_card_df = cleaning.clean_card_data(card_df)
clean_card_df

In [None]:
# Upload the cleaned card details to the 'dim_card_details' table in the local database.
connector.upload_to_db(clean_card_df, 'dim_card_details')

## Store details

endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'

header = {'x-api-key':'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'}

In [None]:
# Ask the number_stores endpoint how many stores exist; the second argument is the
# request header carrying the API key.
# NOTE(review): the API key is hardcoded here (and repeated in the markdown above);
# load it from an environment variable or an untracked credentials file before this
# notebook is shared or committed.
num_stores = extractor.list_number_of_stores('https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores',{'x-api-key':'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'} )

In [None]:
# Retrieve the store details, passing the store count from the previous cell and the
# store_details endpoint (note the trailing slash on the URL).
# NOTE(review): no API-key header is passed here, unlike the store-count call -
# presumably retrieve_stores_data supplies it internally; confirm.
stores_df2 = extractor.retrieve_stores_data(num_stores,'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/')

In [None]:
# Take a copy for the cleaning cells below so the retrieved data in `stores_df2`
# stays available under its own name - presumably to avoid repeating the retrieval.
stores_df = stores_df2.copy()

In [None]:
# Test the store-data cleaning method and preview the first 10 rows of the result.
clean_stores_df = cleaning.called_clean_store_data(stores_df)
clean_stores_df.head(10)

In [None]:
# Upload the cleaned store details to the 'dim_store_details' table in the local database.
connector.upload_to_db(clean_stores_df, 'dim_store_details')

## Product details

Milestone 2 - Task 6

In [None]:
# Extract the products CSV from the given S3 URI.
product_df = extractor.extract_from_s3('s3://data-handling-public/products.csv')

In [None]:
# Print a summary of the raw product data before any cleaning.
product_df.info()

In [None]:
# First cleaning stage for the product data: convert the product weights.
# The target unit is decided inside data_cleaning and is not visible from this notebook.
product_df_weight = cleaning.convert_product_weights(product_df)

In [None]:
# Second cleaning stage: run the remaining product clean-up on the weight-converted data.
clean_product_df = cleaning.clean_products_data(product_df_weight)

In [None]:
# Upload the cleaned product data to the 'dim_products' table.
connector.upload_to_db(clean_product_df, 'dim_products')

## Order data

Create a method in DataCleaning called clean_orders_data which will clean the orders table data.

You should remove the columns, first_name, last_name and 1 to have the table in the correct form before uploading to the database.

You will see that the orders data contains column headers which are the same in other tables.

This table will act as the source of truth for your sales data and will be at the center of your star based database schema.



Once cleaned, upload using the upload_to_db method and store in a table called orders_table.

In [None]:
# Re-create the credentials, engine and table list for the orders section; these are
# the same three calls as the connection cell near the top of the notebook.
# NOTE(review): `creds` and `db_list` are not used by the cells that follow - confirm
# whether only `engine` is needed here.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

In [None]:
# Read the 'orders_table' table through the engine created above.
orders_df = extractor.read_rds_table('orders_table', engine)

In [None]:
# Preview the first rows of the raw orders data.
orders_df.head()

In [None]:
# Clean the orders data with the DataCleaning helper.
clean_orders_df = cleaning.clean_orders_data(orders_df)

# Note: the level_0 column had to be deleted because it was causing a duplicate-column
# error for which no other resolution was found.
clean_orders_df.info()

In [None]:
# Upload the cleaned orders data to the 'orders_table' table.
connector.upload_to_db(clean_orders_df, 'orders_table')

## Events data


In [None]:
#https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json


In [3]:
# Extract the date details JSON from its public S3 HTTPS URL and preview the first rows.
# The same extract_from_s3 method is given an 's3://' URI in the products section.
events_df = extractor.extract_from_s3('https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json')
events_df.head()

Unnamed: 0,timestamp,month,year,day,time_period,date_uuid
0,22:00:06,9,2012,19,Evening,3b7ca996-37f9-433f-b6d0-ce8391b615ad
1,22:44:06,2,1997,10,Evening,adc86836-6c35-49ca-bb0d-65b6507a00fa
2,10:05:37,4,1994,15,Morning,5ff791bf-d8e0-4f86-8ceb-c7b60bef9b31
3,17:29:27,11,2001,6,Midday,1b01fcef-5ab9-404c-b0d4-1e75a0bd19d8
4,22:40:33,12,2015,31,Evening,dfa907c1-f6c5-40f0-aa0d-40ed77ac5a44


In [4]:
# Clean the events (date details) data with the DataCleaning helper.
clean_event_df = cleaning.clean_events_data(events_df)

In [5]:
# Upload the cleaned events data to the 'dim_date_times' table.
connector.upload_to_db(clean_event_df, 'dim_date_times')