# Testing script

Notebook for testing the classes created for the Multinational Retail Data Centralisation project.

**Note:** Update the docstrings for classes and methods as their functionality develops.


## Load in modules of classes

In [1]:
import os

import database_utils
import data_extraction
import data_cleaning


## Creating instances of each class

In [2]:
# One instance of each helper class; the later cells call methods on these three objects.
connector = database_utils.DatabaseConnector()
extractor = data_extraction.DataExtractor()
cleaning = data_cleaning.DataCleaning()

### User data

Use the class methods to connect to the AWS RDS database and retrieve the list of table names from the PostgreSQL database.

In [3]:
# Load the credentials, build the engine that the extractor uses below, and fetch the table names.
# NOTE(review): `creds` and `db_list` are never read in the visible cells; the table list in the
# recorded output is presumably printed by list_db_tables itself - confirm.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

['legacy_store_details', 'legacy_users', 'orders_table']


Extract the table data into a pandas DataFrame.

In [4]:
# Pull the 'legacy_users' table through the engine created above and preview the first rows.
rds_df = extractor.read_rds_table('legacy_users', engine)
rds_df.head()

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


In [None]:
# Inspect the raw user table. A bare expression uses the notebook's rich display, which
# truncates long frames, instead of printing every row of the user data as plain text.
rds_df

In [None]:
# Eyeball the distinct values of both country columns for misspellings.
for column_name in ('country', 'country_code'):
    print(rds_df[column_name].unique())

In [5]:
# Clean the user table; the type check confirms that a DataFrame comes back.
# NOTE(review): the recorded output shows a pandas SettingWithCopyWarning raised by the
# phone_number assignment inside the cleaning code - worth fixing there (.loc or an explicit copy).
clean_rds_df = cleaning.clean_user_data(rds_df)

print(type(clean_rds_df))

<class 'pandas.core.frame.DataFrame'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rds_df["phone_number"] = rds_df.apply(correct_phone_number, axis=1)


In [7]:
# Spot-check the cleaned frame: distinct values of both country columns, then the first rows.
for column_name in ('country', 'country_code'):
    print(clean_rds_df[column_name].unique())
print(clean_rds_df.head())

['Germany' 'United Kingdom' 'United States']
['DE' 'GB' 'US']
   index first_name last_name date_of_birth                    company  \
0      0   Sigfried     Noack    1990-09-30         Heydrich Junitz KG   
1      1        Guy     Allen    1940-12-01                    Fox Ltd   
2      2      Harry  Lawrence    1995-08-02  Johnson, Jones and Harris   
3      3     Darren   Hussain    1972-09-23                Wheeler LLC   
4      4      Garry     Stone    1952-12-20                 Warner Inc   

                  email_address  \
0             rudi79@winkler.de   
1  rhodesclifford@henderson.com   
2  glen98@bryant-marshall.co.uk   
3    daniellebryan@thompson.org   
4       billy14@long-warren.com   

                                          address         country  \
0                     Zimmerstr. 1/0 59015 Gießen         Germany   
1  Studio 22a Lynne terrace McCarthymouth TF0 9GH  United Kingdom   
2                92 Ann drive Joanborough SK0 6LR  United Kingdom   
3     

Upload the cleaned data to sales_database.

In [8]:
# Store the cleaned user frame under the table name 'dim_users'.
connector.upload_to_db(clean_rds_df, 'dim_users')

## Card Details data

In [9]:
# Extract the card details. No source is passed in, so the PDF location is presumably
# configured inside retrieve_pdf_data - confirm.
card_df = extractor.retrieve_pdf_data()

In [11]:
# Clean the card details and display the resulting frame.
clean_card_df = cleaning.clean_card_data(card_df)
clean_card_df

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,09/26,Diners Club / Carte Blanche,2015-11-25
1,349624180933183,10/23,American Express,2001-06-18
3,213142929492281,09/27,JCB 15 digit,2011-02-12
5,3506661913512980,11/23,JCB 16 digit,2003-08-25
6,377549437870679,07/27,American Express,2006-12-11
...,...,...,...,...
11,6011673872296230,10/27,Discover,2011-11-28
12,4366158312869550,06/26,VISA 16 digit,1993-10-19
14,180036921556789,12/28,JCB 15 digit,1997-06-06
16,3569953313547220,04/24,JCB 16 digit,2020-02-05


In [12]:
# Upload the cleaned card details to the local database under the table name 'dim_card_details'.
connector.upload_to_db(clean_card_df, 'dim_card_details')

## Store details

endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'

header = {'x-api-key': '<API key — keep it out of the notebook and supply it at runtime>'}

In [None]:
# Ask the API how many stores exist.
# The API key is read from the STORES_API_KEY environment variable rather than being committed
# in the notebook; a missing variable raises KeyError here instead of sending a bad request.
# NOTE(review): the key that was previously hardcoded in this cell should be rotated.
number_stores_endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
api_headers = {'x-api-key': os.environ['STORES_API_KEY']}
num_stores = extractor.list_number_of_stores(number_stores_endpoint, api_headers)

In [None]:
# Retrieve the store data, passing the store count and the store_details endpoint.
# NOTE(review): no API header is passed here, unlike the number_stores call - confirm how
# retrieve_stores_data authenticates.
stores_df2 = extractor.retrieve_stores_data(num_stores,'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/')

In [None]:
# Work on a copy, presumably so the downloaded frame (stores_df2) stays available
# without having to fetch the stores again.
stores_df = stores_df2.copy()

In [None]:
# Try out the store cleaning method and show the first ten rows of the result.
clean_stores_df = cleaning.called_clean_store_data(stores_df)
clean_stores_df.head(10)

In [None]:
# Upload the cleaned store details to the local database under the table name 'dim_store_details'.
connector.upload_to_db(clean_stores_df, 'dim_store_details')

## Product details

Milestone 2 - Task 6

In [None]:
# Load products.csv from the data-handling-public S3 bucket.
product_df = extractor.extract_from_s3('s3://data-handling-public/products.csv')

In [None]:
# Summary of the raw product data before any cleaning.
product_df.info()

In [None]:
# Convert the product weights first; the result is passed to clean_products_data in the next cell.
product_df_weight = cleaning.convert_product_weights(product_df)

In [None]:
# Clean the weight-converted product data.
clean_product_df = cleaning.clean_products_data(product_df_weight)

In [None]:
# Store the cleaned product data under the table name 'dim_products'.
connector.upload_to_db(clean_product_df, 'dim_products')

## Order data

Create a method in DataCleaning called clean_orders_data which will clean the orders table data.

You should remove the columns first_name, last_name and 1 so that the table is in the correct form before uploading it to the database.

You will see that the orders data contains column headers which are the same in other tables.

This table will act as the source of truth for your sales data and will be at the center of your star-based database schema.



Once cleaned, upload the data using the upload_to_db method and store it in a table called orders_table.

In [None]:
# Same three calls as in the "User data" section: load the credentials, rebuild the engine
# used by the orders extraction below, and fetch the table names.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

In [None]:
# Pull the 'orders_table' table through the engine.
orders_df = extractor.read_rds_table('orders_table', engine)

In [None]:
# Preview the first rows of the raw orders data.
orders_df.head()

In [None]:
# Clean the orders data.
clean_orders_df = cleaning.clean_orders_data(orders_df)

# Note: the level_0 column had to be deleted because it caused a duplicate-column error
# for which no other resolution was found.
clean_orders_df.info()

In [None]:
# Store the cleaned orders data under the table name 'orders_table'.
connector.upload_to_db(clean_orders_df, 'orders_table')

Find the maximum length of the values in each column, measured on their string form.

In [None]:
# For every column, print the length of its longest value once cast to a string.
for col_name in clean_orders_df.columns:
    longest = clean_orders_df[col_name].astype(str).str.len().max()
    print(col_name, "->", longest)

In [None]:
import numpy as np
import pandas as pd

## Events data


In [None]:
# Source file: https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json
events_df = extractor.extract_from_s3('https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json')

In [None]:
# Summary of the raw events data before any cleaning.
events_df.info()

In [None]:
# Clean the events data.
clean_event_df = cleaning.clean_events_data(events_df)

In [None]:
# Store the cleaned events data under the table name 'dim_date_times'.
connector.upload_to_db(clean_event_df, 'dim_date_times')