# Testing script

Script for testing the class modules created for the Multinational Retail Data Centralisation project.

Examples of connecting to databases, extracting data, cleaning data and uploading the cleaned data to the centralised database.


## Set up

###  Load in modules of classes

In [1]:
import Classes.database_utils as db_utils
import Classes.data_extraction as data_ext
import Classes.data_cleaning as data_clean


### Creating instances of each class

In [2]:
# Build one instance of each helper class; the code cells below share these three objects.
connector, extractor, cleaning = (
    db_utils.DatabaseConnector(),
    data_ext.DataExtractor(),
    data_clean.DataCleaning(),
)

## User data

### AWS RDS database connection

Utilise the class methods to connect to the AWS RDS database and retrieve the list of table names from the PostgreSQL database.

In [None]:
# Read the AWS RDS database credentials from the .yaml file.
# NOTE(review): `creds` is not passed to the calls below; presumably the
# connector loads the credentials itself when needed — confirm.
creds = connector.read_db_creds()

# Create the engine used to connect to the AWS RDS database.
engine = connector.init_db_engine()

# Retrieve (and, per the original note, print) the table names available in the AWS RDS database.
db_list = connector.list_db_tables()

### Data download from AWS RDS


In [None]:
# Download the 'legacy_users' table through the AWS RDS connection engine created earlier.
rds_df = extractor.read_rds_table('legacy_users', engine)
# Last expression of the cell: shows a preview of the result (assuming a pandas DataFrame — confirm).
rds_df.head()

### Data Cleaning

In [None]:
# Clean the downloaded user records; the result is what gets uploaded in the next code cell.
clean_rds_df = cleaning.clean_user_data(rds_df)

### Uploading dataframe to centralised database:

In [None]:
# Upload the cleaned user data; 'dim_users' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_rds_df, 'dim_users')

## Card Details data

Data extraction from .pdf files

In [None]:
# Extract the card details data from the PDF source.
# NOTE(review): no path or link is passed here, so the PDF location is presumably set inside the extractor — confirm.
card_df = extractor.retrieve_pdf_data()

In [None]:
# Clean the extracted card details data before upload.
clean_card_df = cleaning.clean_card_data(card_df)

In [None]:
# Upload the cleaned card details; 'dim_card_details' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_card_df, 'dim_card_details')

## Store details

Data extraction from web API

In [None]:
import os

# SECURITY NOTE(review): the API key literal below is committed in plain text.
# Set the STORES_API_KEY environment variable to supply the key without editing
# this file; the literal is kept only as a fallback so the notebook behaves
# exactly as before when the variable is unset. The key should be rotated and
# the fallback removed from source control.
api_key = os.environ.get(
    'STORES_API_KEY', 'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'
)

# Ask the number_stores endpoint how many stores exist, sending the key in the
# 'x-api-key' header. The result is passed to retrieve_stores_data in the next
# code cell.
num_stores = extractor.list_number_of_stores(
    'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores',
    {'x-api-key': api_key},
)


Data extraction from multiple web APIs can take some time and is noted as a possible project bottleneck.

In [None]:
# Retrieve the store details from the store_details endpoint, passing in the
# store count obtained from the number_stores request.
stores_df = extractor.retrieve_stores_data(
    num_stores,
    "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/",
)

In [None]:
# Clean the retrieved store details before upload.
clean_stores_df = cleaning.clean_store_data(stores_df)

In [None]:
# Upload the cleaned store details; 'dim_store_details' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_stores_df, 'dim_store_details')

## Product details

Data extraction from AWS s3 bucket.

In [3]:
# Load the products CSV from its S3 address.
product_df = extractor.extract_from_s3(
    "s3://data-handling-public/products.csv"
)

In [4]:
# Convert the product weights first; the result feeds clean_products_data in the next code cell.
product_df_weight = cleaning.convert_product_weights(product_df)

In [6]:
# Clean the products data, starting from the weight-converted dataframe.
clean_product_df = cleaning.clean_products_data(product_df_weight)

In [None]:
# Upload the cleaned products data; 'dim_products' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_product_df, 'dim_products')

## Order data

In [None]:
# Same three connection steps as in the 'User data' section: read the
# credentials, create the engine, and list the available tables.
# NOTE(review): presumably repeated so this section can be run on its own — confirm.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

In [None]:
# Download the 'orders_table' table through the connection engine.
orders_df = extractor.read_rds_table('orders_table', engine)

In [None]:
# Clean the downloaded orders data before upload.
clean_orders_df = cleaning.clean_orders_data(orders_df)

In [None]:
# Upload the cleaned orders data; 'orders_table' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_orders_df, 'orders_table')

## Events data


In [None]:
# Load the date details JSON file; note that this address is an HTTPS URL
# rather than an s3:// path.
events_df = extractor.extract_from_s3(
    "https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json"
)

In [None]:
# Clean the extracted events (date details) data before upload.
clean_event_df = cleaning.clean_events_data(events_df)

In [None]:
# Upload the cleaned events data; 'dim_date_times' is presumably the destination table name — confirm against upload_to_db.
connector.upload_to_db(clean_event_df, 'dim_date_times')