# Testing script

Script for testing out classes created for Multinational Retail Data Centralisation project. 

**Note** Update docstrings for classes and methods accordingly as functionality develops. 


## Load in modules of classes

In [None]:
import database_utils
import data_extraction
import data_cleaning


## Creating instances of each class

In [None]:
# One instance of each project helper class; every later cell in this
# notebook calls methods on these three objects.
connector = database_utils.DatabaseConnector()
extractor = data_extraction.DataExtractor()
cleaning = data_cleaning.DataCleaning()

## User data

Utilise class methods to connect to AWS RDS database and retrieve list of table names from postgres database.

In [None]:
# Read the stored credentials, build the database engine and list the
# available tables. Only `engine` is used again below; `creds` and `db_list`
# are kept for inspection.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

Extract table data into a pandas DataFrame.

In [None]:
# Pull the 'legacy_users' table through the engine created above and preview
# the first rows.
rds_df = extractor.read_rds_table('legacy_users', engine)
rds_df.head()

In [None]:
# Dump the whole frame as text (no row/column truncation) for manual review.
print(rds_df.to_string())

In [None]:
# Spelling check: list the distinct raw values of both country columns.
for country_column in ('country', 'country_code'):
    print(rds_df[country_column].unique())

In [None]:
# Run the user-data cleaning method on the raw frame.
clean_rds_df = cleaning.clean_user_data(rds_df)

# Confirm what kind of object the cleaning method returned.
print(type(clean_rds_df))

In [None]:
# Rough post-cleaning check: distinct country values again, then a preview
# of the cleaned frame.
for country_column in ('country', 'country_code'):
    print(clean_rds_df[country_column].unique())
print(clean_rds_df.head())

Upload cleaned up data to sales_database

In [None]:
# Store the cleaned user data under the table name 'dim_users'.
connector.upload_to_db(clean_rds_df, 'dim_users')

## Card Details data

In [None]:
# Extract the card details from the PDF source. The result is later passed to
# pd.concat, so it is treated as a collection of DataFrames rather than a
# single frame.
card_df = extractor.retrieve_pdf_data()

In [None]:
# pandas is otherwise only imported in a cell near the end of this notebook,
# so a top-to-bottom run would fail here with NameError on `pd`.
import pandas as pd

# Combine the extracted pieces into a single DataFrame (original row labels
# are kept as-is).
card_df2 = pd.concat(card_df)

In [None]:
# Column dtypes and non-null counts for the combined card data.
# NOTE(review): the figures below are presumably the row count before and
# after the manual filtering further down - confirm.
card_df2.info() #15309 -> 15284

In [None]:
# Dump the whole card frame as text for manual review.
print(card_df2.to_string())

In [None]:
# Show rows whose expiry_date, as a string, is longer than 5 characters.
# The cleaning cell below keeps only rows of exactly length 5.
card_df2[card_df2['expiry_date'].astype(str).str.len() > 5]

In [None]:
# Total number of NA/null cells across all columns (a single integer).
card_df2.isnull().sum().sum()

In [None]:
# Keep only rows whose expiry_date is exactly 5 characters long. The explicit
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
card_df2 = card_df2[card_df2['expiry_date'].astype(str).str.len() == 5].copy()

# Strip every run of non-digit characters from string card numbers.
card_df2['card_number'] = card_df2['card_number'].replace(regex=[r'\D+'], value="")

# Parse payment dates in mixed formats; unparseable values become NaT.
# Bracket assignment is used so the column is always updated, never a new
# attribute on the frame object.
card_df2['date_payment_confirmed'] = pd.to_datetime(
    card_df2['date_payment_confirmed'], format='mixed', errors="coerce"
)

# Store the digit-only card numbers as 64-bit integers.
card_df2['card_number'] = card_df2['card_number'].astype('int64')

In [None]:
# Clean the card details with the DataCleaning method and display the result.
# NOTE(review): this passes `card_df` (the raw extractor output), not the
# concatenated and manually filtered `card_df2` built above - confirm that
# clean_card_data expects the un-concatenated input.
clean_card_df = cleaning.clean_card_data(card_df)
clean_card_df

In [None]:
# Upload the cleaned card data to the local database as 'dim_card_details'.
connector.upload_to_db(clean_card_df, 'dim_card_details')

## Store details

endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'

header = {'x-api-key':'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'}

In [11]:
# Define the endpoint and auth header once, instead of repeating the
# literals inline in the call.
# NOTE(review): the API key is committed in plain text. Move it to an
# environment variable or an untracked config file and rotate the key.
endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
header = {'x-api-key': 'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'}

# Ask the API how many stores exist; the recorded output below shows a dict
# with 'statusCode' and 'number_stores' keys.
num_stores = extractor.list_number_of_stores(endpoint, header)
num_stores

{'statusCode': 200, 'number_stores': 451}

In [12]:
# Confirm the return type of list_number_of_stores (a dict in the recorded run).
print(type(num_stores))

<class 'dict'>


In [None]:
# Fetch the store records from the store_details endpoint, passing the
# number-of-stores response obtained above.
# NOTE(review): no API header is passed here, unlike list_number_of_stores -
# presumably retrieve_stores_data supplies its own; verify.
stores_df2 = extractor.retrieve_stores_data(num_stores,'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/')

In [None]:
# Work on a copy so the downloaded frame stays available unchanged in
# `stores_df2`.
stores_df = stores_df2.copy()

In [None]:
# Dump the whole stores frame as text for manual review.
print(stores_df.to_string())

In [None]:
# Try out the store cleaning method and preview the first 10 rows of the result.
clean_stores_df = cleaning.clean_store_data(stores_df)
clean_stores_df.head(10)

In [None]:
# Upload the cleaned store data to the local database as 'dim_store_details'.
connector.upload_to_db(clean_stores_df, 'dim_store_details')

## Product details

Milestone 2 - Task 6

In [None]:
# Load the products CSV, addressed here by its s3:// URI.
product_df = extractor.extract_from_s3('s3://data-handling-public/products.csv')

In [None]:
# Column dtypes and non-null counts for the raw products data.
product_df.info()

In [None]:
# Dump the whole products frame as text for manual review.
print(product_df.to_string())

In [None]:
# Print a True/False flag per row showing whether product_code contains
# 'A8-4686892S' (case-insensitive), to locate that product's row.
print(product_df['product_code'].str.contains('A8-4686892S', case= False).to_string())

In [None]:
# Raw weight value for the row labelled 1841, fetched with a single
# label-based lookup.
product_df.loc[1841, 'weight']

In [None]:
# Run the weight-conversion step; its result feeds clean_products_data below.
product_df_weight = cleaning.convert_product_weights(product_df)

In [None]:
# Clean the weight-converted products frame.
clean_product_df = cleaning.clean_products_data(product_df_weight)

In [None]:
# Store the cleaned products data under the table name 'dim_products'.
connector.upload_to_db(clean_product_df, 'dim_products')

## Order data

Create a method in DataCleaning called clean_orders_data which will clean the orders table data.

You should remove the columns, first_name, last_name and 1 to have the table in the correct form before uploading to the database.

You will see that the orders data contains column headers which are the same in other tables.

This table will act as the source of truth for your sales data and will be at the center of your star-based database schema.



Once cleaned, upload using the upload_to_db method and store in a table called orders_table.

In [None]:
# Same three setup calls as in the User data section: read credentials,
# build the engine, list the tables. Only `engine` is used below.
creds = connector.read_db_creds()
engine = connector.init_db_engine()
db_list = connector.list_db_tables()

In [None]:
# Pull the 'orders_table' table through the engine created above.
orders_df = extractor.read_rds_table('orders_table', engine)

In [None]:
# Preview the first rows of the raw orders data.
orders_df.head()

In [None]:
# Dump the whole orders frame as text for manual review.
print(orders_df.to_string())

In [None]:
# For every column of the raw orders data, report the length of its longest
# value once rendered as a string.
for col_name in orders_df.columns:
    longest = orders_df[col_name].astype(str).str.len().max()
    print(f"{col_name} -> {longest}")

In [None]:
# Clean the orders data with the DataCleaning method.
clean_orders_df = cleaning.clean_orders_data(orders_df)

# Note: the level_0 column had to be deleted because it caused a
# duplicate-column error for which no other resolution was found.
clean_orders_df.info()

In [None]:
# Store the cleaned orders data under the table name 'orders_table'.
connector.upload_to_db(clean_orders_df, 'orders_table')

Finding out the maximum value length in each column. 

In [None]:
# For every column of the cleaned orders data, report the length of its
# longest value once rendered as a string.
for col_name in clean_orders_df.columns:
    longest = clean_orders_df[col_name].astype(str).str.len().max()
    print(f"{col_name} -> {longest}")

In [None]:
import numpy as np
import pandas as pd

## Events data


In [None]:
# Load the date/time details JSON. The source is given as a public HTTPS URL,
# whereas the products extract above passes an s3:// URI to the same method.
# NOTE(review): extract_from_s3 must therefore accept both forms - confirm.
events_df = extractor.extract_from_s3('https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json')

In [None]:
# Column dtypes and non-null counts for the raw events data.
events_df.info()

In [None]:
# Clean the events data with the DataCleaning method.
clean_event_df = cleaning.clean_events_data(events_df)

In [None]:
# Store the cleaned events data under the table name 'dim_date_times'.
connector.upload_to_db(clean_event_df, 'dim_date_times')