# Data Quality Report

This notebook can be run after the "create_tables.py" and "etl.py" scripts have been run.

The notebook gives some insights into the nature and structure of the data sets.

In [None]:
import os
import glob
import psycopg2
import pandas as pd
import pandas.io.sql as sqlio
from sql_queries import *
import configparser

In [None]:

# Load the project settings from dwh.cfg. The file is opened in a `with` block so
# the handle is closed as soon as parsing finishes; read_file() is kept (rather
# than read()) so that a missing file still raises instead of being ignored.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# AWS credentials used to build the boto3 clients
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# Redshift endpoint and the IAM role ARN
DB_ENDPOINT = config.get("CLUSTER", "HOST")
DB_ROLE_ARN = config.get("IAM_ROLE", "ARN")

# Database connection settings. configparser returns every value as a string,
# so DB_PORT is a string here and is converted where an integer is required.
DB_NAME = config.get("CLUSTER", "DB_NAME")
DB_USER = config.get("CLUSTER", "DB_USER")
DB_PASSWORD = config.get("CLUSTER", "DB_PASSWORD")
DB_PORT = config.get("CLUSTER", "DB_PORT")

# Cluster description settings
DB_IAM_ROLE_NAME = config.get("CLUSTER", "DB_IAM_ROLE_NAME")
DB_NODE_TYPE = config.get("CLUSTER", "DB_NODE_TYPE")
DB_CLUSTER_TYPE = config.get("CLUSTER", "DB_CLUSTER_TYPE")
DB_NUM_NODES = config.get("CLUSTER", "DB_NUM_NODES")

DB_CLUSTER_IDENTIFIER = config.get("CLUSTER", "DB_CLUSTER_IDENTIFIER")



In [None]:
# Build the boto3 handles used by this notebook (EC2, S3, IAM and Redshift).
# Every handle uses the same credentials and the same region, so the shared
# keyword arguments are collected once and passed to each constructor.
import boto3

_aws_kwargs = dict(
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# Resource-style handles
ec2 = boto3.resource('ec2', **_aws_kwargs)
s3 = boto3.resource('s3', **_aws_kwargs)

# Client-style handles
iam = boto3.client('iam', **_aws_kwargs)
redshift = boto3.client('redshift', **_aws_kwargs)


In [None]:
# Ask Redshift to describe the cluster with the configured identifier and keep
# the first entry of the 'Clusters' list in the response.
_cluster_response = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)
myClusterProps = _cluster_response['Clusters'][0]

In [None]:
# Open the database port on one of the cluster VPC's security groups.
# Any failure along the way is only printed, so the notebook keeps running.
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])

    # The group that works in this setup is the last one the VPC lists.
    _vpc_security_groups = list(vpc.security_groups.all())
    defaultSg = _vpc_security_groups[-1]
    print(defaultSg)

    # NOTE(review): CidrIp='0.0.0.0/0' admits TCP traffic on the database port
    # from any IPv4 address - confirm this is acceptable for the environment.
    _ingress_rule = dict(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DB_PORT),
        ToPort=int(DB_PORT),
    )
    defaultSg.authorize_ingress(**_ingress_rule)
except Exception as e:
    print(e)

In [None]:
# Open a psycopg2 connection to the database using the settings read from
# dwh.cfg, and create a cursor on that connection.
conn = psycopg2.connect(
    dbname=DB_NAME,
    host=DB_ENDPOINT,
    port=DB_PORT,
    user=DB_USER,
    password=DB_PASSWORD,
)
cur = conn.cursor()

# 'songplays' Table - Data Quality Report

In [None]:
# Load the whole songplays table and summarise every column: data type,
# present / missing / unique counts, value range and text-length range.
df_songplays = sqlio.read_sql_query("Select * from songplays", conn)

# Data type pandas assigned to each column
data_types = df_songplays.dtypes.to_frame('songplays Data Type')

# Number of null entries in each column
missing_data_counts = df_songplays.isnull().sum().to_frame('songplays Missing Values')

# Number of non-null entries in each column
present_data_counts = df_songplays.count().to_frame('Present Values')

# Number of distinct non-null values in each column
unique_value_counts = df_songplays.nunique().to_frame('Unique Values')

# Smallest non-null value in each column. Nulls are dropped first and values are
# compared in their native type: comparing them as strings would rank '10'
# below '9' and would let the text 'None' take part in the ordering.
# dtype=object keeps every result in its own type (number, timestamp, text).
minimum_values = pd.DataFrame(
    {'Minimum Value': [df_songplays[v].dropna().min() for v in df_songplays.columns]},
    index=df_songplays.columns,
    dtype=object,
)

# Largest non-null value in each column (same rules as the minimum)
maximum_values = pd.DataFrame(
    {'Maximum Value': [df_songplays[v].dropna().max() for v in df_songplays.columns]},
    index=df_songplays.columns,
    dtype=object,
)

# Shortest string representation in each column. Every entry is converted with
# astype(str), so null entries are measured by their text form as well.
minimumm_length = pd.DataFrame(
    {'Minimum Length': [df_songplays[v].astype(str).map(len).min() for v in df_songplays.columns]},
    index=df_songplays.columns,
)

# Longest string representation in each column
maximum_length = pd.DataFrame(
    {'Maximum Length': [df_songplays[v].astype(str).map(len).max() for v in df_songplays.columns]},
    index=df_songplays.columns,
)

# Merge all the summaries on the column-name index. The unique-value counts are
# part of the report too (they used to be computed and then left out).
songplays_data_quality = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .join(minimumm_length)
    .join(maximum_length)
)
songplays_data_quality

# 'songs' Table - Data Quality Report

In [None]:
# Load the whole songs table and summarise every column: data type,
# present / missing / unique counts, value range and text-length range.
df_songs = sqlio.read_sql_query("Select * from songs", conn)

# Data type pandas assigned to each column
data_types = df_songs.dtypes.to_frame('Songs Data Type')

# Number of null entries in each column
missing_data_counts = df_songs.isnull().sum().to_frame('Songs Missing Values')

# Number of non-null entries in each column
present_data_counts = df_songs.count().to_frame('Present Values')

# Number of distinct non-null values in each column
unique_value_counts = df_songs.nunique().to_frame('Unique Values')

# Smallest non-null value in each column, compared in the column's native type.
# Nulls are dropped explicitly because a bare .min() on a text column that
# contains nulls can fail with a TypeError instead of skipping them.
# dtype=object keeps every result in its own type (number, text).
minimum_values = pd.DataFrame(
    {'Minimum Value': [df_songs[v].dropna().min() for v in df_songs.columns]},
    index=df_songs.columns,
    dtype=object,
)

# Largest non-null value in each column (same rules as the minimum)
maximum_values = pd.DataFrame(
    {'Maximum Value': [df_songs[v].dropna().max() for v in df_songs.columns]},
    index=df_songs.columns,
    dtype=object,
)

# Shortest string representation in each column. Every entry is converted with
# astype(str), so null entries are measured by their text form as well.
minimumm_length = pd.DataFrame(
    {'Minimum Length': [df_songs[v].astype(str).map(len).min() for v in df_songs.columns]},
    index=df_songs.columns,
)

# Longest string representation in each column
maximum_length = pd.DataFrame(
    {'Maximum Length': [df_songs[v].astype(str).map(len).max() for v in df_songs.columns]},
    index=df_songs.columns,
)

# Merge all the summaries on the column-name index. The unique-value counts are
# part of the report too (they used to be computed and then left out).
songs_data_quality = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .join(minimumm_length)
    .join(maximum_length)
)
songs_data_quality

# 'artists' Table - Data Quality Report


In [None]:
# Load the whole artists table and summarise every column: data type,
# present / missing / unique counts, value range and text-length range.
df_artists = sqlio.read_sql_query("Select * from artists", conn)

# Data type pandas assigned to each column
data_types = df_artists.dtypes.to_frame('Artists Data Type')

# Number of null entries in each column
missing_data_counts = df_artists.isnull().sum().to_frame('Artists Missing Values')

# Number of non-null entries in each column
present_data_counts = df_artists.count().to_frame('Present Values')

# Number of distinct non-null values in each column
unique_value_counts = df_artists.nunique().to_frame('Unique Values')

# Smallest non-null value in each column. This column previously held the
# minimum string *length*, duplicating 'Minimum Length'; it now reports the
# actual value. Nulls are dropped first so that text columns containing nulls
# can still be ordered, and dtype=object keeps each result in its own type.
minimum_values = pd.DataFrame(
    {'Minimum Value': [df_artists[v].dropna().min() for v in df_artists.columns]},
    index=df_artists.columns,
    dtype=object,
)

# Largest non-null value in each column (same rules as the minimum)
maximum_values = pd.DataFrame(
    {'Maximum Value': [df_artists[v].dropna().max() for v in df_artists.columns]},
    index=df_artists.columns,
    dtype=object,
)

# Shortest string representation in each column. Every entry is converted with
# astype(str), so null entries are measured by their text form as well.
minimumm_length = pd.DataFrame(
    {'Minimum Length': [df_artists[v].astype(str).map(len).min() for v in df_artists.columns]},
    index=df_artists.columns,
)

# Longest string representation in each column
maximum_length = pd.DataFrame(
    {'Maximum Length': [df_artists[v].astype(str).map(len).max() for v in df_artists.columns]},
    index=df_artists.columns,
)

# Merge all the summaries on the column-name index. The unique-value counts are
# part of the report too (they used to be computed and then left out).
artists_data_quality_report = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .join(minimumm_length)
    .join(maximum_length)
)
artists_data_quality_report

# 'users' Table - Data Quality Report

In [None]:
# Load the whole users table and summarise every column: data type,
# present / missing / unique counts, value range and text-length range.
df_users = sqlio.read_sql_query("Select * from users", conn)

# Data type pandas assigned to each column
data_types = df_users.dtypes.to_frame('Users Data Type')

# Number of null entries in each column
missing_data_counts = df_users.isnull().sum().to_frame('Users Missing Values')

# Number of non-null entries in each column
present_data_counts = df_users.count().to_frame('Present Values')

# Number of distinct non-null values in each column
unique_value_counts = df_users.nunique().to_frame('Unique Values')

# Smallest non-null value in each column, compared in the column's native type.
# Nulls are dropped explicitly because a bare .min() on a text column that
# contains nulls can fail with a TypeError instead of skipping them.
# dtype=object keeps every result in its own type (number, text).
minimum_values = pd.DataFrame(
    {'Minimum Value': [df_users[v].dropna().min() for v in df_users.columns]},
    index=df_users.columns,
    dtype=object,
)

# Largest non-null value in each column (same rules as the minimum)
maximum_values = pd.DataFrame(
    {'Maximum Value': [df_users[v].dropna().max() for v in df_users.columns]},
    index=df_users.columns,
    dtype=object,
)

# Shortest string representation in each column. Every entry is converted with
# astype(str), so null entries are measured by their text form as well.
minimumm_length = pd.DataFrame(
    {'Minimum Length': [df_users[v].astype(str).map(len).min() for v in df_users.columns]},
    index=df_users.columns,
)

# Longest string representation in each column
maximum_length = pd.DataFrame(
    {'Maximum Length': [df_users[v].astype(str).map(len).max() for v in df_users.columns]},
    index=df_users.columns,
)

# Merge all the summaries on the column-name index. The unique-value counts are
# part of the report too (they used to be computed and then left out).
users_data_quality_report = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .join(minimumm_length)
    .join(maximum_length)
)
users_data_quality_report

# 'time' Table - Data Quality Report

In [None]:

# Load the derived date-part columns of the time table and summarise each one:
# data type, present / missing / unique counts, value range and text-length range.
df_time = sqlio.read_sql_query("Select hour, day, week, month, year, weekday from time", conn)

# Data type pandas assigned to each column
data_types = df_time.dtypes.to_frame('Time Data Type')

# Number of null entries in each column
missing_data_counts = df_time.isnull().sum().to_frame('Time Missing Values')

# Number of non-null entries in each column
present_data_counts = df_time.count().to_frame('Present Values')

# Number of distinct non-null values in each column
unique_value_counts = df_time.nunique().to_frame('Unique Values')

# Smallest non-null value in each column. This column previously held the
# minimum string *length*, duplicating 'Minimum Length'; it now reports the
# actual value, compared in the column's native type.
# dtype=object keeps every result in its own type.
minimum_values = pd.DataFrame(
    {'Minimum Value': [df_time[v].dropna().min() for v in df_time.columns]},
    index=df_time.columns,
    dtype=object,
)

# Largest non-null value in each column (same rules as the minimum)
maximum_values = pd.DataFrame(
    {'Maximum Value': [df_time[v].dropna().max() for v in df_time.columns]},
    index=df_time.columns,
    dtype=object,
)

# Shortest string representation in each column. Every entry is converted with
# astype(str), so null entries are measured by their text form as well.
minimumm_length = pd.DataFrame(
    {'Minimum Length': [df_time[v].astype(str).map(len).min() for v in df_time.columns]},
    index=df_time.columns,
)

# Longest string representation in each column
maximum_length = pd.DataFrame(
    {'Maximum Length': [df_time[v].astype(str).map(len).max() for v in df_time.columns]},
    index=df_time.columns,
)

# Merge all the summaries on the column-name index. The unique-value counts are
# part of the report too (they used to be computed and then left out).
time_data_quality_report = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .join(minimumm_length)
    .join(maximum_length)
)
time_data_quality_report

In [None]:
# Close the psycopg2 connection that the report cells used for their queries.
conn.close()