## Connecting to and Exploring the Siyavula Database on AWS

Install and import the packages needed to connect to the AWS PostgreSQL instance (`psycopg2`) and to explore the data (`numpy`, `pandas`).

In [1]:
# Uncomment the statements below to install the PostgreSQL-Python connection utilities and the data-exploration packages
#!pip install psycopg2
#!pip install numpy
#!pip install pandas

In [2]:
import os

import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import sql

In [3]:
# Establishing connection.
# The user name and password are read from environment variables so that no
# secret is stored in the notebook itself (notebooks are routinely shared and
# committed). A missing variable raises KeyError immediately, which is clearer
# than an authentication failure from the server.
# NOTE(review): the credentials that were previously hard-coded in this cell
# should be treated as exposed and rotated.
# Database name, host and port keep their previous values as defaults and can
# be overridden through the environment.
conn = psycopg2.connect(
    database=os.environ.get("SIYAVULA_DB_NAME", "siyavula-intern-final"),
    user=os.environ["SIYAVULA_DB_USER"],
    password=os.environ["SIYAVULA_DB_PASSWORD"],
    host=os.environ.get(
        "SIYAVULA_DB_HOST",
        "siyavula-postgre-db2.cnzbp4ndrpos.eu-west-1.rds.amazonaws.com",
    ),
    port=os.environ.get("SIYAVULA_DB_PORT", "5432"),
)

# Cursor used by the cells below for raw SQL statements
cursor = conn.cursor()

In [4]:
# List every base table in the 'public' schema (views are excluded by the
# table_type filter) and report how many there are.
tables_query = "select table_name from information_schema.tables  where table_schema = 'public'  and table_type = 'BASE TABLE'"
db_tables = pd.read_sql(tables_query, conn)
print(f'There are {db_tables.shape[0]} tables in the database')
db_tables.head()

There are 155 tables in the database


Unnamed: 0,table_name
0,access_code_redemption
1,classes
2,master_schools
3,schools
4,user_class


In [5]:
# Get the number of rows in each table listed in db_tables.
#
# The table name is composed with psycopg2.sql.Identifier rather than pasted in
# with an f-string, so names that need quoting (mixed case, reserved words,
# special characters) are handled correctly. The 'public' schema is given
# explicitly because that is the schema db_tables was read from.
#
# The count is taken directly from the table. Wrapping it in a CTE over
# `select *` returns the same number but, on PostgreSQL versions that always
# materialise CTEs, can force a full copy of very large tables first.
count_query = sql.SQL("select count(*) from {}")

n_rows = []
for table in db_tables.table_name:
    cursor.execute(count_query.format(sql.Identifier("public", table)))
    n_rows.append(cursor.fetchone()[0])

# Append the number of rows to db_tables (same order as db_tables.table_name)
db_tables['n_rows'] = n_rows
# Let's view our results
db_tables.head()

Unnamed: 0,table_name,n_rows
0,access_code_redemption,359814
1,classes,168711
2,master_schools,27790
3,schools,15452
4,user_class,3022382


In [6]:
# Keep only the tables that actually hold rows - these are the ones worth
# exploring - and order them from the largest to the smallest.
has_rows = db_tables['n_rows'] > 0
db_tables_with_data = db_tables.loc[has_rows].sort_values(by='n_rows', ascending=False)
# Report how many tables remain and preview the biggest ones
print(f'There are {db_tables_with_data.shape[0]} tables in the database with data')
db_tables_with_data.head()

There are 121 tables in the database with data


Unnamed: 0,table_name,n_rows
94,responses,108739259
105,response_mastery,86269539
11,activity_templates,27455755
104,response_attempt,25924081
84,page_view_tracking,21407656


In [7]:
# Fetch the name and data type of every column in the 'public' schema
all_columns_query = "select table_name, column_name, data_type from information_schema.columns where table_schema = 'public'"
table_columns = pd.read_sql(all_columns_query, conn)
# Show the (rows, columns) shape, then preview the first records
print(table_columns.shape)
table_columns.head()

(1229, 3)


Unnamed: 0,table_name,column_name,data_type
0,interactive_textbook_question,question_data,jsonb
1,users,last_login,timestamp with time zone
2,live_tutor_groups,group_uuid,uuid
3,atom_types,name,character varying
4,message_seeds,notification_content,character varying


In [8]:
# Attach the column metadata to each table that holds data. A left join keeps
# every table from db_tables_with_data, one output row per (table, column).
db_table_columns = pd.merge(
    db_tables_with_data,
    table_columns,
    on='table_name',
    how='left',
)
# Summarise how many columns and tables we are left with, then preview
print(
    f'We have {db_table_columns.shape[0]} columns in the {db_table_columns.table_name.nunique()} tables with data to consider.')
db_table_columns.head()

We have 902 columns in the 121 tables with data to consider.


Unnamed: 0,table_name,n_rows,column_name,data_type
0,responses,108739259,random_seed,integer
1,responses,108739259,modified_at,timestamp with time zone
2,responses,108739259,correctness,boolean
3,responses,108739259,template_id,integer
4,responses,108739259,responses,jsonb


In [9]:
# Per table: the row count (identical on every row of a table, so 'max' simply
# picks it) and the number of columns; then list tables by column count.
columns_by_table = db_table_columns.groupby('table_name')
n_columns = columns_by_table.agg({'n_rows': 'max', 'column_name': 'count'})
n_columns.column_name.sort_values()

table_name
alembic_version          1
exercise_counter         2
order_discounts          2
activity_assignment      2
product_commodities      2
                        ..
schools_version         16
responses               17
orders                  17
billing_transactions    19
subscriptions           19
Name: column_name, Length: 121, dtype: int64

In [10]:
# Write the table analysis results to an Excel workbook, one sheet per frame.
# mode='w' overwrites any existing workbook of the same name.
sheets = {
    'Count of Data rows': db_tables,
    'Tables with Data': db_tables_with_data,
    'Tables columns with Data': db_table_columns,
}
with pd.ExcelWriter('siyavula_table_analysis.xlsx', mode='w') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

### Identifying the tables to consider

In [11]:
# Names of the tables selected for closer inspection (order is preserved when
# they are reported later in the notebook).
tables_considered = (
    'activities',
    'activity_assignment',
    'assignments',
    'assignment_recipient',
    'assignment_content',
    'books',
    'chapters',
    'classes',
    'content_books',
    'learner_activity_summary',
    'projects',
    'project_user_groups',
    'master_schools',
    'responses',
    'response_attempt',
    'response_mastery',
    'user_profile_general',
    'users',
    'subjects',
    'templates',
    'sa_schools',
)
# How many tables are on the list
len(tables_considered)

18

In [12]:
# Load the per-column sheet back from the workbook written earlier in the
# notebook and preview it.
df = pd.read_excel(
    io='siyavula_table_analysis.xlsx',
    sheet_name='Tables columns with Data',
)
df.head()

Unnamed: 0,table_name,n_rows,column_name,data_type
0,responses,108739259,random_seed,integer
1,responses,108739259,modified_at,timestamp with time zone
2,responses,108739259,correctness,boolean
3,responses,108739259,template_id,integer
4,responses,108739259,responses,jsonb


In [13]:
# For every table of interest print its row count and its column names.
#
# Tables present in df (the sheet of tables that hold data) are reported from
# df. Tables absent from it are reported as 'No data in table' and their
# columns are looked up in information_schema instead; a table that does not
# exist at all yields an empty column list.
#
# The lookup passes the table name as a bound parameter rather than formatting
# it into the SQL text, and is restricted to the 'public' schema like the other
# catalogue queries in this notebook. An explicit emptiness check replaces the
# previous catch-all `except`, so genuine failures (e.g. a lost connection) are
# raised instead of being reported as an empty table.
table_columns_query = (
    "select column_name from information_schema.columns "
    "where table_schema = 'public' and table_name = %(table)s"
)

for table in tables_considered:
    table_rows = df[df.table_name == table]
    if table_rows.empty:
        print(table, 'No data in table', sep='\t')
        print(pd.read_sql(
                table_columns_query, conn, params={'table': table}
            ).column_name.to_list())
    else:
        # n_rows is repeated on every row of a table, so the first one suffices
        print(table, format(table_rows.n_rows.iloc[0], ','), sep='\t')
        print(table_rows.column_name.to_list())
    print('\n','-'*50, '\n' )

activities	17,808,093
['activity_type', 'content', 'uuid', 'activity_status', 'created_at', 'user_uuid', 'book_id', 'current_activity_template_id', 'current_template_response_uuid', 'modified_at']

 -------------------------------------------------- 

activity_assignment	1,030,680
['assignment_id', 'activity_uuid']

 -------------------------------------------------- 

assignments	26,726
['status', 'deleted_by_user_uuid', 'title', 'due_at', 'shortcode', 'id', 'modified_at', 'subject_id', 'created_at', 'assignment_type', 'sent_at', 'owner_uuid', 'start_at', 'deleted_reason', 'deleted_at']

 -------------------------------------------------- 

assignment_recipient	47,108
['assignment_id', 'recipient_type', 'id', 'recipient_id']

 -------------------------------------------------- 

books	72
['content_id', 'activity_type', 'subject_id', 'subject', 'curriculum', 'id', 'title', 'active', 'grade']

 -------------------------------------------------- 

chapters	634

 -------------------------

In [None]:
# Close the database connection opened at the start of the notebook
conn.close()