Import libraries and configure pandas display options

In [None]:
import pandas as pd
import numpy as np
import pyodbc
# Widen the notebook display limits so wide/long summary frames are shown in full.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

SQL query to gather some sample data

In [None]:
# Connection settings for the IRAS staging database. The connection string
# requests a trusted connection, so no credentials are stored in the notebook.
driver = '{SQL Server Native Client 11.0}'
server = 'sql2012'
db = 'og_irasv6_stage'
conn = pyodbc.connect("Driver="+driver+";Server="+server+";Database="+db+";Trusted_Connection=yes;")

# performing query to database

# Every row of the `mop` table.
q1 = """set nocount on;
        select * from mop"""

# Every pipe segment together with the decoded material, seam, girth-weld and
# manufacturer codes looked up from `listdomain`.
# NOTE(review): these are inner joins, so a segment with a NULL or unmatched
# domain id in any of the four columns is dropped -- confirm this is intended.
q2 = """set nocount on;
        select ld1.code [material_type], ld2.code [seam_type], ld3.code [gwd_type], ld4.code [manufacturer], ps.* from pipesegment ps
        join listdomain ld1 on ps.PipeMaterialDomainId = ld1.id
        join listdomain ld2 on ps.PipeSeamDomainId = ld2.id
        join listdomain ld3 on ps.PipeGirthWeldDomainID = ld3.id
        join listdomain ld4 on ps.PipeManufacturerDomainId = ld4.id"""

# Release the connection even when one of the queries raises.
try:
    mop = pd.read_sql_query(q1, conn)
    ps = pd.read_sql_query(q2, conn)
finally:
    conn.close()

# Replace the column by name rather than through `.loc[:, ...]`: pandas 2.x
# treats the `.loc[:, col] = ...` form as an in-place write into the existing
# column, which can leave the dtype as `object` instead of datetime64.
ps['PipeInserviceDate'] = pd.to_datetime(ps['PipeInserviceDate'])

In [None]:
# Summary statistics of the DesignPressure column of `mop`.
mop.describe(include='all')['DesignPressure']

In [None]:
# Summary statistics of the pipe-segment attributes of interest, including the
# four decoded domain columns added by the query.
ps.describe(include='all')[[
    'PipeInserviceDate',
    'PipeOutsideDiameter',
    'PipeWallThickness',
    'PipeGrade',
    'PipeToughness',
    'material_type',
    'seam_type',
    'gwd_type',
    'manufacturer',
]]

Load the data variable list and filter its fields so that only the event tables remain

In [None]:
# Read the data variable list workbook (path is relative to the notebook's
# working directory).
dvl = pd.read_excel(io="OneGas Qualitative R-data-vars_2019-09-18.xlsx")

In [None]:
# Keep only rows with no DataConfigVariable whose DataTableType is present and
# is neither 'Virtual' nor 'Unknown'.
has_no_config_variable = dvl.DataConfigVariable.isnull()
has_usable_table_type = dvl.DataTableType.notnull() & ~dvl.DataTableType.isin(['Virtual', 'Unknown'])

# .copy() makes `tbls` an independent frame, so adding a column to it later
# neither triggers SettingWithCopyWarning nor depends on `dvl`.
tbls = dvl.loc[has_no_config_variable & has_usable_table_type,
               ['TableName', 'ColumnName', 'DataTableType']].copy()
tbls

In [None]:
# Mark columns whose name contains 'DomainId' as categorical; every other row
# gets no value in `DataType`.
# na=False keeps a missing ColumnName from producing NaN in the mask (which
# `.loc` rejects); regex=False matches the literal substring.
is_domain_id = tbls.ColumnName.str.contains('DomainId', na=False, regex=False)
tbls.loc[is_domain_id, 'DataType'] = 'category'
tbls

Define a function that queries every value of the given column from the given table, closes the connection, and returns the column's summary statistics

In [None]:
def get_table_from_IRAS(table,column):
    """Return summary statistics for one column of one IRAS table.

    Opens a new connection to the staging database, selects every value of
    ``column`` from ``table`` (aliased as ``<table>_<column>``), and always
    closes the connection before returning.

    Parameters
    ----------
    table : str
        Name of the table to query.
    column : str
        Name of the column to select from ``table``.

    Returns
    -------
    pandas.DataFrame
        ``describe(include='all')`` of the single-column query result.

    Raises
    ------
    Whatever ``pyodbc.connect`` or ``pandas.read_sql_query`` raise; the
    connection is still closed when the query fails.
    """
    driver = '{SQL Server Native Client 11.0}'
    server = 'sql2012'
    db = 'og_irasv6_stage'
    conn = pyodbc.connect("Driver="+driver+";Server="+server+";Database="+db+";Trusted_Connection=yes;")

    # NOTE(review): table and column are interpolated directly into the SQL
    # text (identifiers cannot be bound as query parameters), so they must come
    # from a trusted source.
    qd = f"""set nocount on;
            select {column} as [{table}_{column}] from {table}"""
    try:
        qdf = pd.read_sql_query(qd,conn)
    finally:
        # This function is called once per table/column pair, so an unclosed
        # connection here would leak one connection per call.
        conn.close()
    return qdf.describe(include='all')

`DataFrame.apply` runs the query for every table/column pair in the filtered dataframe from the step above

In [None]:
# Run the summary query once per row of `tbls`, passing that row's table and
# column names.
analysis = tbls.apply(
    lambda row: get_table_from_IRAS(row['TableName'], row['ColumnName']),
    axis=1,
)

Combining all results into a single dataframe for ease of analysis

In [None]:
# Start from an empty frame indexed by the numeric describe() statistics, then
# outer-join each per-column summary onto it by index so that statistics
# present in only some summaries are kept.
temp = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
for column_summary in analysis:
    temp = temp.merge(column_summary, how='outer', left_index=True, right_index=True)

In [None]:
# Display the combined summary, one column per queried table/column pair.
temp

In [None]:
# List the contents of the current working directory (IPython line magic).
%ls