Declaration of libraries

In [55]:
import pandas as pd
import numpy as np
import pyodbc
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

SQL query to gather some sample data

In [22]:
driver = '{SQL Server Native Client 11.0}'
server = 'sql2012'
db = 'og_irasv6_stage'
conn = pyodbc.connect("Driver="+driver+";Server="+server+";Database="+db+";Trusted_Connection=yes;")

# performing query to database

q1 = """set nocount on;
        select * from mop"""

q2 = """set nocount on;
        select ld1.code [material_type], ld2.code [seam_type], ld3.code [gwd_type], ld4.code [manufacturer], ps.* from pipesegment ps
        join listdomain ld1 on ps.PipeMaterialDomainId = ld1.id
        join listdomain ld2 on ps.PipeSeamDomainId = ld2.id
        join listdomain ld3 on ps.PipeGirthWeldDomainID = ld3.id
        join listdomain ld4 on ps.PipeManufacturerDomainId = ld4.id"""

mop = pd.read_sql_query(q1,conn)
ps = pd.read_sql_query(q2,conn)
ps.loc[:,'PipeInserviceDate'] =  pd.to_datetime(ps.loc[:,'PipeInserviceDate'])

conn.close()

In [10]:
mop.describe(include='all').loc[:,'DesignPressure']

count      7204.000000
unique             NaN
top                NaN
freq               NaN
first              NaN
last               NaN
mean       5316.992000
std        4141.115604
min           6.894800
25%        3033.693200
50%        5102.120400
75%        5102.120400
max       68940.678100
Name: DesignPressure, dtype: float64

In [23]:
ps.describe(include='all').loc[:,['PipeInserviceDate',
                                  'PipeOutsideDiameter',
                                  'PipeWallThickness',
                                  'PipeGrade',
                                  'PipeToughness',
                                  'material_type',
                                  'seam_type',
                                  'gwd_type',
                                  'manufacturer']]

Unnamed: 0,PipeInserviceDate,PipeOutsideDiameter,PipeWallThickness,PipeGrade,PipeToughness,material_type,seam_type,gwd_type,manufacturer
count,0.0,1230.0,1218.0,1048.0,0.0,1230,1230,1230,1230
unique,0.0,,,,0.0,1,7,5,11
top,,,,,,Steel,Electric Resistance Weld (ERW),Unknown,Unknown
freq,,,,,,1230,724,1037,1080
first,,,,,,,,,
last,,,,,,,,,
mean,,11.193374,6.102603,308.926527,,,,,
std,,6.826422,1.635748,49.949615,,,,,
min,,1.315,2.77,206.0,,,,,
25%,,4.5,4.78,290.0,,,,,


Loading the data variable list, and filtering out the fields in order to only obtain event tables

In [47]:
dvl = pd.read_excel("OneGas Qualitative R-data-vars_2019-09-18.xlsx")

In [240]:
tbls = dvl.loc[dvl.DataConfigVariable.isnull(),['TableName','ColumnName','DataTableType']].query("DataTableType != 'Virtual' & DataTableType != 'Unknown' & ~DataTableType.isnull()")
tbls

Unnamed: 0,TableName,ColumnName,DataTableType
0,MOP,DesignPressure,Linear
1,PipeSegment,PipeOutsideDiameter,Linear
2,PipeSegment,PipeWallThickness,Linear
3,PipeSegment,PipeGrade,Linear
15,PipeSegment,PipeToughness,Linear
16,PipeSegment,PipeInserviceDate,Linear
17,PipeSegment,PipeInserviceDate,Linear
18,MAOP,MaxAllowablePressure,Linear
19,PipeSegment,PipeMaterialDomainId,Linear
20,LandUse,LandUseDomainId,Linear


In [244]:
tbls.loc[tbls.ColumnName.str.contains('DomainId'),'DataType'] = 'category'
# tbls.loc[tbls.DataType.isnull(),:]
# ~pd.isnull(tbls.head(1).values[:,3])
tbls

Unnamed: 0,TableName,ColumnName,DataTableType,DataType
0,MOP,DesignPressure,Linear,
1,PipeSegment,PipeOutsideDiameter,Linear,
2,PipeSegment,PipeWallThickness,Linear,
3,PipeSegment,PipeGrade,Linear,
15,PipeSegment,PipeToughness,Linear,
16,PipeSegment,PipeInserviceDate,Linear,
17,PipeSegment,PipeInserviceDate,Linear,
18,MAOP,MaxAllowablePressure,Linear,
19,PipeSegment,PipeMaterialDomainId,Linear,category
20,LandUse,LandUseDomainId,Linear,category


Declaration of a function that will perform a query to select all data from a table and column given, and close the connection

In [245]:
def get_table_from_IRAS(table,column):
    driver = '{SQL Server Native Client 11.0}'
    server = 'sql2012'
    db = 'og_irasv6_stage'
    conn = pyodbc.connect("Driver="+driver+";Server="+server+";Database="+db+";Trusted_Connection=yes;")

    qd = f"""set nocount on;
            select {column} as [{table}_{column}] from {table}"""
    qdf = pd.read_sql_query(qd,conn)
#     if ~pd.isnull(dtype):
#         qdf = qdf.astype('category')
#     conn.close()
    return qdf.describe(include='all')

APPLY statement is able to perform the query for every combination of table and column given from the filtered dataframe  from the step above

In [247]:
analysis = tbls.apply(lambda x: get_table_from_IRAS(x.TableName,x.ColumnName), axis=1)

Combining all results into a single dataframe for ease of analysis

In [248]:
temp = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
for x in analysis:
    temp = pd.merge(temp,x, left_index=True, right_index=True, how='outer')

In [249]:
temp

Unnamed: 0,MOP_DesignPressure,PipeSegment_PipeOutsideDiameter,PipeSegment_PipeWallThickness,PipeSegment_PipeGrade,PipeSegment_PipeToughness,PipeSegment_PipeInserviceDate_x,PipeSegment_PipeInserviceDate_y,MAOP_MaxAllowablePressure,PipeSegment_PipeMaterialDomainId,LandUse_LandUseDomainId,ClassArea_ClassAreaRatingDomainId,Coating_CoatingLineDomainId,PipeSegment_PipeManufacturerDomainId,PipeSegment_PipeSeamDomainId,OperatingData_OperatingTemperature,PipeSegment_PipeSpecificationDomainId,DamagePrevention_DmgPrevDeviceTypeDomainId,Casing_CasingTypeDomainId,CoatingCondition_CoatingCondLineFoundDomainId,Coating_CoatingInstallDate,Casing_CasingShortedFlg,PipeSegment_PipeGirthWeldDomainId,FrostArea_FrostAreaMin,EarthQuakeArea_EarthQuakeAreaRangeLD,LightningStrikeArea_LightningStrikeAreaMax,TestPostPoint_Id,OperatingData_Id,HCACFRSeg_RCReasonTypeId,HCACFRSeg_Id,InlineInspectionRange_ILIRStartDate,InlineInspectionRange_ILIRStatusDomainId,InlineInspectionFeature_ILIFFeatureNumber,InlineInspectionFeature_ILIFStatusDomainId,InlineInspectionFeature_ILIFSurfaceInd,InlineInspectionFeature_ILIFTypeDomainId,InlineInspectionFeature_ILIFFPR,InlineInspectionNonCrsnDfct_ILINCDFeatureNumber,InlineInspectionNonCrsnDfct_ILINCDTypeDomainId,HydroTestRange_HydroTestRngMinimumPressure,CPSurveyReading_Id,CPSurveyReading_CPSurveyReadingOff,CPSurveyReading_CPSurveyReadingOn,ElevationProfile_InclinationAngle,Fabrication_Id,Fabrication_FabricationTypeDomainId,CoverDepthProfile_CoverDepthReading,Incident_Id,Incident_ThreatTypeDomainId,Incident_DamageCount
25%,3033.6932,4.5,3.9624,241.31651,,,,1378.9515,7.0,104.0,64.0,113.0,5.0,8.0,21.0,6.0,,600749.0,,,0.0,10.0,0.254,600931.0,3.0,2.75,63637.75,17.0,8303.5,,100693.0,,710.0,,94.0,2.05,,100709.0,5171.0,1637894.0,872.0,1009.0,,3015.25,,1.143,1.25,700.0,
50%,5102.1204,6.625,5.969,289.57981,,,,1861.5845,7.0,106.0,603681.0,116.0,5.0,600899.0,21.0,6.0,,600749.0,,,0.0,10.0,0.508,600931.0,3.0,4.5,63638.5,17.0,66276.0,,100693.0,,710.0,,100769.0,2.06,,100763.0,7791.0,2003994.0,987.0,1184.0,,6029.5,,1.3716,1.5,700.0,
75%,5102.1204,12.75,7.112,358.52738,,,,3240.5359,7.0,106.0,603682.0,600757.0,5.0,600899.0,21.0,56.0,,600749.0,,,0.0,10.0,0.508,600931.0,4.0,6.25,63639.25,17.0,74275.5,,100693.0,,710.0,,100769.0,2.25425,,100770.0,9928.0,2370094.0,1111.0,1369.0,,9043.75,,1.7526,1.75,700.0,
count,7204.0,21145.0,20381.0,20962.0,0.0,19915,19915,5701.0,21144.0,3216.0,3125.0,17027.0,21128.0,1230.0,4.0,21141.0,0.0,973.0,0.0,10203,394.0,21144.0,1435.0,1422.0,1437.0,8.0,4.0,3553.0,31999.0,64,64.0,12572.0,12572.0,12572,12572.0,12572.0,4610.0,4610.0,11271.0,1464401.0,1464401.0,1464401.0,0.0,12058.0,0.0,4518.0,2.0,2.0,0.0
first,,,,,,1900-01-01 00:00:00,1900-01-01 00:00:00,,,,,,,,,,,,,1900-01-01 00:00:00,,,,,,,,,,2009-09-01 00:00:00,,,,,,,,,,,,,,,,,,,
freq,,,,,,874,874,,,,,,,,,,,,,959,,,,,,,,,,6,,9.0,,9797,,,9.0,,,,,,,,,,,,
last,,,,,,2018-11-30 00:00:00,2018-11-30 00:00:00,,,,,,,,,,,,,2018-11-30 00:00:00,,,,,,,,,,2018-10-04 00:00:00,,,,,,,,,,,,,,,,,,,
max,68940.6781,24.0,16.6624,448.15922,,,,68940.6781,10147.0,110.0,603684.0,601001.0,600880.0,600907.0,21.0,600888.0,,600750.0,,,1.0,600915.0,0.762,600931.0,6.0,8.0,63640.0,17.0,82275.0,,100693.0,,710.0,,100769.0,8.236,,100847.0,697590.8583,2736194.0,2076.0,4233.0,,12058.0,,12.7,2.0,700.0,
mean,5316.992,8.553846,5.918378,279.911523,,,,2374.195944,284.670261,105.698694,365904.63968,184835.100487,44996.739114,445058.707317,21.0,1413.051795,,600748.849949,,,0.180203,8567.142215,0.398877,600931.0,3.256785,4.5,63638.5,16.637771,48349.924404,,80244.8125,,710.0,,75532.409959,2.297187,,100746.541866,6718.731047,2003741.0,893.9413,1112.221,,6029.5,,1.500697,1.5,700.0,
min,6.8948,1.315,2.7686,165.47418,,,,482.633,7.0,103.0,63.0,14.0,5.0,8.0,21.0,6.0,,600745.0,,,0.0,10.0,0.0,600931.0,2.0,1.0,63637.0,0.0,288.0,,25.0,,710.0,,84.0,1.39,,100691.0,-68941.0,173461.0,-83.0,-436.0,,1.0,,0.0,1.0,700.0,


In [250]:
%ls

 Volume in drive Z is Projects
 Volume Serial Number is BC71-F23D

 Directory of Z:\ONEGAS\2019-08_IRAS Implementation\3_Engineering\Results & QC

2019-09-23  01:29 PM    <DIR>          .
2019-09-23  01:29 PM    <DIR>          ..
2019-09-18  02:11 PM    <DIR>          .ipynb_checkpoints
2019-09-10  10:41 AM    <DIR>          Data QC
2019-09-23  01:29 PM           107,833 ONEGAS Data Check.ipynb
2019-09-18  02:04 PM            41,481 OneGas Qualitative R-data-vars_2019-09-18.xlsx
2019-09-18  01:45 PM    <DIR>          Risk Projects
               2 File(s)        149,314 bytes
               5 Dir(s)  81,012,285,440 bytes free
