# Import Statements

In [1]:
import pandas as pd
import pyodbc
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import sqlite3

from IPython.display import display, Markdown


pd.set_option("display.max_columns", 999)
pd.set_option('display.max_rows', 500)
pd.set_option("display.expand_frame_repr",True)
pd.set_option("display.max_colwidth", 100)

In [15]:
folder = r"C:\Users\armando_borjas\Documents\TMC_20201023_QAQC"
%cd $folder
%ls

C:\Users\armando_borjas\Documents\TMC_20201023_QAQC
 Volume in drive C has no label.
 Volume Serial Number is FE48-FC0C

 Directory of C:\Users\armando_borjas\Documents\TMC_20201023_QAQC

2020-10-27  11:36 AM    <DIR>          .
2020-10-27  11:36 AM    <DIR>          ..
2020-10-26  02:24 PM         2,100,576 Class Location_Empty.csv
2020-10-26  02:25 PM               421 Class Location_Empty_Lines.csv
2020-10-26  02:18 PM            81,611 Land Use_Empty.csv
2020-10-26  01:32 PM               353 Maximum Allowable Operating Pressure (kPa)_Empty.csv
2020-10-26  01:32 PM               337 Pipe Outside Diameter (in)_Empty.csv
2020-10-26  01:33 PM               335 Pipe Wall Thickness (mm)_Empty.csv
2020-10-26  01:25 PM               327 PipeGrade_Empty.csv
2020-10-26  04:09 PM            85,562 Resident Damage Change.png
2020-10-26  01:40 PM         7,043,456 SafetyPopImpactAnalysis_Empty.csv
2020-10-26  02:47 PM               465 SafetyPopImpactAnalysis_Empty_Lines.csv
2020-10-26  03:14 

# Functions

In [16]:
def pct_error(col1,col2):
    """Return the absolute percentage error of ``col1`` relative to ``col2``.

    Works element-wise on scalars, numpy arrays and pandas Series.

    :param col1: value(s) being checked
    :param col2: reference value(s) the error is expressed against
    :return: ``|col1 - col2| / |col2| * 100``; always non-negative
    """
    # Divide by the magnitude of the reference: dividing by a signed col2
    # returned a negative "absolute" error whenever the reference was negative.
    return abs(col1 - col2)*100./abs(col2)

In [30]:
def sqlite_sql(q, path=r"TMC_20201023T1600_system.db"):
    """Run a query against a SQLite database file and return the result.

    :param q: SQL text to execute
    :param path: path of the SQLite database file (relative paths resolve
                 against the current working directory)
    :return: pandas DataFrame holding the query result
    """
    conn = sqlite3.connect(path)
    try:
        # Read sqlite query results into a pandas DataFrame
        return pd.read_sql_query(q, conn)
    finally:
        # Always release the file handle, even when the query fails.
        conn.close()

In [18]:
def sqlserver_sql(q, server = 'SQL2012', db = 'TMC_IRASV6_STAGE'):
    """Run a query against SQL Server over ODBC and return the result.

    Connects with Windows authentication (``Trusted_Connection=yes``).

    :param q: SQL text to execute
    :param server: SQL Server host / instance name
    :param db: database name
    :return: pandas DataFrame holding the query result
    """
    driver = '{SQL Server Native Client 11.0}'
    conn = pyodbc.connect("Driver="+driver+";Server="+server+";Database="+db+";Trusted_Connection=yes;")
    try:
        # performing query to database
        return pd.read_sql_query(q,conn)
    finally:
        # Close the connection even when the query raises, so failed
        # queries do not leave sessions open on the server.
        conn.close()


def oracle_sql(q,user=r'ATCO_IRASV5_STAGE', password='atco', dsn='ORA12C'):
    """Run a query against an Oracle database and return the result.

    :param q: SQL text to execute
    :param user: Oracle user / schema name
    :param password: password for ``user``
    :param dsn: Oracle data source name
    :return: pandas DataFrame holding the query result
    """
    # NOTE(review): the default credentials are hard-coded in the signature;
    # prefer passing them in from the environment or a prompt.
    # cx_Oracle is not imported in the notebook's import cell, so import it
    # here; otherwise the first call fails with NameError.
    import cx_Oracle

    conn = cx_Oracle.connect(user, password, dsn)
    try:
        return pd.read_sql_query(q,conn)
    finally:
        # Close the session even when the query raises.
        conn.close()

In [19]:
def get_ili_ranges(line):
    """Return the in-line inspection ranges whose line name contains ``line``.

    The query joins InlineInspectionRange to its StationSeries, LineLoop and
    status ListDomain rows and is executed through ``sqlserver_sql`` with that
    helper's default server and database.

    :param line: text to look for anywhere in ``LineLoop.LineName``
                 (used inside a ``LIKE '%...%'`` pattern)
    :return: pandas DataFrame ordered by ``ILIRStartDate`` descending
    """
    # The value is spliced into a quoted SQL literal, so double any single
    # quote; otherwise a name such as "O'Brien" breaks (or alters) the query.
    safe_line = str(line).replace("'", "''")
    q1 = f"""set nocount on;
            select ll.LineName, ld.code [status], r.* from InlineInspectionRange r
            join StationSeries ss on r.BeginStationSeriesId = ss.Id
            join LineLoop ll on ss.LineLoopId = ll.Id
            join ListDomain ld on r.ILIRStatusDomainId = ld.Id
            where ll.LineName like '%{safe_line}%'
            order by r.ILIRStartDate desc"""

    return sqlserver_sql(q1)

In [20]:
def build_df(od_i, wt_mm, grade_mpa, maop_kpa, installdate, ILIdate, pdf, lengthmm, create=True, **kwargs):
    """Build a one-row feature DataFrame, or add that row to an existing one.

    :param od_i: value stored in column ``OD_inch``
    :param wt_mm: value stored in column ``WT_mm``
    :param grade_mpa: value stored in column ``grade_MPa``
    :param maop_kpa: value stored in column ``MAOP_kPa``
    :param installdate: value stored in column ``install_date``
    :param ILIdate: value stored in column ``ILIRStartDate``
    :param pdf: value stored in column ``depth_fraction``
    :param lengthmm: value stored in column ``length_mm``
    :param create: when True return a new one-row frame; when False return
                   ``kwargs['df']`` with the new row added at the end
    :param kwargs: must contain ``df`` (the frame to extend) when ``create``
                   is False
    :return: pandas DataFrame; the frame passed as ``df`` is not modified
    :raises KeyError: if ``create`` is False and ``df`` was not supplied
    """
    new_row = pd.DataFrame(dict(OD_inch=[od_i],
                                WT_mm=[wt_mm],
                                grade_MPa=[grade_mpa],
                                install_date=[installdate],
                                MAOP_kPa=[maop_kpa],
                                ILIRStartDate=[ILIdate],
                                depth_fraction=[pdf],
                                length_mm=[lengthmm]
                                ))
    if create:
        return new_row

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result (existing index labels are kept, so the new row keeps label 0).
    return pd.concat([kwargs['df'], new_row])

In [21]:
def get_features_for_poe(ILIRID, start=1, amt=999999999):
    """Fetch the in-line inspection features of one inspection range.

    Builds a single SQL statement and runs it through ``sqlserver_sql`` (using
    that helper's default server and database). Each ``InlineInspectionFeature``
    row of the requested range is left-joined to:

    * lookup codes for status, type, tool and vendor (``ListDomain``);
    * sub-query ``a``: ``PipeSegment`` rows with ``EffectiveEndDate is null`` on
      the same line loop whose corrected begin/end range contains the
      feature's corrected chainage;
    * sub-query ``b``: ``maop`` rows with ``EffectiveEndDate is null``, matched
      the same way;
    * sub-query ``c``: a row number ``RN`` per inspection range, ordered by
      feature id, used for paging.

    Corrected chainage is ``StationNum*MultiplierNum+FactorNum`` taken from
    ``MLVCorrection``.

    :param ILIRID: ``InlineInspectionRangeId`` to select; interpolated directly
                   into the SQL text, so pass trusted numeric ids only
    :param start: first row number ``RN`` to return
    :param amt: offset added to ``start`` for the upper row-number bound
    :return: pandas DataFrame ordered by ``chainage_m`` ascending

    NOTE(review): the filter is ``RN between start and start+amt`` and BETWEEN
    is inclusive, so a page can hold up to ``amt + 1`` row numbers - confirm
    whether that is intended.
    NOTE(review): a feature whose chainage falls inside more than one matching
    PipeSegment or maop range would presumably appear more than once - verify
    against the data.
    """
    query = f"""select
            c.RN,
            ll.linename [line],

            ld4.code [vendor], 

            format(r.ILIRStartDate,'yyyy-MM-dd') [ILIRStartDate],
            ld3.code [tool],

            f.ILIFFeatureNumber [FeatureID],
            ld.code [status], 
            ld2.code [type],
            (f.StationNum*mlv.MultiplierNum+mlv.FactorNum) [chainage_m], 
            f.ILIFSurfaceInd, 
            f.ILIFPeakDepthPct [depth_fraction], 
            f.ILIFLength [length_mm], 
            f.ILIFWidth [width_mm], 

            format(a.PipeInserviceDate,'yyyy-MM-dd') [install_date],
            a.PipeOutsideDiameter [OD_inch],
            a.PipeWallThickness [WT_mm],
            a.PipeGrade [grade_MPa],
            a.PipeToughness [toughness_J],
            a.[begin_ps_c],
            a.[end_ps_c],

            b.MAOP_kPa,
            b.begin_maop_c,
            b.end_maop_c  

            from InlineInspectionFeature f

            left join ListDomain ld on f.ILIFStatusDomainId = ld.Id
            left join ListDomain ld2 on f.ILIFTypeDomainId = ld2.Id
            left join StationSeries ss on f.StationSeriesId = ss.id
            left join LineLoop ll on ss.LineLoopId = ll.Id
            left join inlineinspectionrange r on f.inlineinspectionrangeid = r.id
            left join ListDomain ld3 on r.ILIRToolDomainId = ld3.Id
            left join InlineInspection i on f.InlineInspectionId = i.Id
            left join ListDomain ld4 on i.ILICompanyDomainId = ld4.Id
            left join MLVCorrection mlv on f.StationSeriesId = mlv.StationSeriesId

            left join 
                (select ll.id [LinloopId],
                ll.LineName,
                ps.EffectiveStartDate,
                ps.PipeInserviceDate,
                ps.PipeOutsideDiameter,
                ps.PipeWallThickness,
                ps.PipeGrade,
                ps.PipeToughness,
                (ps.BeginStationNum*mlv1.MultiplierNum+mlv1.FactorNum) [begin_ps_c],
                (ps.EndStationNum*mlv2.MultiplierNum+mlv2.FactorNum) [end_ps_c]
                from PipeSegment ps
                join StationSeries ss on ps.BeginStationSeriesId = ss.id
                join LineLoop ll on ss.LineLoopId = ll.Id
                join MLVCorrection mlv1 on ps.BeginStationSeriesId = mlv1.StationSeriesId
                join MLVCorrection mlv2 on ps.EndStationSeriesId = mlv2.StationSeriesId
                where ps.EffectiveEndDate is null
                ) a on ((f.StationNum*mlv.MultiplierNum+mlv.FactorNum) between a.[begin_ps_c] and a.[end_ps_c]) and a.LinloopId = ll.id

            left join 
                (select ll.id [LinloopId],
                ll.LineName,
                maop.EffectiveEndDate,
                maop.BeginStationSeriesId,
                maop.MaxAllowablePressure [MAOP_kPa],
                maop.BeginStationNum*mlv1.MultiplierNum+mlv1.FactorNum [begin_maop_c],
                maop.EndStationNum*mlv2.MultiplierNum+mlv2.FactorNum [end_maop_c]
                from maop maop
                join StationSeries ss on ss.id = maop.BeginStationSeriesId
                join LineLoop ll on ss.LineLoopId = ll.Id
                join MLVCorrection mlv1 on maop.BeginStationSeriesId = mlv1.StationSeriesId
                join MLVCorrection mlv2 on maop.EndStationSeriesId = mlv2.StationSeriesId
                where maop.EffectiveEndDate is null
                ) b on ((f.StationNum*mlv.MultiplierNum+mlv.FactorNum) between b.[begin_maop_c] and b.[end_maop_c]) and b.[LinloopId] = ll.id 

            left join
                (select ROW_NUMBER() over(partition by ff.InlineInspectionRangeId order by ff.id asc)  [RN],
                ff.id,
                ff.InlineInspectionRangeId
                from InlineInspectionFeature ff
                )  c on f.Id = c.Id and f.InlineInspectionRangeId = c.InlineInspectionRangeId

            where f.InlineInspectionRangeId = {ILIRID} and (c.RN between {start} and {start+amt})
            order by chainage_m asc"""
    
    return sqlserver_sql(query)

In [22]:
def check_table_overlap(table, user=r'ATCO_IRASV5_STAGE', password='atco', dsn='ORA12C'):
    """Report the gap or overlap between consecutive records of an event table.

    Every record of ``table`` is loaded with its corrected begin/end chainage
    (``MultiplierNum*StationNum+FactorNum`` from MlvCorrection), sorted by
    ``BeginStationSeriesId`` and ``eBeginChainage``, and given a
    ``record_diff`` column: the next record's ``eBeginChainage`` minus this
    record's ``eEndChainage``, where "next" is taken within the same
    (``BeginStationSeriesId``, ``EndStationSeriesId``) group. The last record
    of each group has no successor and gets ``0.0``.

    :param table: name of the event table to check (interpolated into the SQL)
    :param user: database user passed to ``oracle_sql``
    :param password: database password passed to ``oracle_sql``
    :param dsn: data source name passed to ``oracle_sql``
    :return: DataFrame with ``RecordId``, ``eBeginChainage``, ``eEndChainage``
             and ``record_diff`` first, followed by the table's own columns.
             Filter with ``.query("record_diff != 0.")`` to keep only gaps and
             overlaps.
    """
    # NOTE(review): the statement uses SQL Server syntax ("set nocount on",
    # [bracketed] aliases) but is sent through oracle_sql - confirm which
    # backend this check is meant to target.
    # query for table to check for overlaps
    q1 = f"""set nocount on;
            select e.Id [RecordId],
            (mlv1.MultiplierNum*e.BeginStationNum+mlv1.FactorNum) [eBeginChainage],
            (mlv2.MultiplierNum*e.EndStationNum+mlv2.FactorNum) [eEndChainage], e.* from {table} e
            join MlvCorrection mlv1 on mlv1.StationSeriesId = e.BeginstationSeriesId
            join MlVcorrection mlv2 on mlv2.StationseriesId = e.EndStationSeriesId"""

    df2 = oracle_sql(q1, user=user, password=password, dsn=dsn)

    # sorting in increasing stationseriesid and chainage
    df2 = df2.sort_values(by=['BeginStationSeriesId','eBeginChainage']).reset_index(drop=True)

    # groupby().shift() keeps the row index of df2, so each difference stays on
    # the record it was computed for. The previous groupby().apply() followed
    # by reset_index() re-ordered the values group by group and then joined
    # them back by position, which mis-assigned them whenever records of
    # different groups were interleaved (and broke when only one group existed).
    next_begin = df2.groupby(['BeginStationSeriesId','EndStationSeriesId'])['eBeginChainage'].shift(-1)
    df2 = df2.assign(record_diff=(next_begin - df2['eEndChainage']).fillna(0.00))

    # move record_diff next to the three leading columns
    cols = list(df2.columns)
    return df2[cols[:3] + [cols[-1]] + cols[3:-1]]

In [23]:
def check_table_span(table,  user=r'ATCO_IRASV5_STAGE', password='atco', dsn='ORA12C'):
    """Compare the chainage span covered by an event table with the span of
    the station series, per line loop.

    For each ``LineLoopId`` the minimum begin chainage and maximum end chainage
    are taken from the station series and from ``table``; only line loops where
    the two spans differ at either end are returned. This compares the overall
    extents only - it does not pick out gaps inside the span.

    :param table: name of the event table to check (interpolated into the SQL)
    :param user: database user passed to ``oracle_sql``
    :param password: database password passed to ``oracle_sql``
    :param dsn: data source name passed to ``oracle_sql``
    :return: DataFrame indexed by ``LineLoopId`` with the station-series span,
             the event span, and ``delta_end`` / ``delta_begin``
             (station series minus event)
    """
    # event records with corrected chainages and the line loop they belong to
    event_query = f"""set nocount on;
            select ss.lineloopid [LineLoopId],
            (mlv1.MultiplierNum*e.BeginStationNum+mlv1.FactorNum) [eBeginChainage],
            (mlv2.MultiplierNum*e.EndStationNum+mlv2.FactorNum) [eEndChainage], e.* from {table} e
            join MlvCorrection mlv1 on mlv1.StationSeriesId = e.BeginstationSeriesId
            join MlVcorrection mlv2 on mlv2.StationseriesId = e.EndStationSeriesId
            join stationseries ss on e.Beginstationseriesid = ss.id
            """

    # station series with corrected chainages
    series_query = """set nocount on;
            select 
            (mlv.MultiplierNum*ss.BeginStationNum+mlv.FactorNum) [sBeginChainage],
            (mlv.MultiplierNum*ss.EndStationNum+mlv.FactorNum) [sEndChainage], ss.* from stationseries ss
            join MlvCorrection mlv on mlv.StationSeriesId = ss.id
            """

    events = oracle_sql(event_query, user=user, password=password, dsn=dsn)
    series = oracle_sql(series_query, user=user, password=password, dsn=dsn)

    # overall extent per line loop, from each source
    series_span = series.groupby("LineLoopId").agg({'sBeginChainage':'min','sEndChainage':'max'})
    event_span = events.groupby("LineLoopId").agg({'eBeginChainage':'min','eEndChainage':'max'})

    spans = series_span.join(event_span)
    spans = spans.assign(delta_end=spans.sEndChainage - spans.eEndChainage,
                         delta_begin=spans.sBeginChainage - spans.eBeginChainage)

    # keep only the line loops whose spans do not match at one end or the other
    return spans.query("(delta_end!=0.) | (delta_begin!=0.)")

In [24]:
def impact_fault_tree(b1,b2,b3,b4,b5,b7,b9,b10,b11,b12,b6=0.40,b8=0.97):
    """Evaluate the impact fault tree for the basic-event values b1..b12.

    AND gates are products of their inputs; OR gates are
    ``1 - (1 - x) * (1 - y)``. The tree is evaluated bottom-up:

    * ``g1 = (b2 and b3 and b4) or (b5 and b6)``
    * ``branch_a = b7 and b8 and g1``
    * ``g2 = (b6 and b9) or (b6 and b10)``
    * ``branch_b = b4 and g2``
    * ``g3 = branch_a or branch_b``
    * ``g4 = g3 or b11``
    * result ``= b1 and b12 and g4``

    Inputs may be scalars or element-wise compatible arrays.

    :return: value of the top event
    """
    g1 = 1 - ((1-(b2 * b3 * b4)) * (1-(b5 * b6)))
    branch_a = b7 * b8 * g1

    g2 = 1 - ((1-(b6 * b9)) * (1-(b6 * b10)))
    branch_b = b4 * g2

    g3 = 1 - ((1-branch_a) * (1-branch_b))
    g4 = 1 - ((1-g3) * (1-b11))

    return b1 * b12 * g4

In [25]:
def dfstats(x):
    """Summarise the columns of the notebook-level ``results`` DataFrame.

    Builds one row per column of ``results`` holding the
    ``describe(include='all')`` statistics plus the column dtype (in a column
    named ``type``), keeps the rows selected by ``x`` and sorts them by dtype.

    :param x: row selector applied with ``.loc``. Either a callable taking the
              summary frame and returning a boolean mask, or - as before - a
              string containing Python source that evaluates to such a selector
              (for example ``"lambda x: x['count'] > 0"``)
    :return: the filtered summary DataFrame, sorted by ``type``
    """
    display(Markdown("# Object Variable Data Summary"))
    summary = pd.concat([results.describe(include='all').T, results.dtypes.rename("type")], axis=1)
    # NOTE(review): eval() executes arbitrary code, so only trusted strings may
    # be passed. Callables are now accepted directly and skip eval() entirely.
    selector = eval(x) if isinstance(x, str) else x
    return summary.loc[selector].sort_values("type")

In [26]:
def return_aggregate_empty(df, column, text=None):
    """Measure how much pipe length has no value in ``column``.

    :param df: DataFrame with at least the columns ``Line Name``,
               ``Begin Measure (m)``, ``End Measure (m)``, ``Length (m)`` and
               ``column``
    :param column: name of the column to check for nulls
    :param text: free text shown next to the column name in the heading
    :return: tuple ``(detail, per_line)``. ``detail`` is the selected columns
             plus ``emptyLength`` (the row's ``Length (m)`` where ``column`` is
             null, otherwise 0.0). ``per_line`` is the total ``Length (m)`` and
             ``emptyLength`` per ``Line Name``, limited to lines with
             ``emptyLength > 0``
    """
    display(Markdown(f"{column}: - {text}"))
    temp =  df[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)',column]]\
    .assign(emptyLength=lambda x: np.where(x[column].isnull(),x['Length (m)'],0.00))

    # Pick the two length columns before summing: summing every column first
    # also tried to aggregate `column` itself, which is wasted work and can
    # fail or concatenate text when that column is not numeric.
    per_line = temp.groupby("Line Name")[['Length (m)','emptyLength']].sum().query('emptyLength > 0')

    return temp, per_line

# Data QC

In [39]:
q1 = """
        select 
        ld.code [type],a.count_r,
        s.*
        from Structure s
        join listdomain ld on s.structuretypedomainid = ld.id
        left join (select
                    sv.StructureId, count(sv.StructureVertexSeq) as [count_r]
                    from StructureVertex sv
                    group by sv.StructureId) a on a.StructureId = s.id"""

iras = sqlserver_sql(q1)

q2 = """
        select
        OBJECTID,structuretype,ICBO_structuretype
        from P_STRUCTUREFOOTPRINT s"""

updm = sqlserver_sql(q2, server='sql2014', db='TMC_UPDM_SOURCE')

q3 = """
        select
        OBJECTID,identifiedsitetype,ICBO_structuretype
        from P_OUTSIDEAREA s"""

updm2 = sqlserver_sql(q3, server='sql2014', db='TMC_UPDM_SOURCE')

In [42]:
display(iras.loc[lambda x: x.count_r.isnull(),:].type.value_counts())

Garage / Shed             10473
Home                       5368
Condominium-Townhouse       376
Plant                       169
Factory                     103
Business-Urban              100
Warehouse                    84
Playground                   48
Business-Rural               41
Park                         38
Golf Course                  17
Church-Urban                 13
Gas Station                  12
Campground                   12
Motel                        12
Office                       12
Community Center-Urban       11
Strip Mall                    9
Lumberyard                    8
School-Urban                  7
Sports Field                  6
Rest Area                     6
Apartment                     5
Runway                        3
School-Rural                  2
Recreation                    2
Arena-Rural                   2
Cemetery                      2
Fire Hall                     2
Restaurant-Rural              2
Rodeo Grounds                 1
Restaura

## SQLite Data QC 

In [31]:
q = """SELECT name FROM sqlite_master
    WHERE type='table'
    ORDER BY name;"""

sqlite_sql(q)

Unnamed: 0,name
0,Pipeline_ME-SAF
1,Summary_TMC_20201023T1600_system_Default_Summary


In [32]:
df = sqlite_sql("select * from 'Pipeline_ME-SAF'")

In [20]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249953 entries, 0 to 249952
Columns: 138 entries, Corporation to _Matrix_Row_Label
dtypes: float64(89), int64(1), object(48)
memory usage: 807.1 MB


In [27]:
pd.DataFrame(df.columns, columns=["col"]).merge(df.dtypes.rename("dtype"), left_on='col', right_index=True)

Unnamed: 0,col,dtype
0,Corporation,object
1,Operating Unit,object
2,District Division,object
3,Pipeline System,object
4,Line Name,object
5,LineID,int64
6,GridStartMeasure,float64
7,GridEndMeasure,float64
8,Begin Measure (m),float64
9,End Measure (m),float64


### Numeric Data Types 

In [31]:
numerics = df.select_dtypes(include=[np.int64, np.float64]).columns.values

In [33]:
df[numerics].info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249953 entries, 0 to 249952
Data columns (total 90 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   LineID                                                 249953 non-null  int64  
 1   GridStartMeasure                                       249953 non-null  float64
 2   GridEndMeasure                                         249953 non-null  float64
 3   Begin Measure (m)                                      249953 non-null  float64
 4   End Measure (m)                                        249953 non-null  float64
 5   Length (m)                                             249953 non-null  float64
 6   Mechanical Damage Failure Frequency perkmyr            249953 non-null  float64
 7   MD Failure Frequency perkmyr                           249953 non-null  float64
 8   Mechanical Damage Immediate Failur

Checking the following columns:
   1. Grade
   2. MAOP
   3. OD
   4. WT
   5. Safety Pop Impact Batched1 (-)
   6. Safety Pop Impact Batched2 (-)
   7. Safety Pop Impact Unbatched (-)

In [86]:
idx = pd.IndexSlice
field = 'Pipe Grade (MPa)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field]#.to_csv("PipeGrade_Empty")

In [101]:
idx = pd.IndexSlice
field = 'Maximum Allowable Operating Pressure (kPa)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field]#.to_csv(f"{field}_Empty.csv")

In [104]:
idx = pd.IndexSlice
field = 'Pipe Outside Diameter (in)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field].to_csv(f"{field}_Empty.csv")

In [106]:
idx = pd.IndexSlice
field = 'Pipe Wall Thickness (mm)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field].to_csv(f"{field}_Empty.csv")

In [191]:
idx = pd.IndexSlice
field = 'Safety Pop Impact Batched1 (-)'
field2 = 'Safety Pop Impact Batched2 (-)'
field3 = 'Safety Pop Impact Unbatched (-)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull() & `{field2}`.isnull()  & `{field3}`.isnull()")[[field,field2,field3]].reset_index()['Line Name'].value_counts()#.to_csv(f"SafetyPopImpactAnalysis_LinesEmpty.csv")

TMPL HARGREAVES-DARFIELD 24IN KM    46124
TMPL EDMONTON-EDSON 24IN KM         32576
TMPL ANCHOR LOOP 36IN KM            20095
TMPL SUMAS-BURNABY 24IN KM          15486
TMPL EDSON-HINTON 30IN KM           12702
TMPL SUMAS-SUMASTANKFARM 24IN KM     1090
PUGET SUMAS-BORDER 24IN KM            644
TMPL PETROCAN PRODUCTS 20IN KM         24
TMPL HINTON-HARGREAVES 24IN KM          1
Name: Line Name, dtype: int64

In [106]:
idx = pd.IndexSlice
field = 'Pipe Wall Thickness (mm)'
df[np.append(['Line Name'], numerics)].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field].to_csv(f"{field}_Empty.csv")

### Object Data Types

In [147]:
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so select object-typed columns with `object`.
objects = df.select_dtypes(include=[object]).columns.values
# keep the measure columns alongside the object columns for context
objects = np.append(['Begin Measure (m)', 'End Measure (m)', 'Length (m)'], objects)

In [183]:
df[objects].info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249953 entries, 0 to 249952
Data columns (total 51 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   Begin Measure (m)                                 249953 non-null  float64
 1   End Measure (m)                                   249953 non-null  float64
 2   Length (m)                                        249953 non-null  float64
 3   Corporation                                       249953 non-null  object 
 4   Operating Unit                                    249953 non-null  object 
 5   District Division                                 249953 non-null  object 
 6   Pipeline System                                   249953 non-null  object 
 7   Line Name                                         249953 non-null  object 
 8   Sleeve Type                                       1943 non-null    object 
 9   Casi

Checking the following columns:
   1. Sleeve Type
   2. Land Use
   3. Class Location

In [158]:
idx = pd.IndexSlice
field = 'Sleeve Type'
df[objects].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"~`{field}`.isnull()")[field].to_csv(f"{field}_NonEmpty")

In [178]:
df[field].value_counts().to_frame().stack().to_frame().reorder_levels([1,0]).rename(columns={0:f"count"})

Unnamed: 0,Unnamed: 1,count
Sleeve Type,Unknown,1231
Sleeve Type,Petrosleeve,476
Sleeve Type,Composite Sleeve,123
Sleeve Type,TYPE B,113


In [181]:
idx = pd.IndexSlice
field = 'Land Use'
df[objects].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field].to_csv(f"{field}_Empty.csv")

In [182]:
df[field].value_counts().to_frame().stack().to_frame().reorder_levels([1,0]).rename(columns={0:f"count"})

Unnamed: 0,Unnamed: 1,count
Land Use,FOREST,154638
Land Use,AGRICULTURAL AREAS,43707
Land Use,DEVELOPED,39070
Land Use,UNDEVELOPED NATIVE PRAIRIE,11005


In [185]:
idx = pd.IndexSlice
field = 'Class Location'
df[objects].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[field].to_csv(f"{field}_Empty.csv")

In [187]:
idx = pd.IndexSlice
df[objects].set_index(['Line Name', 'Begin Measure (m)', 'End Measure (m)'])\
.query(f"`{field}`.isnull()")[[field]].reset_index()['Line Name'].value_counts().to_csv(f"{field}_Empty_Lines.csv")

In [188]:
df[field].value_counts().to_frame().stack().to_frame().reorder_levels([1,0]).rename(columns={0:f"count"})

Unnamed: 0,Unnamed: 1,count
Class Location,1,168036
Class Location,2,25607
Class Location,3,16044
Class Location,4,1730


# Risk Results Review

In [65]:
for x in results['Line Name'].unique():
    print(f'-\t{x}')

-	PUGET BORDER-BURLINGTON 20IN KM
-	PUGET BURLINGTON-ANACORTES 16IN KM
-	PUGET LAUREL-FERNDALE 16IN KM
-	PUGET SUMAS-BORDER 24IN KM
-	TMPL ANCHOR LOOP 36IN KM
-	TMPL BURNABY-WESTRIDGE 24IN KM
-	TMPL DARFIELD-BLACK PINES 24IN
-	TMPL DARFIELD-KAMLOOPS 30IN KM
-	TMPL EDMONTON-EDSON 24IN KM
-	TMPL EDSON-HINTON 30IN KM
-	TMPL HARGREAVES-DARFIELD 24IN KM
-	TMPL HINTON-HARGREAVES 24IN KM
-	TMPL KAMLOOPS-SUMAS 24IN KM
-	TMPL PETROCAN PRODUCTS 20IN KM
-	TMPL SUMAS-BURNABY 24IN KM
-	TMPL SUMAS-SUMASTANKFARM 20IN KM
-	TMPL SUMAS-SUMASTANKFARM 24IN KM


## QC EC IC SAF 

In [11]:
%%time
results = pd.read_csv(r"Pipeline_EC_IC_SAF.csv")

# results = pd.read_excel(r"",skiprows=3, header=0)
results.info(memory_usage='deep')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230855 entries, 0 to 230854
Data columns (total 81 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Pipeline System                               230855 non-null  object 
 1   Line Name                                     230855 non-null  object 
 2   LineID                                        230855 non-null  int64  
 3   GridStartMeasure                              230855 non-null  float64
 4   GridEndMeasure                                230855 non-null  float64
 5   Begin Measure (m)                             230855 non-null  float64
 6   End Measure (m)                               230855 non-null  float64
 7   Length (m)                                    230855 non-null  float64
 8   External Corrosion Failure Frequency perkmyr  230855 non-null  float64
 9   Sleeve Type                                   18

In [12]:
(100.00 - results.describe(include='all').loc['count',:][lambda x: x < 230855]*100.00/230855).sort_values()

Land Use                                  0.6268
POE Date of 2nd Last ILI MFL             11.1421
Mixture Density Batch 2 lbperft3         15.4144
Safety Score Batch1 Rupture              15.4144
Hazard Area Batch2 Rupture sqft          15.4144
Hazard Area Batch1 Rupture sqft          15.4144
Hazard Impact Batch1 Rupture             15.4144
Release Mode Batch2 Rupture              15.4144
Release Mode Batch1 Rupture              15.4144
Release Rate Batch2 Rupture lbpersec     15.4144
Release Rate Batch1 Rupture lbpersec     15.4144
Batch 1 Time Pct                         15.4144
Mixture Density Batch 1 lbperft3         15.4144
Safety Score Batch2 Rupture              15.4144
Batch Product 1                          15.4144
Batch Product 2                          15.4144
SAF Batch 1 Time percentage (-)          15.4144
Batched Product                          15.4144
Hazard Impact Batch2 Rupture             15.4144
Batch Product Flag                       15.4144
EC POE Rupture peryr

In [13]:
display(results.columns.to_frame().rename(columns={0:'cols'}).query("cols.str.contains('')").sort_values(by='cols'))
# results.drop(columns=results.columns.to_frame().rename(columns={0:'cols'}).query("cols.str.contains('.\d$')").cols.values, inplace=True)

Unnamed: 0,cols
Batch 1 Time Pct,Batch 1 Time Pct
Batch Product 1,Batch Product 1
Batch Product 2,Batch Product 2
Batch Product Flag,Batch Product Flag
Batched Product,Batched Product
Begin Measure (m),Begin Measure (m)
CNW2 Count,CNW2 Count
DWA2 Count,DWA2 Count
Date of last ILI MFL,Date of last ILI MFL
EC ILI Failure Frequency perkmyr,EC ILI Failure Frequency perkmyr


In [31]:
dfstats("lambda x: (x['count'] > 0) & (x['type']=='float64') > 0")

# Object Variable Data Summary

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,type
GridStartMeasure,230855,,,,580780.0,334180.0,0.0,329850.0,567857.0,875681.0,1147280.0,float64
Mixture Density Batch 1 lbperft3,195270,,,,39.079,2.13163e-14,39.079,39.079,39.079,39.079,39.079,float64
Mixture Density Batch 2 lbperft3,195270,,,,48.383,2.84218e-14,48.383,48.383,48.383,48.383,48.383,float64
Mixture Density lbperft3,35585,,,,41.813,4.67962,39.079,39.079,39.079,48.383,56.187,float64
Release Rate Batch1 Rupture lbpersec,195270,,,,44225.9,18730.6,28068.6,34954.5,36597.1,41436.4,104779.0,float64
Release Rate Batch2 Rupture lbpersec,195270,,,,49209.8,20841.4,31231.7,38893.6,40721.3,46105.9,116587.0,float64
Hazard Area sqft,35585,,,,1977680.0,975813.0,280564.0,491313.0,2561720.0,2610610.0,2824640.0,float64
Population Density persqmi,230855,,,,1166.9,1448.11,15.2,15.2,15.2,3176.0,3176.0,float64
HPA2 Count,230855,,,,0.498321,0.754789,0.0,0.0,0.0,1.0,2.0,float64
DWA2 Count,230855,,,,1.32843,0.765352,0.0,1.0,2.0,2.0,2.0,float64


In [13]:
display(Markdown("# Empty Land Use"))
results.loc[lambda x: x['Land Use'].isnull(),['Line Name','Begin Measure (m)','End Measure (m)','Length (m)','Land Use']]\
.groupby('Line Name').agg(['sum','count'])['Length (m)'].sort_values('sum')

# Empty Land Use

Unnamed: 0_level_0,sum,count
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL SUMAS-SUMASTANKFARM 20IN KM,38.509,11
TMPL SUMAS-SUMASTANKFARM 24IN KM,38.639,12
TMPL BURNABY-WESTRIDGE 24IN KM,42.783,11
PUGET LAUREL-FERNDALE 16IN KM,58.974,9
PUGET BORDER-BURLINGTON 20IN KM,89.818,4
PUGET BURLINGTON-ANACORTES 16IN KM,206.491,17
TMPL ANCHOR LOOP 36IN KM,472.272,117
TMPL DARFIELD-KAMLOOPS 30IN KM,605.162,105
TMPL SUMAS-BURNABY 24IN KM,726.575,113
TMPL EDMONTON-EDSON 24IN KM,730.486,119


In [14]:
(100.00 - results.describe(include='all').loc['count',:][lambda x: x == 230855]*100.00/230855).sort_values()

Pipeline System                                 0
Internal ILI Feature Count dynseg               0
IC ILI Failure Frequency perkmyr                0
Impact on Population Rupture (50%)              0
Product Type                                    0
Population Density persqmi                      0
HPA2 Count                                      0
DWA2 Count                                      0
ESA2 Count                                      0
OPA2 Count                                      0
CNW2 Count                                      0
USA2 Count                                      0
NCA2 Count                                      0
Population Density HCA persqmi                  0
Population Density Land Use persqmi             0
Matrix_Total_Probability                        0
Matrix_Total_Consequence                        0
_Matrix_Legend_Label                            0
Internal ILI Feature Count                      0
Internal Corrosion Failure Frequency perkmyr    0


In [15]:
display(Markdown("# Check on PipeSegment"))
results[['Pipe Wall Thickness (mm)','Pipe Outside Diameter (mm)','Pipe Grade (MPa)','Pipe Install Date']]\
.assign(date = lambda x: pd.to_datetime(x['Pipe Install Date'])).describe(include='all')

# Check on PipeSegment

Unnamed: 0,Pipe Wall Thickness (mm),Pipe Outside Diameter (mm),Pipe Grade (MPa),Pipe Install Date,date
count,230855.0,230855.0,230855.0,230855,230855
unique,,,,39,39
top,,,,1953-01-01 00:00:00,1953-01-01 00:00:00
freq,,,,172615,172615
first,,,,,1953-01-01 00:00:00
last,,,,,2019-01-01 00:00:00
mean,8.467732,650.62553,369.125079,,
std,1.712191,98.894199,36.905406,,
min,6.35,406.4,289.0,,
25%,7.92,609.6,359.0,,


In [16]:
display(Markdown('# Following segments have MAOP > 80% of Pressure @ SMYS'))
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)','Pipe Wall Thickness (mm)','Pipe Outside Diameter (mm)','Pipe Grade (MPa)','Maximum Allowable Operating Pressure (kPa)']]\
.assign(equiv_press = lambda x: np.round((2000.*x['Pipe Grade (MPa)']*x['Pipe Wall Thickness (mm)']*0.8)/x['Pipe Outside Diameter (mm)'], 3))\
.query("equiv_press < `Maximum Allowable Operating Pressure (kPa)`")

# Following segments have MAOP > 80% of Pressure @ SMYS

Unnamed: 0,Line Name,Begin Measure (m),End Measure (m),Length (m),Pipe Wall Thickness (mm),Pipe Outside Diameter (mm),Pipe Grade (MPa),Maximum Allowable Operating Pressure (kPa),equiv_press
4772,PUGET BURLINGTON-ANACORTES 16IN KM,14529.725,14530.412,0.687,6.35,406.4,289.0,7694.552,7225.0
4773,PUGET BURLINGTON-ANACORTES 16IN KM,14530.412,14530.974,0.562,6.35,406.4,289.0,7694.552,7225.0
4795,PUGET BURLINGTON-ANACORTES 16IN KM,14556.845,14564.558,7.713,6.35,406.4,289.0,7676.626,7225.0
4796,PUGET BURLINGTON-ANACORTES 16IN KM,14564.558,14567.873,3.315,6.35,406.4,289.0,7676.626,7225.0
92268,TMPL EDSON-HINTON 30IN KM,311983.987,311984.889,0.902,9.8,762.0,482.0,9930.0,9918.32
92269,TMPL EDSON-HINTON 30IN KM,311984.889,311990.0,5.111,9.8,762.0,482.0,9930.0,9918.32
92270,TMPL EDSON-HINTON 30IN KM,311990.0,311990.6,0.6,9.8,762.0,482.0,9930.0,9918.32
92271,TMPL EDSON-HINTON 30IN KM,311990.6,311995.703,5.103,9.8,762.0,482.0,9930.0,9918.32
92272,TMPL EDSON-HINTON 30IN KM,311995.703,312001.213,5.51,9.8,762.0,482.0,9919.073,9918.32
213244,TMPL PETROCAN PRODUCTS 20IN KM,6.68,10.274,3.594,9.52,508.0,359.0,10764.347,10764.346


In [17]:
display(Markdown("# ILI Check - segments with ILI date older than pipe install date"))
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['Date of last ILI MFL','Pipe Install Date']].apply(lambda x: pd.to_datetime(x)).query("`Date of last ILI MFL`<=`Pipe Install Date`"), how='inner')\
.groupby("Line Name").agg(['sum','count'])

# ILI Check - segments with ILI date older than pipe install date

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m)
Unnamed: 0_level_1,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
TMPL EDMONTON-EDSON 24IN KM,3310739.934,21,3310849.787,21,109.853,21
TMPL PETROCAN PRODUCTS 20IN KM,27754.194,42,28545.09,42,790.896,42


In [31]:
display(Markdown("# HCA datasets"))
results[['Line Name','HPA2 Count','DWA2 Count','ESA2 Count','OPA2 Count','CNW2 Count','USA2 Count','NCA2 Count']].groupby('Line Name').sum()#.any()

# HCA datasets

Unnamed: 0_level_0,HPA2 Count,DWA2 Count,ESA2 Count,OPA2 Count,CNW2 Count,USA2 Count,NCA2 Count
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PUGET BORDER-BURLINGTON 20IN KM,1880.0,4405.0,0.0,4552.0,0.0,0.0,0.0
PUGET BURLINGTON-ANACORTES 16IN KM,128.0,309.0,0.0,514.0,100.0,0.0,0.0
PUGET LAUREL-FERNDALE 16IN KM,284.0,1700.0,0.0,2264.0,0.0,0.0,0.0
PUGET SUMAS-BORDER 24IN KM,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TMPL ANCHOR LOOP 36IN KM,443.0,12260.0,0.0,253.0,0.0,0.0,0.0
TMPL BURNABY-WESTRIDGE 24IN KM,1892.0,1496.0,0.0,0.0,405.0,0.0,0.0
TMPL DARFIELD-BLACK PINES 24IN,2213.0,8113.0,0.0,1011.0,0.0,0.0,0.0
TMPL DARFIELD-KAMLOOPS 30IN KM,15250.0,21549.0,0.0,2648.0,0.0,0.0,0.0
TMPL EDMONTON-EDSON 24IN KM,17728.0,39472.0,0.0,17681.0,0.0,0.0,0.0
TMPL EDSON-HINTON 30IN KM,5772.0,13406.0,0.0,3421.0,0.0,0.0,0.0


In [36]:
results[['SAF Batch 1 Time percentage (-)','Batched Product']].count()

SAF Batch 1 Time percentage (-)    195270
Batched Product                    195270
dtype: int64

### QC of POE (EC/IC) 

In [None]:
def critical_depth_modified_b31g(od, wt, s, p, fL, units="SI"):
    """
    Calculates the critical depth of a metal-loss feature using the Modified
    B31G equation, i.e. the depth at which the predicted failure stress equals
    the hoop stress produced by pressure ``p``.

    Inputs may be scalars or array-likes (e.g. pandas Series); they are
    combined element-wise.

    :param od:  Pipe outside diameter, in mm (SI), or inches (US)
    :param wt:  Pipe wall thickness, in mm (SI), or inches (US)
    :param s:   Pipe grade, in kPa (SI), or psi (US)
    :param p:   pressure, in kPa (SI), or psi (US)
    :param fL:  feature length, in mm (SI), or inches (US)
    :param units: flag for which units to use, "SI" or "US", default "SI".
                  Any value other than "SI" is treated as "US".
    :return: Critical depth, in mm (SI), or inches (US). The value is not
             capped at any fraction of the wall thickness.
    """

    l2Dt = np.power(fL, 2.0)/(od*wt)

    # np.where evaluates BOTH branches for every element. The short-feature
    # radicand turns negative for large l2Dt (roughly > 187.5), which made
    # np.sqrt emit "invalid value" RuntimeWarnings even though those elements
    # take the long-feature branch. Clamping at zero silences that without
    # changing any selected value (the radicand is positive for l2Dt <= 50).
    short_radicand = 1.0 + (0.6275*l2Dt) - (0.003375*np.power(l2Dt, 2.0))
    Mt = np.where(l2Dt <= 50.0,
                  np.sqrt(np.maximum(short_radicand, 0.0)),
                  0.032*l2Dt+3.3)

    # Flow stress = grade + 10,000 psi (68,947.6 kPa)
    if units=="SI":
        flowS = s + 68947.6
    else:
        flowS = s + 10000.0

    # Hoop stress at pressure p
    opStress = (p*od)/(2.*wt)

    # Modified B31G failure-stress expression solved for depth
    critical_d = ((opStress-flowS)*wt)/(0.85*((opStress/Mt)-flowS))
    return critical_d

def statistical_poe(df, m_cgr=0.3048, sd_cgr=0.3048*0.25):
    """
    Probability of exceedance (POE) per feature, assuming a normally
    distributed depth grown linearly from the ILI date to today.

    CAUTION: the working columns listed below are written onto ``df`` itself
    (the input dataframe is modified in place).

    Columns read:    ILIRStartDate, depth_fraction, WT_mm, OD_inch, grade_MPa,
                     MAOP_kPa, length_mm
    Columns written: ILI Age, depth_run_mm, failure_depth_mm, mean_depth_mm,
                     sd_depth_mm, leak_poe, rupture_poe

    :param df:     dataframe with one row per feature
    :param m_cgr:  mean growth rate, in mm per year of ILI age
    :param sd_cgr: standard deviation of the growth rate, in mm per year
    :return: numpy array with one POE per row: ``leak_poe`` where the critical
             depth exceeds 80% of the wall thickness, ``rupture_poe`` otherwise
    """
    # ILI Age in years.
    # pd.Timestamp.today() replaces pd.datetime.today(); the pd.datetime alias
    # was deprecated in pandas 1.0 and removed in pandas 2.0.
    df.loc[:,'ILI Age'] = (pd.Timestamp.today() - pd.to_datetime(df.loc[:,'ILIRStartDate'])).dt.days/365.25

    # Measured Depth in mm
    df.loc[:,'depth_run_mm'] = df.loc[:,'depth_fraction']*df.loc[:,'WT_mm']

    # Failure Depth in mm (diameter converted inch -> mm, grade MPa -> kPa)
    df.loc[:, 'failure_depth_mm'] = critical_depth_modified_b31g(df.loc[:,'OD_inch']*25.4, df.loc[:,'WT_mm'], df.loc[:,'grade_MPa']*1000., df.loc[:,'MAOP_kPa'], df.loc[:,'length_mm'])

    # Mean of Depth in mm; the WT_mm*0.00 term is a zero-weighted offset kept from the original
    df.loc[:,'mean_depth_mm'] = (df.loc[:,'WT_mm']*0.00) + df.loc[:,'depth_run_mm'] + (df.loc[:,'ILI Age']*m_cgr)

    # SD of Depth in mm: measurement term (0.078*WT) combined in quadrature with growth term
    # NOTE(review): 0.078 is presumably the tool depth-sizing SD as a fraction of WT -- confirm
    df.loc[:,'sd_depth_mm'] = np.sqrt(  np.power(0.078*df.loc[:,'WT_mm'],2) +  np.power(df.loc[:,'ILI Age'],2) * np.power(sd_cgr, 2))

    # norm.sf is the survival function (1 - cdf) evaluated directly; writing
    # 1.0 - norm.cdf(...) rounds small exceedance probabilities to exactly 0.
    df.loc[:,'leak_poe'] = norm.sf(0.80*df.loc[:,'WT_mm'], loc=df.loc[:,'mean_depth_mm'], scale= df.loc[:,'sd_depth_mm'])
    df.loc[:,'rupture_poe'] = norm.sf(df.loc[:,'failure_depth_mm'], loc=df.loc[:,'mean_depth_mm'], scale= df.loc[:,'sd_depth_mm'])

    # Leak governs when the critical depth is deeper than the 80% WT limit
    return np.where(df.loc[:,'failure_depth_mm']/df.loc[:,'WT_mm']> 0.80, df.loc[:,'leak_poe'], df.loc[:,'rupture_poe'])


In [None]:
get_ili_ranges("LS2000")

In [None]:
qcdf = get_features_for_poe(276)

In [None]:
pd.DataFrame(qcdf.columns, columns=['col']).query("col.str.contains('', case=False)")

In [None]:
# Evaluate the POE once (zero growth: m_cgr = sd_cgr = 0) for the features in
# the chainage window and reuse the frame for both the table and the combined
# POE, instead of running the identical pipeline twice.
# NOTE(review): ILIFSurfaceInd == 'E' presumably selects external features -- confirm.
poe_window = (
    qcdf.assign(poe = statistical_poe(qcdf, m_cgr=0.0, sd_cgr=0.0*0.25),
                pct_smys=lambda x: (x.MAOP_kPa*x.OD_inch*25.4)/(20*x.WT_mm*x.grade_MPa))
    .query("chainage_m.between(258.391,267.592) & ILIFSurfaceInd =='E'")
    .drop_duplicates('FeatureID')
)
display(poe_window)
# Combined POE of the window: 1 - product of per-feature (1 - poe)
1- np.prod(1 - poe_window.poe)


In [None]:
(50.0*4.78/(10*100))

## QC PBC CF MD ENV

In [35]:
%%time
results = pd.read_csv(r"Pipeline_PBC_CF_MD_ENV.csv")

# results = pd.read_excel(r"",skiprows=3, header=0)
results.info(memory_usage='deep')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230855 entries, 0 to 230854
Columns: 223 entries, Pipeline System to _Matrix_Row_Label
dtypes: float64(179), int64(1), object(43)
memory usage: 873.6 MB
Wall time: 4.61 s


In [36]:
display(Markdown("# Percent of columns empty"))
# Percent of null rows per column. Uses the actual row count of `results`
# rather than a hard-coded 230855, and DataFrame.count() rather than a full
# describe(include='all') just to read the non-null counts.
(100.00 - results.count()*100.00/len(results)).sort_values().to_frame("emptySegment Percent")

# Percent of columns empty

Unnamed: 0,emptySegment Percent
Pipeline System,0.0
MD Adjustment Factor,0.0
Manufacturing Defect Score,0.0
MD Hard Spot Score,0.0
Hard Spot Repair Score,0.0
Hard Spot Susceptibility Score,0.0
Coating Condition Score,0.0
Stress Level Score,0.0
Susceptibility to Hard Spots Score,0.0
MD Method 1 Inferential Failure Frequency perkmyr,0.0


In [13]:
display(results.columns.to_frame().rename(columns={0:'cols'}).query("cols.str.contains('')").sort_values(by='cols'))
# results.drop(columns=results.columns.to_frame().rename(columns={0:'cols'}).query("cols.str.contains('.\d$')").cols.values, inplace=True)

Unnamed: 0,cols
Batch 1 Time Pct,Batch 1 Time Pct
Batch Product 1,Batch Product 1
Batch Product 2,Batch Product 2
Batch Product Flag,Batch Product Flag
Batched Product,Batched Product
Begin Measure (m),Begin Measure (m)
CNW2 Count,CNW2 Count
DWA2 Count,DWA2 Count
Date of last ILI MFL,Date of last ILI MFL
EC ILI Failure Frequency perkmyr,EC ILI Failure Frequency perkmyr


In [171]:
# Select the column before summing: summing every column first is slow and,
# on pandas >= 2.0, also tries to "sum" the object/date columns.
results.groupby('Pipe Seam Type')['Length (m)'].sum()

Pipe Seam Type
DSAW        944831.247
ERW          17536.129
FW           70882.925
SMAW           203.182
SSAW        416953.368
Seamless     14437.309
Unknown       1705.574
Name: Length (m), dtype: float64

In [148]:
display(Markdown("pipeline  segments with no MD crack ILI data"))
# eLength = segment length where the MD crack ILI date is missing, else 0.
# Only the two length columns are summed so the date column never reaches sum().
results[['Line Name','Length (m)','Date of last ILI MD Crack']]\
.assign(eLength=lambda x: np.where(x['Date of last ILI MD Crack'].isnull(),x['Length (m)'],0.00))\
.groupby("Line Name")[['Length (m)','eLength']].sum()

# results.groupby('Line Name').sum()['Length (m)']['TMPL PETROCAN PRODUCTS 20IN KM']
# results[lambda x: x['Line Name'] == "TMPL PETROCAN PRODUCTS 20IN KM"][['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]

pipeline  segments with no MD crack ILI data

Unnamed: 0_level_0,Length (m),eLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,0.0
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,0.0
PUGET LAUREL-FERNDALE 16IN KM,18680.859,0.0
PUGET SUMAS-BORDER 24IN KM,8574.964,0.0
TMPL ANCHOR LOOP 36IN KM,150253.695,0.0
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,0.0
TMPL DARFIELD-BLACK PINES 24IN,41773.147,0.0
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,0.0
TMPL EDMONTON-EDSON 24IN KM,228770.922,0.0
TMPL EDSON-HINTON 30IN KM,88872.79,0.0


In [146]:
display(Markdown("ILI Check - segments with MD crack ILI date older than pipe install date"))
# Rows whose ILI date is on or before the install date (the comparison is <=).
# Aggregation is restricted to the measure columns: the joined datetime columns
# cannot be summed and raise on pandas >= 2.0 instead of being dropped silently.
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['Date of last ILI MD Crack','Pipe Install Date']].apply(pd.to_datetime).query("`Date of last ILI MD Crack`<=`Pipe Install Date`"), how='inner')\
.groupby("Line Name")[['Begin Measure (m)','End Measure (m)','Length (m)']].agg(['sum','count'])

ILI Check - segments with MD crack ILI date older than pipe install date

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m)
Unnamed: 0_level_1,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
TMPL EDMONTON-EDSON 24IN KM,3310739.934,21,3310849.787,21,109.853,21
TMPL PETROCAN PRODUCTS 20IN KM,27754.194,42,28545.09,42,790.896,42


In [144]:
display(Markdown("One pipeline containing segments with no PBC crack ILI data"))
# eLength = segment length where the PBC crack ILI date is missing, else 0.
# Only the two length columns are summed so the date column never reaches sum().
results[['Line Name','Length (m)','Date of last ILI PBC Crack']]\
.assign(eLength=lambda x: np.where(x['Date of last ILI PBC Crack'].isnull(),x['Length (m)'],0.00))\
.groupby("Line Name")[['Length (m)','eLength']].sum().query('eLength > 0')

# results.groupby('Line Name').sum()['Length (m)']['TMPL PETROCAN PRODUCTS 20IN KM']
# results[lambda x: x['Line Name'] == "TMPL PETROCAN PRODUCTS 20IN KM"][['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]

One pipeline containing segments with no PBC crack ILI data

Unnamed: 0_level_0,Length (m),eLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4060.701
TMPL PETROCAN PRODUCTS 20IN KM,4821.841,4821.841
TMPL SUMAS-SUMASTANKFARM 20IN KM,3719.009,3719.009
TMPL SUMAS-SUMASTANKFARM 24IN KM,3883.994,3883.994


In [145]:
display(Markdown("ILI Check - segments with PBC crack ILI date older than pipe install date"))
# Rows whose ILI date is on or before the install date (the comparison is <=).
# Aggregation is restricted to the measure columns: the joined datetime columns
# cannot be summed and raise on pandas >= 2.0 instead of being dropped silently.
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['Date of last ILI PBC Crack','Pipe Install Date']].apply(pd.to_datetime).query("`Date of last ILI PBC Crack`<=`Pipe Install Date`"), how='inner')\
.groupby("Line Name")[['Begin Measure (m)','End Measure (m)','Length (m)']].agg(['sum','count'])

ILI Check - segments with PBC crack ILI date older than pipe install date

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m)
Unnamed: 0_level_1,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
PUGET BORDER-BURLINGTON 20IN KM,337315.567,10,337375.884,10,60.317,10
TMPL EDMONTON-EDSON 24IN KM,3310739.934,21,3310849.787,21,109.853,21


In [103]:
return_aggregate_empty(results, 'Equivalent Pressure Cycles (-)', 'pipelines with missing data')[1]

Equivalent Pressure Cycles (-): - pipelines with missing data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4060.701
TMPL PETROCAN PRODUCTS 20IN KM,4821.841,4821.841
TMPL SUMAS-SUMASTANKFARM 20IN KM,3719.009,3719.009
TMPL SUMAS-SUMASTANKFARM 24IN KM,3883.994,3883.994


In [104]:
return_aggregate_empty(results, 'Mainline Coating Type','pipelines with missing data')[1]

Mainline Coating Type: - pipelines with missing data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,0.047
TMPL EDSON-HINTON 30IN KM,88872.79,0.011
TMPL HARGREAVES-DARFIELD 24IN KM,274037.246,0.033
TMPL HINTON-HARGREAVES 24IN KM,150288.414,149907.296
TMPL PETROCAN PRODUCTS 20IN KM,4821.841,0.013
TMPL SUMAS-BURNABY 24IN KM,65271.352,0.073
TMPL SUMAS-SUMASTANKFARM 20IN KM,3719.009,0.038
TMPL SUMAS-SUMASTANKFARM 24IN KM,3883.994,0.105


In [105]:
return_aggregate_empty(results,'CT Xray Inspection (-)','pipelines with missing data')[1]

CT Xray Inspection (-): - pipelines with missing data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL HINTON-HARGREAVES 24IN KM,150288.414,150288.414


In [106]:
return_aggregate_empty(results,'ENV Outflow Volume Rupture Range', 'pipelines with no data')[1]

ENV Outflow Volume Rupture Range: - pipelines with no data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL HINTON-HARGREAVES 24IN KM,150288.414,150288.414
TMPL KAMLOOPS-SUMAS 24IN KM,259026.247,687.427


In [119]:
return_aggregate_empty(results,'Date of last Circumferential ILI','10 pipelines with no circumferential crack ILI data')[1]

Date of last Circumferential ILI: - 10 pipelines with no circumferential crack ILI data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68979.885
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14567.853
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859
TMPL ANCHOR LOOP 36IN KM,150253.695,150253.695
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4060.701
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL HINTON-HARGREAVES 24IN KM,150288.414,150288.414
TMPL PETROCAN PRODUCTS 20IN KM,4821.841,4821.841
TMPL SUMAS-SUMASTANKFARM 20IN KM,3719.009,3719.009
TMPL SUMAS-SUMASTANKFARM 24IN KM,3883.994,3883.994


In [125]:
display(Markdown("ILI Check - segments with CF crack ILI date older than pipe install date"))
# Rows whose ILI date is on or before the install date (the comparison is <=).
# Aggregation is restricted to the measure columns: the joined datetime columns
# cannot be summed and raise on pandas >= 2.0 instead of being dropped silently.
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['Date of last Circumferential ILI','Pipe Install Date']].apply(pd.to_datetime).query("`Date of last Circumferential ILI`<=`Pipe Install Date`"), how='inner')\
.groupby("Line Name")[['Begin Measure (m)','End Measure (m)','Length (m)']].agg(['sum','count'])

ILI Check - segments with CF crack ILI date older than pipe install date

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m)
Unnamed: 0_level_1,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
TMPL EDMONTON-EDSON 24IN KM,3310739.934,21,3310849.787,21,109.853,21


In [123]:
return_aggregate_empty(results,'Soil Type','pipelines with missing data')[1]

Soil Type: - pipelines with missing data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL ANCHOR LOOP 36IN KM,150253.695,62389.146
TMPL DARFIELD-BLACK PINES 24IN,41773.147,11.096
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,1911.664
TMPL HARGREAVES-DARFIELD 24IN KM,274037.246,103093.386
TMPL HINTON-HARGREAVES 24IN KM,150288.414,62512.222
TMPL KAMLOOPS-SUMAS 24IN KM,259026.247,49144.213


In [140]:
return_aggregate_empty(results,'ENV Stream Trace Length Rupture Range','incomplete coverage of stream trace')[1]

ENV Stream Trace Length Rupture Range: - incomplete coverage of stream trace

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,46204.008
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14182.499
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859
PUGET SUMAS-BORDER 24IN KM,8574.964,7276.785
TMPL ANCHOR LOOP 36IN KM,150253.695,52311.354
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,1861.089
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,11397.619
TMPL EDMONTON-EDSON 24IN KM,228770.922,137832.627
TMPL EDSON-HINTON 30IN KM,88872.79,28196.169


In [143]:
return_aggregate_empty(results,'Soil Slope Angle (-)','incomplete coverage of data')[1]

Soil Slope Angle (-): - incomplete coverage of data

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68709.732
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14567.853
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859
PUGET SUMAS-BORDER 24IN KM,8574.964,0.792
TMPL ANCHOR LOOP 36IN KM,150253.695,150253.695
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,26.036
TMPL DARFIELD-BLACK PINES 24IN,41773.147,1.587
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,37.485
TMPL EDMONTON-EDSON 24IN KM,228770.922,228770.922
TMPL EDSON-HINTON 30IN KM,88872.79,88872.79


In [151]:
return_aggregate_empty(results,'ENV Aquifer Area (Y/N)','incomplete data coverage')[1]

ENV Aquifer Area (Y/N): - incomplete data coverage

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68728.443
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14567.853
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859
PUGET SUMAS-BORDER 24IN KM,8574.964,8574.964
TMPL ANCHOR LOOP 36IN KM,150253.695,150253.695
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,1473.193
TMPL DARFIELD-BLACK PINES 24IN,41773.147,3122.575
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,23960.257
TMPL EDMONTON-EDSON 24IN KM,228770.922,173074.704
TMPL EDSON-HINTON 30IN KM,88872.79,88872.79


In [154]:
return_aggregate_empty(results,'ENV Open Water Area (Y/N)','incomplete data coverage')[1]

ENV Open Water Area (Y/N): - incomplete data coverage

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,59460.683
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,9121.553
PUGET LAUREL-FERNDALE 16IN KM,18680.859,17520.927
PUGET SUMAS-BORDER 24IN KM,8574.964,8574.964
TMPL ANCHOR LOOP 36IN KM,150253.695,86695.814
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,1333.811
TMPL DARFIELD-BLACK PINES 24IN,41773.147,29292.735
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,42259.332
TMPL EDMONTON-EDSON 24IN KM,228770.922,218764.128
TMPL EDSON-HINTON 30IN KM,88872.79,47896.189


In [161]:
# Date of last Hydrotest
# Last Hydrotest Pressure kPa

return_aggregate_empty(results,'Date of last Hydrotest', 'one pipeline with incomplete coverage of hydrotest range')[1].query("`Length (m)`-emptyLength  == 0 ")

Date of last Hydrotest: - one pipeline with incomplete coverage of hydrotest range

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68979.885
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14567.853
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859
PUGET SUMAS-BORDER 24IN KM,8574.964,8574.964
TMPL ANCHOR LOOP 36IN KM,150253.695,150253.695
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4060.701
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,80966.815
TMPL EDMONTON-EDSON 24IN KM,228770.922,228770.922
TMPL EDSON-HINTON 30IN KM,88872.79,88872.79


In [49]:
display(Markdown("CIS readings - following pipelines with no CIS readings"))
# NOTE(review): the join below uses the default how='left', so every segment of
# every pipeline is kept. Under the CIS / `tot` columns, `count` is the number of
# segments whose combined reading count is > 0, while `count` under the measure
# columns is the total number of segments. The table therefore lists all
# pipelines rather than only those without CIS readings -- confirm whether the
# heading or the filter reflects the intent.
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['EC CIS OnOff Total Reading Count','EC CIS OnOnly Total Reading Count']].assign(tot= lambda x: x.iloc[:,0]+x.iloc[:,1]).query("tot > 0"))\
.groupby("Line Name").agg(['sum','count'])

CIS readings - following pipelines with no CIS readings

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m),EC CIS OnOff Total Reading Count,EC CIS OnOff Total Reading Count,EC CIS OnOnly Total Reading Count,EC CIS OnOnly Total Reading Count,tot,tot
Unnamed: 0_level_1,sum,count,sum,count,sum,count,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
PUGET BORDER-BURLINGTON 20IN KM,175429200.0,4071,175498200.0,4071,68979.885,4071,1427331.0,4071,0.0,4071,1427331.0,4071
PUGET BURLINGTON-ANACORTES 16IN KM,5741709.0,726,5756277.0,726,14567.853,726,282953.0,726,0.0,726,282953.0,726
PUGET LAUREL-FERNDALE 16IN KM,10748780.0,1136,10767460.0,1136,18680.859,1136,469681.0,1136,0.0,1136,469681.0,1136
PUGET SUMAS-BORDER 24IN KM,2321804.0,603,2330379.0,603,8574.964,603,129957.0,588,0.0,588,129957.0,588
TMPL ANCHOR LOOP 36IN KM,7907791000.0,20072,7907941000.0,20072,150253.695,20072,3960141.0,19808,0.0,19808,3960141.0,19808
TMPL BURNABY-WESTRIDGE 24IN KM,1979642.0,1049,1983703.0,1049,4060.701,1049,175698.0,1049,0.0,1049,175698.0,1049
TMPL DARFIELD-BLACK PINES 24IN,3814113000.0,4996,3814155000.0,4996,41773.147,4996,532338.0,4996,0.0,4996,532338.0,4996
TMPL DARFIELD-KAMLOOPS 30IN KM,12018090000.0,15343,12018180000.0,15343,80966.815,15343,3408963.0,15188,0.0,15188,3408963.0,15188
TMPL EDMONTON-EDSON 24IN KM,3653547000.0,32421,3653776000.0,32421,228770.922,32421,5997965.0,32161,0.0,32161,5997965.0,32161
TMPL EDSON-HINTON 30IN KM,3483008000.0,12655,3483097000.0,12655,88872.79,12655,2296625.0,12655,0.0,12655,2296625.0,12655


## QC of EF SO

In [172]:
%%time
results = pd.read_csv(r"Pipeline_EF_SO.csv")

# results = pd.read_excel(r"",skiprows=3, header=0)
results.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230855 entries, 0 to 230854
Data columns (total 32 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Pipeline System                               230855 non-null  object 
 1   Line Name                                     230855 non-null  object 
 2   LineID                                        230855 non-null  int64  
 3   GridStartMeasure                              230855 non-null  float64
 4   GridEndMeasure                                230855 non-null  float64
 5   Begin Measure (m)                             230855 non-null  float64
 6   End Measure (m)                               230855 non-null  float64
 7   Length (m)                                    230855 non-null  float64
 8   Equipment Failure Frequency perkmyr           230855 non-null  float64
 9   EF Adjustment Factor                          23

In [173]:
display(Markdown("# Percent of columns empty"))
# Percent of null rows per column. Uses the actual row count of `results`
# rather than a hard-coded 230855, and DataFrame.count() rather than a full
# describe(include='all') just to read the non-null counts.
(100.00 - results.count()*100.00/len(results)).sort_values().to_frame("emptySegment Percent")

# Percent of columns empty

Unnamed: 0,emptySegment Percent
Pipeline System,0.0
_Matrix_Legend_Label,0.0
Matrix_Total_Consequence,0.0
Matrix_Total_Probability,0.0
Impact on Population Rupture (50%),0.0
Impact on Environment Rupture (50%),0.0
Consequence,0.0
Unplanned Shut Down Frequency Score,0.0
Flow Condition Score,0.0
System Operations Score,0.0


In [182]:
display(Markdown("MLV and Flange count"))
results[['Line Name','MLV Count Outside Facility per dynseg','Flange Count Outside Facility per dynseg']]\
.assign(total=lambda x: x.iloc[:,1]+x.iloc[:,2])\
.query('total > 0').groupby('Line Name').sum()

MLV and Flange count

Unnamed: 0_level_0,MLV Count Outside Facility per dynseg,Flange Count Outside Facility per dynseg,total
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PUGET BORDER-BURLINGTON 20IN KM,8.0,0.0,8.0
PUGET BURLINGTON-ANACORTES 16IN KM,4.0,2.0,6.0
PUGET LAUREL-FERNDALE 16IN KM,5.0,7.0,12.0
PUGET SUMAS-BORDER 24IN KM,0.0,1.0,1.0
TMPL ANCHOR LOOP 36IN KM,16.0,0.0,16.0
TMPL BURNABY-WESTRIDGE 24IN KM,0.0,1.0,1.0
TMPL DARFIELD-BLACK PINES 24IN,1.0,0.0,1.0
TMPL DARFIELD-KAMLOOPS 30IN KM,9.0,0.0,9.0
TMPL EDMONTON-EDSON 24IN KM,11.0,6.0,17.0
TMPL EDSON-HINTON 30IN KM,5.0,6.0,11.0


## QC of NH

In [183]:
%%time
results = pd.read_csv(r"Pipeline_NH.csv")

# results = pd.read_excel(r"",skiprows=3, header=0)
results.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230855 entries, 0 to 230854
Data columns (total 29 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   Pipeline System                                   230855 non-null  object 
 1   Line Name                                         230855 non-null  object 
 2   LineID                                            230855 non-null  int64  
 3   GridStartMeasure                                  230855 non-null  float64
 4   GridEndMeasure                                    230855 non-null  float64
 5   Begin Measure (m)                                 230855 non-null  float64
 6   End Measure (m)                                   230855 non-null  float64
 7   Length (m)                                        230855 non-null  float64
 8   Natural Hazards Threat Failure Frequency perkmyr  230855 non-null  float64
 9   ILI 



In [184]:
display(Markdown("# Percent of columns empty"))
# Percent of null rows per column. Uses the actual row count of `results`
# rather than a hard-coded 230855, and DataFrame.count() rather than a full
# describe(include='all') just to read the non-null counts.
(100.00 - results.count()*100.00/len(results)).sort_values().to_frame("emptySegment Percent")

# Percent of columns empty

Unnamed: 0,emptySegment Percent
Pipeline System,0.0
_Matrix_Legend_Label,0.0
Matrix_Total_Consequence,0.0
Matrix_Total_Probability,0.0
Impact on Population Rupture (50%),0.0
Impact on Environment Rupture (50%),0.0
Consequence,0.0
Overall Individual Geohazard Probability peryr,0.0
_Matrix_Column_Label,0.0
NH Threat Failure Frequency perkmyr,0.0


In [208]:
return_aggregate_empty(results,'BGC GeotechnicalSite Id','distribution of data')[1].assign(populatedLength = lambda x: x['Length (m)'] - x['emptyLength'])

BGC GeotechnicalSite Id: - distribution of data

Unnamed: 0_level_0,Length (m),emptyLength,populatedLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68544.885,435.0
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14517.853,50.0
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18680.859,0.0
PUGET SUMAS-BORDER 24IN KM,8574.964,8574.964,0.0
TMPL ANCHOR LOOP 36IN KM,150253.695,147014.694,3239.001
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,3930.701,130.0
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147,0.0
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,79346.816,1619.999
TMPL EDMONTON-EDSON 24IN KM,228770.922,227690.922,1080.0
TMPL EDSON-HINTON 30IN KM,88872.79,88330.79,542.0


In [209]:
return_aggregate_empty(results,'BGC HydrotechnicalSite Id','distribution of data')[1].assign(populatedLength = lambda x: x['Length (m)'] - x['emptyLength'])

BGC HydrotechnicalSite Id: - distribution of data

Unnamed: 0_level_0,Length (m),emptyLength,populatedLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PUGET BORDER-BURLINGTON 20IN KM,68979.885,68759.385,220.5
PUGET BURLINGTON-ANACORTES 16IN KM,14567.853,14399.854,167.999
PUGET LAUREL-FERNDALE 16IN KM,18680.859,18575.859,105.0
PUGET SUMAS-BORDER 24IN KM,8574.964,8537.964,37.0
TMPL ANCHOR LOOP 36IN KM,150253.695,149245.196,1008.499
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4059.201,1.5
TMPL DARFIELD-BLACK PINES 24IN,41773.147,41773.147,0.0
TMPL DARFIELD-KAMLOOPS 30IN KM,80966.815,79668.813,1298.002
TMPL EDMONTON-EDSON 24IN KM,228770.922,228107.525,663.397
TMPL EDSON-HINTON 30IN KM,88872.79,88769.289,103.501


## QC of MDI MDR

In [197]:
# NOTE(review): this cell reads `df`, not the `results` frame used by the
# surrounding cells, and its recorded output is a KeyError for
# 'Location Factor C Score'. It looks like a leftover that depends on kernel
# state from elsewhere -- confirm the intended frame/column or drop the cell.
df[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)','Land Use','Location Factor C Score']]\
.groupby('Land Use').agg({'Location Factor C Score':np.mean})

KeyError: "['Location Factor C Score'] not in index"

In [13]:
%%time
results = pd.read_csv(r"Pipeline_MDI_MDR.csv")

# results = pd.read_excel(r"",skiprows=3, header=0)
results.info(memory_usage='deep')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230855 entries, 0 to 230854
Data columns (total 92 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   Pipeline System                                        230855 non-null  object 
 1   Line Name                                              230855 non-null  object 
 2   LineID                                                 230855 non-null  int64  
 3   GridStartMeasure                                       230855 non-null  float64
 4   GridEndMeasure                                         230855 non-null  float64
 5   Begin Measure (m)                                      230855 non-null  float64
 6   End Measure (m)                                        230855 non-null  float64
 7   Length (m)                                             230855 non-null  float64
 8   Mechanical Damage Failure Frequenc

In [14]:
display(Markdown("# Percent of columns empty"))
# Percent of null rows per column. Uses the actual row count of `results`
# rather than a hard-coded 230855, and DataFrame.count() rather than a full
# describe(include='all') just to read the non-null counts.
(100.00 - results.count()*100.00/len(results)).sort_values().to_frame("emptySegment Percent")

# Percent of columns empty

Unnamed: 0,emptySegment Percent
Pipeline System,0.0
Operator Response Time,0.0
B7 Patrol Frequency,0.0
Patrol Frequency per year (-),0.0
B5 Third Party Notification,0.0
Third Party Notification,0.0
B4 Buried Markers,0.0
Markers,0.0
B9 Operator Response Time,0.0
B3 Signage,0.0


In [16]:
return_aggregate_empty(results,"Date of last ILI Dent Detection")[1]

Date of last ILI Dent Detection: - None

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [17]:
display(Markdown("ILI Check - segments with dent ILI date older than pipe install date"))
# Rows whose ILI date is on or before the install date (the comparison is <=).
# Aggregation is restricted to the measure columns: the joined datetime columns
# cannot be summed and raise on pandas >= 2.0 instead of being dropped silently.
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['Date of last ILI Dent Detection','Pipe Install Date']].apply(pd.to_datetime).query("`Date of last ILI Dent Detection`<=`Pipe Install Date`"), how='inner')\
.groupby("Line Name")[['Begin Measure (m)','End Measure (m)','Length (m)']].agg(['sum','count'])

ILI Check - segments with dent ILI date older than pipe install date

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m)
Unnamed: 0_level_1,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
TMPL EDMONTON-EDSON 24IN KM,3310739.934,21,3310849.787,21,109.853,21
TMPL PETROCAN PRODUCTS 20IN KM,27754.194,42,28545.09,42,790.896,42


In [19]:
display(Markdown("Distinct values of pipe toughness"))
results['Pipe Toughness (-)'].unique()

Distinct values of pipe toughness

array([21.7])

In [21]:
return_aggregate_empty(results,"Pipe Toughness (-)","fully populated")[1]

Pipe Toughness (-): - fully populated

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [22]:
return_aggregate_empty(results,"Equivalent Pressure Cycles (-)")[1]

Equivalent Pressure Cycles (-): - None

Unnamed: 0_level_0,Length (m),emptyLength
Line Name,Unnamed: 1_level_1,Unnamed: 2_level_1
TMPL BURNABY-WESTRIDGE 24IN KM,4060.701,4060.701
TMPL PETROCAN PRODUCTS 20IN KM,4821.841,4821.841
TMPL SUMAS-SUMASTANKFARM 20IN KM,3719.009,3719.009
TMPL SUMAS-SUMASTANKFARM 24IN KM,3883.994,3883.994


In [34]:
display(Markdown("3PD-R Maximum Pressure at Stress Cycle (kPa) (-) Check - negative pressures"))
# Segments with a negative maximum pressure, summarised per pipeline.
# (The original cell started with a bare filter expression whose result was
# discarded; it has been removed.)
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']]\
.join(results[['3PD-R Maximum Pressure at Stress Cycle (kPa) (-)']].query("`3PD-R Maximum Pressure at Stress Cycle (kPa) (-)` < 0"), how='inner')\
.groupby("Line Name").agg(['sum','count'])

3PD-R Maximum Pressure at Stress Cycle (kPa) (-) Check - negative pressures

Unnamed: 0_level_0,Begin Measure (m),Begin Measure (m),End Measure (m),End Measure (m),Length (m),Length (m),3PD-R Maximum Pressure at Stress Cycle (kPa) (-),3PD-R Maximum Pressure at Stress Cycle (kPa) (-)
Unnamed: 0_level_1,sum,count,sum,count,sum,count,sum,count
Line Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
TMPL KAMLOOPS-SUMAS 24IN KM,24129420.0,25,24129460.0,25,41.407,25,-7475.0,25


In [70]:
display(Markdown("Following pipelines have dynamic segments that start and end at the exact location. Could we check for overlaps?"))
# Zero-length dynamic segments: begin and end measure are identical.
# (Dropped the dangling line continuation that ran into commented-out code.)
results[['Line Name','Begin Measure (m)','End Measure (m)','Length (m)']].query("`Begin Measure (m)`==`End Measure (m)`")

Following pipelines have dynamic segments that start and end at the exact location. Could we check for overlaps?

Unnamed: 0,Line Name,Begin Measure (m),End Measure (m),Length (m)
31,PUGET BORDER-BURLINGTON 20IN KM,8762.699,8762.699,0.0
38,PUGET BORDER-BURLINGTON 20IN KM,8872.626,8872.626,0.0
44,PUGET BORDER-BURLINGTON 20IN KM,8942.121,8942.121,0.0
736,PUGET BORDER-BURLINGTON 20IN KM,23696.529,23696.529,0.0
1073,PUGET BORDER-BURLINGTON 20IN KM,31573.292,31573.292,0.0
...,...,...,...,...
200963,TMPL KAMLOOPS-SUMAS 24IN KM,996272.177,996272.177,0.0
201008,TMPL KAMLOOPS-SUMAS 24IN KM,996366.838,996366.838,0.0
201117,TMPL KAMLOOPS-SUMAS 24IN KM,996992.409,996992.409,0.0
201507,TMPL KAMLOOPS-SUMAS 24IN KM,999643.067,999643.067,0.0


## QC of Drain Factor

In [None]:
def drain_factor(emin, emed, emax, eloc):
    """
    Piecewise-linear drain factor evaluated element-wise.

    Where ``eloc`` lies in [emin, emed] (inclusive) the factor falls linearly
    from 1 at ``emin`` to 0.5 at ``emed``; everywhere else the second segment
    0.5*(emax - eloc)/(emax - emed) is used, which reaches 0 at ``emax``.
    Values outside [emin, emax] are not clamped.

    :param emin: lower bound (pandas Series)
    :param emed: mid value (pandas Series)
    :param emax: upper bound (pandas Series)
    :param eloc: value at the location of interest; must provide ``.between``
    :return: numpy array of drain factors
    """
    on_first_segment = eloc.between(emin, emed)
    first_segment = 1 - 0.5*((eloc-emin)/(emed-emin))
    second_segment = 0.5*((emax-eloc)/(emax-emed))
    return np.where(on_first_segment, first_segment, second_segment)

In [None]:
drain_factor(pd.Series(675.84), pd.Series(698.28), pd.Series(718.23), pd.Series(702.33))

## QC of Release Rate Calculations

In [None]:
def damage_area_sqft(product, release_rate_kgps):
    """Compute a damage area from product type and release rate.

    The rate is multiplied by 2.20462 (kg -> lb, going by the variable names)
    and each product is mapped to two coefficient/exponent pairs:

    * rate > 25.20159 (input units): ``i_c * (180 * rate_lb) ** i_p``
    * otherwise:                     ``c_c * rate_lb ** c_p``

    Products other than 'Natural Gas', 'Ethane Plus', 'Condensate', 'Diluent'
    and 'CLPL Diluent' use the fallback value of each coefficient.

    NOTE(review): the ``i_*`` / ``c_*`` sets presumably correspond to
    instantaneous vs. continuous release - confirm against the model
    documentation.

    Parameters
    ----------
    product : pandas.Series
        Product names (``.isin`` is called on it).
    release_rate_kgps : pandas.Series or numpy.ndarray
        Release rate per row.

    Returns
    -------
    numpy.ndarray
        Damage area per row.
    """
    release_rate_lbps = release_rate_kgps*2.20462

    # Product classes in priority order; the first match wins.
    product_classes = [
        product == 'Natural Gas',
        product == 'Ethane Plus',
        product == 'Condensate',
        product.isin(['Diluent','CLPL Diluent']),
    ]

    def _per_product(class_values, fallback):
        # One value per product class, ``fallback`` for everything else.
        return np.select(product_classes, class_values, default=fallback)

    i_c = _per_product([41.0, 28., 4.35, 3.3], 0.03)
    i_p = _per_product([0.67, 0.72, 0.78, 0.76], 0.99)
    c_c = _per_product([43.0, 49.48, 182.0, 130.], 11.0)
    c_p = _per_product([0.98, 1.0, 0.89, 0.90], 0.91)

    above_threshold = (release_rate_kgps)>25.20159
    area_i = i_c*np.power(180.*release_rate_lbps,i_p)
    area_c = c_c*np.power(release_rate_lbps,c_p)

    return np.where(above_threshold, area_i, area_c)

def release_rate_kgps(state, cp, density, mw, maop, t, hole_a):
    """Compute a release rate through a hole for liquid or non-liquid rows.

    * ``state == 'Liquid'``: ``0.62 * hole_a * sqrt(2 * density * maop * 1000)``.
    * otherwise a gas-type orifice equation with coefficient 0.9 is used, with
      ``k = cp / (cp - 8.314)`` and a transition pressure
      ``tp = 101.325 * ((k + 1) / 2) ** (k / (k - 1))``; one expression applies
      when ``maop > tp`` and another when it is not.

    NOTE(review): the constants suggest ``maop`` in kPa (multiplied by 1000
    and compared with 101.325), ``t`` in degrees Celsius (273.15 is added) and
    ``mw`` in kg/kmol (8314.4598 in the denominator) - confirm units with the
    data dictionary.

    Parameters
    ----------
    state : pandas.Series or str
        Phase label; only the exact value 'Liquid' selects the liquid formula.
    cp, density, mw, maop, t, hole_a : pandas.Series, numpy.ndarray or scalar
        Heat capacity, density, molecular weight, pressure, temperature and
        hole area, combined element-wise.

    Returns
    -------
    numpy.ndarray
        Release rate per element.
    """
    k = cp / (cp - 8.314)
    t_K = t+273.15

    # Pressure above which the first gas expression is used.
    tp = 101.325*np.power( (k+1)/2 ,k/(k-1))
    above_transition = maop > tp

    liquid_rate = 0.62*hole_a*np.sqrt(2*density*maop*1000.)
    gas_rate_above = 0.9*hole_a*maop*1000.*np.sqrt( ((k*mw)/(8314.4598*t_K))*np.power(2/(k+1), (k+1)/(k-1)))
    gas_rate_below = 0.9*hole_a*maop*1000.*np.sqrt( (mw/(8314.4598*t_K))*(2*k/(k-1))*np.power(101.325/maop, 2/k) * (1 - np.power(101.325/maop,(k-1)/k)))

    gas_rate = np.where(above_transition, gas_rate_above, gas_rate_below)
    return np.where(state == 'Liquid', liquid_rate, gas_rate)

In [None]:
# damage_area_sqft(pd.Series(['Ethane Plus']), pd.Series([9.44]))

# results.loc[:,['Product_Type','Release_Rate_kgpersec','Damage_Area_sqft']].assign(qc = lambda x: damage_area_sqft(x.Product_Type, x.Release_Rate_kgpersec),
#                                                                                  pct_error = lambda x: abs(x.qc-x.Damage_Area_sqft)*100/x.Damage_Area_sqft).pct_error.describe()

# Recompute release rate and damage area from the stored inputs and summarise
# the percentage error against the stored results. Missing operating
# temperatures are treated as 0.0 before the recomputation.
(
    results
    .loc[:, ['Product_Type',
             'Mixture_State',
             'Mixture_Heat_Capacity',
             'Mixture_Density_kgperm3',
             'Mixture_Molecular_Weight_kgperkmol',
             'Maximum_Allowable_Operating_Pressure_Kilopascal',
             'Operating_Temperature_Celsius',
             'Hole_Area_sqm',
             'Release_Rate_kgpersec',
             'Damage_Area_sqft']]
    .fillna({'Operating_Temperature_Celsius':0.0})
    .assign(
        qc_rel=lambda d: release_rate_kgps(
            state=d.Mixture_State,
            cp=d.Mixture_Heat_Capacity,
            density=d.Mixture_Density_kgperm3,
            mw=d.Mixture_Molecular_Weight_kgperkmol,
            maop=d.Maximum_Allowable_Operating_Pressure_Kilopascal,
            t=d.Operating_Temperature_Celsius,
            hole_a=d.Hole_Area_sqm,
        ),
        pct_error_rel=lambda d: abs(d.qc_rel-d.Release_Rate_kgpersec)*100/d.Release_Rate_kgpersec,
        # The damage area is recomputed from the QC release rate, not the stored one.
        qc_dam=lambda d: damage_area_sqft(d.Product_Type, d.qc_rel),
        pct_error_dam=lambda d: abs(d.qc_dam-d.Damage_Area_sqft)*100/d.Damage_Area_sqft,
    )
    [['pct_error_rel','pct_error_dam']]
    .describe()
)

In [None]:
# For watercourse rows, recompute the repair cost and list the rows whose
# percentage difference from the stored cost exceeds 1.0.
(
    results
    .loc[lambda d: d.Land_Use == 'WATER COURSE',
         ['Bank_Full_Width_of_Watercourse_m','Outside_Diameter_Millimeter','Repair_Costs_CDN']]
    .assign(
        repair=lambda d: water_repair(d.Bank_Full_Width_of_Watercourse_m, d.Outside_Diameter_Millimeter),
        qc=lambda d: abs(d.repair-d.Repair_Costs_CDN)*100./d.Repair_Costs_CDN,
    )
    .query("qc > 1.0")
)

# Risk Analysis

In [33]:
# Current-run results sheet. `header=3` takes the fourth sheet row as the column
# labels and discards the rows above it. The former `skip_rows=3` argument was
# dropped: it is not a `read_excel` keyword (the real one is `skiprows`), and
# pandas versions whose `read_excel` has no `**kwds` passthrough reject it with
# a TypeError.
df1 = pd.read_excel("TMC_20201023_RES_CONSQ.xlsx", sheet_name='ResultsSheet1', header=3)

In [34]:
# Previous-run results sheet. `header=3` takes the fourth sheet row as the column
# labels and discards the rows above it. The former `skip_rows=3` argument was
# dropped: it is not a `read_excel` keyword (the real one is `skiprows`), and
# pandas versions whose `read_excel` has no `**kwds` passthrough reject it with
# a TypeError.
df2 = pd.read_excel("TPDxCon_Res_02072020.xlsx", sheet_name='ResultsSheet1', header=3)

In [35]:
def build_matrix(df, threat, consequence, agg='sum', values='Length (m)'):
    """Categorise rows by likelihood and consequence and pivot them into a matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain the ``threat``, ``consequence`` and ``values``
        columns.
    threat : str
        Column binned into 'Likelihood Category' using the left-closed bins
        [0, 1e-3), [1e-3, 1e-2), [1e-2, 1e-1), [1e-1, 1), [1, inf).
    consequence : str
        Column binned into 'Consequence Category' using the left-closed bins
        [-10, 0), [0, 3), [3, 10), [10, 30), [30, inf).
    agg : str or callable, default 'sum'
        Aggregation handed to ``DataFrame.pivot_table``.
    values : str, default 'Length (m)'
        Column aggregated in the pivot table.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df`` with 'Likelihood Category', 'Consequence Category' and
        'Risk Category' columns appended, and the likelihood x consequence pivot
        table including 'Total' margins. The pivot's row index is an ordered
        CategoricalIndex over the five likelihood labels only, so the margin
        row's label becomes NaN; the calling cells rename it back to 'Total'
        after sorting.

    Notes
    -----
    Any (likelihood, consequence) pair that is not listed under Low, Medium or
    High is labelled 'Extreme'. That includes rows whose value falls outside the
    bins (NaN category).
    NOTE(review): confirm that unbinned rows should be reported as 'Extreme'.
    """
    likelihood_labels = ['Unlikely I',
                         'Unlikely II',
                         'Rare',
                         'Occasional',
                         'Expected']
    consequence_labels = ['Minor',
                          'Moderate',
                          'Major',
                          'Critical',
                          'Catastrophic']

    temp = pd.concat([df,
                      pd.cut(df[threat], [0, 1e-3, 1e-2, 1e-1, 1.0, np.inf],
                             labels=likelihood_labels, right=False).rename('Likelihood Category'),
                      pd.cut(df[consequence], [-10, 0., 3., 10., 30., np.inf],
                             labels=consequence_labels, right=False).rename('Consequence Category')],
                     axis=1)

    risk_ranks = {'ORM':{'Low':[('Unlikely I','Minor'),
                                ('Unlikely I','Moderate'),
                                ('Unlikely I','Major'),
                                ('Unlikely I','Critical'),
                                ('Unlikely II','Minor'),
                                ('Unlikely II','Moderate'),
                                ('Rare','Minor'),
                                ('Occasional','Minor')],
                         'Medium':[('Unlikely I','Catastrophic'),
                                   ('Unlikely II','Major'),
                                   ('Unlikely II','Critical'),
                                   ('Rare','Moderate'),
                                   ('Rare','Major'),
                                   ('Occasional','Moderate'),
                                   ('Expected','Minor')],
                         'High':[('Unlikely II','Catastrophic'),
                                 ('Rare','Critical'),
                                 ('Occasional','Major'),
                                 ('Expected','Moderate')],
                         'Extreme':[('Rare','Catastrophic'),
                                    ('Occasional','Catastrophic'),
                                    ('Occasional','Critical'),
                                    ('Expected','Catastrophic'),
                                    ('Expected','Critical'),
                                    ('Expected','Major')]}}

    # Flatten the rank table into one (likelihood, consequence) -> rank lookup.
    # A dictionary lookup per row replaces the former row-wise DataFrame.apply
    # with nested np.where calls, giving the same labels without building a
    # Series for every row.
    rank_by_cell = {cell: rank
                    for rank, cells in risk_ranks['ORM'].items()
                    for cell in cells}
    temp['Risk Category'] = [rank_by_cell.get(cell, 'Extreme')
                             for cell in zip(temp['Likelihood Category'],
                                             temp['Consequence Category'])]

    temp_pt = temp.pivot_table(index='Likelihood Category',
                               columns='Consequence Category',
                               values=values,
                               aggfunc=agg,
                               dropna=False, margins=True, margins_name='Total',
                               fill_value=0.00)

    # 'Total' is deliberately not a category here: it turns into NaN so that the
    # five likelihood rows can be sorted by severity by the caller.
    temp_pt.index = pd.CategoricalIndex(temp_pt.index,
                                        categories=likelihood_labels,
                                        ordered=True)
    return temp, temp_pt


def risk_matrix(x):
    """Build the CSS frame that colours the top-left 5 x 5 block of a matrix.

    Intended for ``Styler.apply(risk_matrix, axis=None)``: the result has the
    same index and columns as ``x`` and holds a ``background-color`` rule for
    each of the first five rows and columns; every other cell is an empty
    string. Colours are assigned purely by position, so ``x`` must have at
    least five rows and five columns (otherwise ``iloc`` raises IndexError) and
    must already be in the row/column order the layout below expects.

    Parameters
    ----------
    x : pandas.DataFrame
        The table being styled.

    Returns
    -------
    pandas.DataFrame
        CSS strings, shaped and labelled like ``x``.
    """
    css_by_code = {
        'G': 'background-color: #81b581',  # green
        'Y': 'background-color: #ffff00',  # yellow
        'O': 'background-color: #ffa600',  # orange
        'R': 'background-color: #c21111',  # red
    }

    # One string per row position, one letter per column position.
    layout = (
        'YORRR',
        'GYORR',
        'GYYOR',
        'GGYYO',
        'GGGGY',
    )

    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    for row_pos, row_codes in enumerate(layout):
        for col_pos, code in enumerate(row_codes):
            styles.iloc[row_pos, col_pos] = css_by_code[code]
    return styles


In [44]:
# Query expression that drops every row whose `Line Name` contains one of the
# listed names (case-insensitive substring match).
filter_str = " & ".join(
    "~`Line Name`.str.contains('" + line_name + "', case=False)"
    for line_name in ('HINTON-HARGREAVES', 'DARFIELD-BLACK PINES')
)

In [50]:


# Categorise the current run (with the `filter_str` exclusions applied) and
# total the segment length per likelihood / consequence cell.
df1_dat, df1_mat = build_matrix(
    df=df1.query(filter_str),
    threat='Mechanical Damage Resident Failure Frequency perkmyr',
    consequence='Consequence',
    agg='sum',
)

# ross_dat['Threat Driver'] = ((ross_dat[['External Corrosion (22%)',
#                                       'Stress Corrosion Cracking (2%)',
#                                       '3rd Party Damage (41%)',
#                                       'Manufacturing Defect (30%)',
#                                       'Construction Threat (5%)']])*np.array([0.22, 0.02, 0.41, 0.30, 0.05])).idxmax(axis=1)


# Sort the likelihood rows in descending category order, restore the 'Total'
# label on the margin row (its index label is NaN after build_matrix), then
# colour and format the table. From here on `df1_mat` is a Styler; the
# underlying frame is available as `df1_mat.data`.
df1_mat = (
    df1_mat
    .sort_index(ascending=False)
    .rename(index={np.nan:'Total'})
    .style
    .apply(risk_matrix, axis=None)
    .set_properties(**{'width': '50px',
                       'height':'50px',
                       'border':'1px solid #aaaaaa',
                       'text-align':'center'})
    .set_table_styles([{'selector':'th','props':[('border','1px solid #aaaaaa')]}])
)

# green = '#92d050'
# blue = '#00b0f0'
# yellow = '#ffff00'
# red = '#ff0000'

# ross_plot = ross_dat.groupby('Risk Category').sum()['Length (m)']
# colors = ross_plot.index.map({'Low':green,'Medium':blue,'High':yellow,'Extreme':red}).to_list()
# ross_plot = ross_plot.plot(kind='bar', color=colors)
# ross_drive = ross_dat.pivot_table(index='Threat Driver', 
#                                   columns='Risk Category', 
#                                   values='Length (m)',
#                                   aggfunc=sum,
#                                  dropna=False, margins=True, margins_name='Total',
#                                  fill_value=0.00)

# Render the styled current-run matrix.
display(df1_mat)
# display(ross_plot)
# NOTE(review): no plotting call is active in this cell, so plt.show()
# presumably has nothing to draw - confirm before removing it.
plt.show()
# display(ross_drive)
# #.to_excel('rossdale_mat.xlsx')


#.background_gradient(cmap='Reds', axis=None)


Consequence Category,Minor,Moderate,Major,Critical,Catastrophic,Total
Likelihood Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Expected,0,0.0,21.437,10.307,211.009,242.753
Occasional,0,0.0,249.806,499.615,0.0,749.421
Rare,0,6.598,242.754,0.0,484.244,733.596
Unlikely II,0,0.0,8.939,184.756,56.054,249.749
Unlikely I,0,38796.188,235827.216,600514.678,397375.385,1272513.467
Total,0,38802.786,236350.152,601209.356,398126.692,1274488.986


In [46]:


# Categorise the previous run and total the segment length per likelihood /
# consequence cell.
# NOTE(review): unlike the current-run cell, no `filter_str` exclusion is
# applied to df2 here - confirm that this is intended.
df2_dat, df2_mat = build_matrix(
    df=df2,
    threat='Mechanical Damage Resident Failure Frequency perkmyr',
    consequence='Consequence',
    agg='sum',
)

# ross_dat['Threat Driver'] = ((ross_dat[['External Corrosion (22%)',
#                                       'Stress Corrosion Cracking (2%)',
#                                       '3rd Party Damage (41%)',
#                                       'Manufacturing Defect (30%)',
#                                       'Construction Threat (5%)']])*np.array([0.22, 0.02, 0.41, 0.30, 0.05])).idxmax(axis=1)


# Sort the likelihood rows in descending category order, restore the 'Total'
# label on the margin row (its index label is NaN after build_matrix), then
# colour and format the table. From here on `df2_mat` is a Styler; the
# underlying frame is available as `df2_mat.data`.
df2_mat = (
    df2_mat
    .sort_index(ascending=False)
    .rename(index={np.nan:'Total'})
    .style
    .apply(risk_matrix, axis=None)
    .set_properties(**{'width': '50px',
                       'height':'50px',
                       'border':'1px solid #aaaaaa',
                       'text-align':'center'})
    .set_table_styles([{'selector':'th','props':[('border','1px solid #aaaaaa')]}])
)

# green = '#92d050'
# blue = '#00b0f0'
# yellow = '#ffff00'
# red = '#ff0000'

# ross_plot = ross_dat.groupby('Risk Category').sum()['Length (m)']
# colors = ross_plot.index.map({'Low':green,'Medium':blue,'High':yellow,'Extreme':red}).to_list()
# ross_plot = ross_plot.plot(kind='bar', color=colors)
# ross_drive = ross_dat.pivot_table(index='Threat Driver', 
#                                   columns='Risk Category', 
#                                   values='Length (m)',
#                                   aggfunc=sum,
#                                  dropna=False, margins=True, margins_name='Total',
#                                  fill_value=0.00)

# Render the styled previous-run matrix.
display(df2_mat)
# display(ross_plot)
# NOTE(review): no plotting call is active in this cell, so plt.show()
# presumably has nothing to draw - confirm before removing it.
plt.show()
# display(ross_drive)
# #.to_excel('rossdale_mat.xlsx')


#.background_gradient(cmap='Reds', axis=None)


Consequence Category,Minor,Moderate,Major,Critical,Catastrophic,Total
Likelihood Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Expected,0,0,504.432,286.936,432.964,1224.332
Occasional,0,0,471.946,1923.781,1790.42,4186.147
Rare,0,0,457.502,242.754,1228.438,1928.694
Unlikely II,0,0,559.336,1785.619,622.257,2967.212
Unlikely I,0,0,210539.969,488953.034,564688.78,1264181.783
Total,0,0,212533.185,493192.124,568762.859,1274488.168


In [47]:
# Cell-by-cell change (current minus previous), shown with the same colouring
# and formatting as the two matrices.
(
    (df1_mat.data - df2_mat.data)
    .style
    .apply(risk_matrix, axis=None)
    .set_properties(**{'width': '50px',
                       'height':'50px',
                       'border':'1px solid #aaaaaa',
                       'text-align':'center'})
    .set_table_styles([{'selector':'th','props':[('border','1px solid #aaaaaa')]}])
)

Consequence Category,Minor,Moderate,Major,Critical,Catastrophic,Total
Likelihood Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Expected,0,0.0,-482.995,-276.629,-221.955,-981.579
Occasional,0,0.0,-222.14,-1424.166,-1790.42,-3436.726
Rare,0,6.598,-214.748,-242.754,-744.194,-1195.098
Unlikely II,0,0.0,-550.397,-1600.863,-566.203,-2717.463
Unlikely I,0,38796.188,25287.247,111561.644,-167313.395,8331.684
Total,0,38802.786,23816.967,108017.232,-170636.167,0.818


In [48]:
# Side-by-side 'Total' column of both runs per likelihood row, plus the
# current-minus-previous difference.
(
    df1_mat.data['Total'].to_frame()
    .merge(
        df2_mat.data['Total'].to_frame(),
        left_index=True,
        right_index=True,
        suffixes=('_current','_previous'),
    )
    .assign(meterChange=lambda d: (d.Total_current-d.Total_previous))
)

Unnamed: 0_level_0,Total_current,Total_previous,meterChange
Likelihood Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Expected,242.753,1224.332,-981.579
Occasional,749.421,4186.147,-3436.726
Rare,733.596,1928.694,-1195.098
Unlikely II,249.749,2967.212,-2717.463
Unlikely I,1272513.467,1264181.783,8331.684
Total,1274488.986,1274488.168,0.818
