# Inspections EDA

In [None]:
import sys
sys.path.append('../')

import pipeline.sql as plsql
import pipeline.eda as pleda
import eda.eda_functions as lceda

from plotnine import *
import pandas as pd

In [None]:
# Create the database engine from the settings in ../config.yaml.
engine = plsql.create_engine('../config.yaml')

In [None]:
# Bare database role name. Later cells interpolate it into "set role {};"
# and pass it as an argument to the pleda helper functions.
role = 'direccion_trabajo_inspections_write'

### Metadata

In [None]:
# Fetch every variable name in raw.metadata together with its Spanish and
# English description.
qry = """
        set role direccion_trabajo_inspections_write; select variable_name, 
        description_spanish, description_english from raw.metadata;
        """

In [None]:
# Run the metadata query; the resulting frame is the cell output.
pd.read_sql_query(qry, engine)

### Inspections complete

In [None]:
# Per-region summary of raw.inspections_complete: distinct ccae values
# (numindus), distinct office codes (numoffice), summed infra (numinfra) and
# row count (total).
qry = """
        set role direccion_trabajo_inspections_write;
        select region, count(distinct ccae) as numindus, 
            count(distinct codoficina) as numoffice, sum(infra) as numinfra, count(*) as total
        from raw.inspections_complete
        group by region
        order by region;
    """

In [None]:
# Load the per-region summary; the bar charts in the following cells read it.
df = pd.read_sql_query(qry, engine)


In [None]:
(ggplot(df, aes('region', 'numoffice') ) +
    geom_bar(stat = 'identity', fill = "purple") +
    #scale_x_continuous(breaks = range(0, 16)) +
    theme_bw() + 
    #coord_flip() +
    ggtitle("Number of Offices by Region"))

In [None]:
(ggplot(df, aes('region', 'numinfra') ) +
    geom_bar(stat = 'identity', fill = "purple") +
    scale_x_continuous(breaks = range(0, 16)) +
    theme_bw() + 
    #coord_flip() +
    ggtitle("Number of Infractions by Region"))

In [None]:
(ggplot(df, aes('region', 'total') ) +
    geom_bar(stat = 'identity', fill = "purple") +
    #scale_x_continuous(breaks = range(0, 16)) +
    theme_bw() + 
    #coord_flip() +
    ggtitle("Number of Inspections by Region"))

In [None]:
(ggplot(df, aes('region', 'numindus') ) +
    geom_bar(stat = 'identity', fill = "purple") +
    #scale_x_continuous(breaks = range(0, 16)) +
    theme_bw() + 
    #coord_flip() +
    ggtitle("Number of Industries by Region"))

In [None]:
qry = """
        set role direccion_trabajo_inspections_write;
        select agno, region, count(distinct codoficina) as numoffice, sum(infra) as numinfra, count(*)p as total
        from raw.inspections_complete
        group by agno, region
        order by region, agno;
    """

In [None]:
df = pd.read_sql_query(qry, engine)


In [None]:
(ggplot(df, aes('agno', 'total')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    facet_wrap('~region') + 
    ggtitle("Number of inspections over time by region"))

In [None]:
(ggplot(df, aes('agno', 'numoffice')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    facet_wrap('~region') + 
    ggtitle("Number of offices over time by region"))

In [None]:
(ggplot(df, aes('agno', 'numinfra')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    facet_wrap('~region', scales = 'free') + 
    ggtitle("Number of infractions over time by region"))

In [None]:
# Per-year summary of raw.inspections_complete across all regions: distinct
# office codes (numoffice), summed infra (numinfra) and row count (total).
qry = """
        set role direccion_trabajo_inspections_write;
        select agno, count(distinct codoficina) as numoffice, sum(infra) as numinfra, count(*) as total
        from raw.inspections_complete
        group by agno
        order by agno;
    """

In [None]:
# Load the per-year summary; the three time-series plots below read it.
df = pd.read_sql_query(qry, engine)

In [None]:
(ggplot(df, aes('agno', 'numinfra')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    ggtitle("Number of infractions over time") + 
    ylim(0,150000))

In [None]:
(ggplot(df, aes('agno', 'total')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    ggtitle("Number of inspections over time") +
    ylim(0,200000))

In [None]:
(ggplot(df, aes('agno', 'numoffice')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    ggtitle("Number of offices active each year") + 
    ylim(0,140))

In [None]:
# Same per-region aggregates as the earlier summary (without the office
# count), this time ordered by the number of inspection rows.
qry = """
        set role direccion_trabajo_inspections_write;
        select region, count(distinct ccae) as numindus, sum(infra) as numinfra, count(*) as total
        from raw.inspections_complete
        group by region
        order by total;
    """
df = pd.read_sql_query(qry, engine)

In [None]:
# Show the table itself instead of plotting it.
df

In [None]:
# Row count and summed infra per (year, month). The two aggregates carry no
# alias; the plots below refer to them by the column names 'count' and 'sum'.
df = pd.read_sql_query("""set role direccion_trabajo_inspections_write;
                       select agno, mesreg, count(*), sum(infra) from raw.inspections_complete 
                       group by agno, mesreg order by agno, mesreg;""", engine)

In [None]:
# Summed infractions by month, one panel per year.
(ggplot(df, aes('mesreg', 'sum')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    facet_wrap('~agno') + 
    ggtitle("Number of infractions over months by year"))

In [None]:
# Inspection rows by month, one panel per year.
(ggplot(df, aes('mesreg', 'count')) + 
    geom_point() + 
    geom_line() + 
    theme_bw() + 
    theme(axis_text_x=element_text(angle=90)) +
    facet_wrap('~agno') + 
    ggtitle("Number of inspections over months by year"))

In [None]:
plsql.query("select count(distinct rutempresamask), agno from raw.inspections_complete group by agno;")

In [None]:
plsql.query("select count(distinct rutempresamask), agno from raw.taxes group by agno;")

In [None]:
role = "set role direccion_trabajo_inspections_write;"
df = plsql.query("{} select cast(ntrabajadores as int), 1 as cat from raw.taxes;".format(role), engine)

In [None]:
pleda.count_nulls(engine, 'direccion_trabajo_inspections_write', 'raw', 'taxes', 'ntrabajadores')

In [None]:
df.ntrabajadores.min()

In [None]:
df.ntrabajadores.max()

In [None]:
df.ntrabajadores.mean()

In [None]:
df.ntrabajadores.median()

In [None]:
(ggplot(df, aes(x = 'cat', y = 'ntrabajadores')) + 
    geom_boxplot() +
    coord_flip())

In [None]:
(ggplot(df, aes('ntrabajadores')) + 
    geom_density())

In [None]:
df = plsql.query("{} select cast(emptrabhombres as int), 1 as cat from raw.inspections_complete where emptrabhombres != '99999';".format(role), engine)

In [None]:
pleda.count_nulls(engine, 'direccion_trabajo_inspections_write', 'raw', 'inspections_complete', 'emptrabhombres')

In [None]:
df.emptrabhombres.min()

In [None]:
df.emptrabhombres.max()

In [None]:
df.emptrabhombres.mean()

In [None]:
df.emptrabhombres.median()

In [None]:
(ggplot(df, aes(x = 'cat', y = 'emptrabhombres')) + 
    geom_boxplot() +
    coord_flip())

### NULL values

In [None]:
df_nulls = pleda.proportion_nulls_all_columns(engine, role, 'raw', 'inspections_complete')
df_nulls.head()

In [None]:
(ggplot(df_nulls, aes(x = 'column_name', y = "1", fill = 'proportion')) +
     geom_tile()+ 
    theme(axis_text_x = element_text(angle = 90),
          figure_size = (7, 2)) + 
    ggtitle("Inspections NULL"))

In [None]:
qry = """set role {};
    select *
    from raw.inspections_complete
    limit 3;""". format(role)
colnames = plsql.query(qry, engine).columns
colnames

In [None]:
len(colnames)

In [None]:
def description_colname(colname, schema='raw', table='inspections_complete'):
    """Print a quick profile of one column and return its grouped counts.

    The function prints the column name, the result of ``pleda.count_nulls``
    and a ``describe()`` summary of the column's values as found in the
    table returned by ``pleda.rows_by_group``. It reads the notebook-level
    ``engine`` and ``role`` globals.

    Parameters
    ----------
    colname : str
        Name of the column to profile.
    schema : str, optional
        Schema holding the table. Defaults to ``'raw'``, the value that was
        previously hard-coded.
    table : str, optional
        Table to profile. Defaults to ``'inspections_complete'``, the value
        that was previously hard-coded.

    Returns
    -------
    The table returned by ``pleda.rows_by_group`` for this column; it is
    indexed by ``colname`` here, so it is expected to be a DataFrame.
    """
    print(colname)
    print('\nNulls:')
    print(pleda.count_nulls(engine, role, schema, table, colname))
    tab = pleda.rows_by_group(engine, role, schema, table, colname)
    print('\nDescription:')
    print(tab[colname].describe())
    print('\n')
    return tab

In [None]:
# Column 0: profile it, then summarise the 'count' column of the grouped
# table (presumably rows per distinct value -- confirm against
# pleda.rows_by_group).
tab = description_colname(colnames[0])
# unique value
tab['count'].describe() 

In [None]:
# Column 1 (accessed below as codoficina): look for the value 99.
tab = description_colname(colnames[1])
print(sum(tab.codoficina == 99))
# Second NULL count with na='99' and ind='false'; presumably this counts
# '99' as the missing marker -- confirm against pleda.count_nulls.
print(pleda.count_nulls(engine, role, 'raw', 'inspections_complete', colnames[1], 
                        na = '99', ind = 'false'))
tab.shape

In [None]:
# Left-join the office codes seen in inspections against raw.office_data and
# report how many have no match (null codeoffice after the merge).
qry = """set role {};
    select * 
    from raw.office_data;
    """.format(role)
tt = tab.merge(plsql.query(qry, engine), left_on = 'codoficina', right_on = 'codeoffice', how='left')
# NOTE(review): this head() is not the last expression of the cell, so a
# notebook will not echo it.
tt.head()
tt[pd.isnull(tt['codeoffice'])].shape

In [None]:
# Column 2 (accessed below as agno): look for the value 99, repeat the NULL
# count, and show the full grouped table.
tab = description_colname(colnames[2])
print(sum(tab.agno == 99))
print(pleda.count_nulls(engine, role, 'raw', 'inspections_complete', colnames[2]))
tab

In [None]:
# Column 3 (accessed below as nrocomision): look for the values 99 and 9999.
tab = description_colname(colnames[3])
print(sum(tab.nrocomision == 99))
print(sum(tab.nrocomision == 9999))

In [None]:
# Check whether nrocomision values match office codes: left-join against
# raw.office_data and count the rows with no match.
qry = """set role {};
    select * 
    from raw.office_data;
    """.format(role)
tt = tab.merge(plsql.query(qry, engine), left_on = 'nrocomision', right_on = 'codeoffice', how='left')
print( tt.head() )
print( tab.shape )
print( tt[pd.isnull(tt['codeoffice'])].shape )
print( sum(pd.isnull(tt['codeoffice'])) )

In [None]:
# Column 4.
tab = description_colname(colnames[4])

In [None]:
# Recode totalafectados so that 9999 and 'NaN' become NULL, group by the
# recoded value, and confirm that no 9999 group is left.
qry = """set role {};
    select 
    case when totalafectados in (9999, 'NaN') then NULL else totalafectados end as totalafectados_rec, count(*)
    from raw.inspections_complete
    group by totalafectados_rec;""".format(role)
tab = plsql.query(qry, engine)
sum(tab.totalafectados_rec == 9999)

In [None]:
# Column 5.
description_colname(colnames[5])

In [None]:
# Column 6.
description_colname(colnames[6])

In [None]:
# Columns 7 and 8, then the cross-tabulation of codtiposol against
# solicitante to see how the two columns pair up.
description_colname(colnames[7])
description_colname(colnames[8])
qry = """set role {};
    select codtiposol, solicitante, count(*) 
    from raw.inspections_complete
    group by codtiposol, solicitante;
    """.format(role)
plsql.query(qry, engine)

In [None]:
# Columns 9 and 10, then the cross-tabulation of codunidadorigen against
# unidadorigen.
description_colname(colnames[9])
description_colname(colnames[10])
qry = """set role {};
    select codunidadorigen, unidadorigen, count(*) 
    from raw.inspections_complete
    group by codunidadorigen, unidadorigen;
    """.format(role)
plsql.query(qry, engine)

In [None]:
# Columns 11 and 12, then the cross-tabulation of codtipotermino against
# tipotermino.
description_colname(colnames[11])
description_colname(colnames[12])
qry = """set role {};
    select codtipotermino, tipotermino, count(*) 
    from raw.inspections_complete
    group by codtipotermino, tipotermino;
    """.format(role)
plsql.query(qry, engine)

In [None]:
# Column 13: print the profile and the returned grouped table.
print( description_colname(colnames[13]) )

In [None]:
tab = description_colname(colnames[14])
print( sum(tab.empdfcodcomuna == 99) )
print( sum(tab.empdfcodcomuna == 999) )
print( sum(tab.empdfcodcomuna == 9999) )
tab.empdfcodcomuna = tab.empdfcodcomuna.astype('float')
tab.empdfcodcomuna.describe()
print( tab[tab.empdfcodcomuna < 1] )
print( tab[tab.empdfcodcomuna > 3].empdfcodcomuna.describe() )
print( tab[tab.empdfcodcomuna > 1].empdfcodcomuna.nunique() )

In [None]:
tab = description_colname(colnames[15])
print( sum(tab.empdmcodcomuna == 99) )
print( sum(tab.empdmcodcomuna == 999) )
print( sum(tab.empdmcodcomuna == 9999) )
tab.empdmcodcomuna = tab.empdmcodcomuna.astype('float')
print( tab.empdmcodcomuna.describe() )
print( tab[tab.empdmcodcomuna < 1] )
print( tab[tab.empdmcodcomuna > 1].empdmcodcomuna.describe() )
print( tab[tab.empdmcodcomuna > 1].empdmcodcomuna.nunique() )

In [None]:
tab = description_colname(colnames[16])
print( sum(tab.emptrabhombres == 99) )
print( sum(tab.emptrabhombres == 999) )
print( sum(tab.emptrabhombres == 9999) )
print( sum(tab.emptrabhombres == 99999) )

In [None]:
# Column 17 (codcae): list the groups holding the placeholder-looking values
# '0', '1', '-1', 'None', and the groups where the code is null.
tab = description_colname(colnames[17])
# NOTE(review): this nunique() is not the last expression of the cell, so a
# notebook will not echo it.
tab.codcae.nunique()
print( tab[tab.codcae.isin(['0', '1', '-1', 'None'])] )
print( tab[pd.isnull(tab.codcae)].codcae )

In [None]:
# Column 18.
description_colname(colnames[18])

In [None]:
# Columns 19 through 31: print the index, then the profile of each one.
for i in range(19, 32):
    print(i)
    tab = description_colname(colnames[i])

In [None]:
# Columns 32 and 33, then the cross-tabulation of ccae against gcae.
print( description_colname(colnames[32]).head() )
print( description_colname(colnames[33]).head() )
qry = """set role {};
    select ccae, gcae, count(*) 
    from raw.inspections_complete
    group by ccae, gcae;
    """.format(role)
plsql.query(qry, engine)

In [None]:
# Columns 34 and 35, then the cross-tabulation of crae against grae.
print( description_colname(colnames[34]).head() )
print( description_colname(colnames[35]).head() )
qry = """set role {};
    select crae, grae, count(*) 
    from raw.inspections_complete
    group by crae, grae;
    """.format(role)
plsql.query(qry, engine)

In [None]:
# Columns 36 and 37 (only the printed profiles are of interest).
tab = description_colname(colnames[36])
tab = description_colname(colnames[37])

In [None]:
# Column 38: profile plus the full grouped table.
tab = description_colname(colnames[38])
tab

In [None]:
# Column 39.
description_colname(colnames[39])

In [None]:
# Column 40.
description_colname(colnames[40])

In [None]:
# Column 41: print the profile and the returned grouped table.
print( description_colname(colnames[41]) )

In [None]:
# Column 42: print the profile and the returned grouped table.
print( description_colname(colnames[42]) )

In [None]:
# Column 43: print the profile and the returned grouped table.
print( description_colname(colnames[43]) )

In [None]:
# Column 44 (accessed below as datereg): parse it as datetimes and summarise.
tab = description_colname(colnames[44])
# NOTE(review): this head() is not the last expression of the cell, so a
# notebook will not echo it.
tab.head()
pd.to_datetime(tab.datereg).describe()

In [None]:
# Column 45.
description_colname(colnames[45])

In [None]:
# Column 46.
description_colname(colnames[46])

In [None]:
colnames

In [None]:
qry = """set role {};
    select idfiscalizacion, codoficina, 
    agno, 
    datereg, 
    mesreg, 
    date(agno || '-' || mesreg || '-01') as datereg_monthyear,
    nrocomision, 
    case when totalafectados in (9999, 'NaN') then NULL else totalafectados end as totalafectados_rec,
    urgencia, 
    solesafectado, 
    codtipotermino, tipotermino,
    case when empdfcodcomuna in ('0', '-1', '3') then NULL else empdfcodcomuna end as empdfcodcomuna_rec,
    case when empdmcodcomuna in ('0', '-1', 'None') then NULL else empdmcodcomuna end as empdmcodcomuna_rec,
    egresoconmulta, emptrabhombres, 
    codtipoempresa, 
    grupocodtipomaterias, 
    grupocodtipomaterias2,
    grupoglosatipomaterias,
    grupoglosatipomaterias2, 
    grupoglosainfra, 
    grupoglosainfra2, 
    grupocodigoinfra, 
    grupocodigoinfra2, 
    grupoglosainfra_det, 
    grupoglosainfra2_det, 
    grupocodigoinfra_det,
    grupocodigoinfra2_det, 
    grupocodigonormainfra2_det,
    grupocodigonormainfra2_det,
    case when codcae in ('0', '-1', 'None', '1') then NULL else codcae end as codcae_rec,
    ccae, gcae, 
    case when crae in ('1') then '101' else crae end as crae_rec,
    case when grae in ('AGRICULTURA, CAZA, SILVICULTURA Y PESCA                                         ') then 'agricultura, ganadería, caza y silvicultura' else lower(grae) end as grae_rec,
    infra, noinfra, derechofund, num_materias, num_sind, region, 
    infractor, exsind
    rutempresamask
    from raw.inspections_complete
    limit 4;""".format(role)
plsql.query(qry, engine)

In [None]:
# Preview (4 rows) of the "_union" columns. Each one merges a pair of source
# columns (X and X2) with the same rule:
#   * if X is not '99', use X;
#   * otherwise use X2, unless X2 is also '99', in which case the result is
#     NULL.
# For the two code-list columns the separator is normalised to '|' on the
# way: ',' in X and ';' in X2.
# The ELSE branches used to re-test X = '99' in a nested CASE. That test could
# never be true there, because the outer CASE had already taken that branch,
# so it is dropped; results are unchanged, including when X is NULL.
# grupocodigonormainfra2_det has no partner column; only its '99' is nulled.
qry = """SET ROLE {};
    SELECT idfiscalizacion, codoficina,
    CASE WHEN grupocodtipomaterias = '99' THEN
        ( CASE WHEN grupocodtipomaterias2 = '99' THEN NULL
            ELSE regexp_replace(grupocodtipomaterias2, ';', '|', 'g') END)
        ELSE regexp_replace(grupocodtipomaterias, ',', '|', 'g') END
        AS grupocodtipomaterias_union,
    CASE WHEN grupoglosatipomaterias = '99' THEN
        ( CASE WHEN grupoglosatipomaterias2 = '99' THEN NULL
            ELSE grupoglosatipomaterias2 END)
        ELSE grupoglosatipomaterias END
        AS grupoglosatipomaterias_union,
    CASE WHEN grupoglosainfra = '99' THEN
        ( CASE WHEN grupoglosainfra2 = '99' THEN NULL
            ELSE grupoglosainfra2 END)
        ELSE grupoglosainfra END
        AS grupoglosainfra_union,
    CASE WHEN grupocodigoinfra = '99' THEN
        ( CASE WHEN grupocodigoinfra2 = '99' THEN NULL
            ELSE regexp_replace(grupocodigoinfra2 , ';', '|', 'g') END)
        ELSE regexp_replace(grupocodigoinfra , ',', '|', 'g') END
        AS grupocodigoinfra_union,
    CASE WHEN grupoglosainfra_det = '99' THEN
        ( CASE WHEN grupoglosainfra2_det = '99' THEN NULL
            ELSE grupoglosainfra2_det END)
        ELSE grupoglosainfra_det END
        AS grupoglosainfra_det_union,
    CASE WHEN grupocodigoinfra_det = '99' THEN
        ( CASE WHEN grupocodigoinfra2_det = '99' THEN NULL
            ELSE grupocodigoinfra2_det END)
        ELSE grupocodigoinfra_det END
        AS grupocodigoinfra_det_union,
    CASE WHEN grupocodigonormainfra2_det = '99'
        THEN NULL ELSE
        grupocodigonormainfra2_det
        END AS grupocodigonormainfra2_det
    FROM raw.inspections_complete
    LIMIT 4;""".format(role)
plsql.query(qry, engine)

# Inspections cleaned

In [None]:
# Preview four rows of the cleaned inspections table.
qry = """SET ROLE {};
    SELECT *
    FROM cleaned.inspections_complete
    LIMIT 4;""".format(role)
plsql.query(qry, engine)

### Matters inspected

In [None]:
# Profile the inspected-matter codes at each stage: the two raw source
# columns, the merged "_union" column in the cleaned table, and matter_code
# in cleaned.inspected_matters.
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodtipomaterias")

In [None]:
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodtipomaterias2")

In [None]:
lceda.description_colname(engine, role, "cleaned", "inspections_complete", "grupocodtipomaterias_union")

In [None]:
lceda.description_colname(engine, role, "cleaned", "inspected_matters", "matter_code")

### Matters infracted

In [None]:
# Profile the infracted-matter codes at each stage: the two raw source
# columns, the merged "_union" column in the cleaned table, and matter_code
# in cleaned.infracted_matters.
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodigoinfra")

In [None]:
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodigoinfra2")

In [None]:
lceda.description_colname(engine, role, "cleaned", "inspections_complete", "grupocodigoinfra_union")

In [None]:
lceda.description_colname(engine, role, "cleaned", "infracted_matters", "matter_code")

### Matters infracted detailed

In [None]:
# Profile the detailed infracted-matter codes at each stage: the two raw
# "_det" source columns, the merged "_det_union" column in the cleaned table,
# and matter_code in cleaned.infracted_matters_detailed.
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodigoinfra_det")

In [None]:
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodigoinfra2_det")

In [None]:
lceda.description_colname(engine, role, "cleaned", "inspections_complete", "grupocodigoinfra_det_union")

In [None]:
lceda.description_colname(engine, role, "cleaned", "infracted_matters_detailed", "matter_code")

### Matters infracted updated book

In [None]:
# Profile the grupocodigonormainfra2_det codes at each stage: the raw column,
# the "_union" column in the cleaned table, and matter_code in
# cleaned.infracted_matters_updatedbook.
lceda.description_colname(engine, role, "raw", "inspections_complete", "grupocodigonormainfra2_det")

In [None]:
lceda.description_colname(engine, role, "cleaned", "inspections_complete", "grupocodigonormainfra2_det_union")

In [None]:
lceda.description_colname(engine, role, "cleaned", "infracted_matters_updatedbook", "matter_code")