# Deep Dive

In [None]:
import sys
sys.path.append('../')

import pipeline.sql as plsql
import pipeline.eda as pleda
import eda.functions_eda as lceda


from plotnine import *
import pandas as pd
from mizani.breaks import date_breaks
from mizani.formatters import date_format
from datetime import date

# Make plotnine's black-and-white theme the default for the plots below.
theme_set(theme_bw())

In [None]:
def custom_date_format2(breaks):
    """
    Build axis labels for a sequence of date breaks.

    January 1st is labelled with the four-digit year, any other break that
    falls in an odd-numbered month with the abbreviated month name, and
    every remaining break with an empty string.

    Parameters
    ----------
    breaks : iterable of date-like objects exposing ``month`` and ``day``

    Returns
    -------
    list of str, one label per break, in the same order.
    """
    def pick_format(stamp):
        # Start of a year -> show the year itself.
        if stamp.month == 1 and stamp.day == 1:
            return '%Y'
        # Odd months (Jan, Mar, May, ...) -> abbreviated month name.
        if stamp.month % 2 == 1:
            return '%b'
        # Even months stay unlabelled to keep the axis readable.
        return ''

    return [date.strftime(stamp, pick_format(stamp)) for stamp in breaks]

def custom_date_format3(breaks):
    """
    Build axis labels for a sequence of date breaks.

    Any break in January (whatever the day) is labelled with the four-digit
    year; every other break gets the abbreviated month name.

    Parameters
    ----------
    breaks : iterable of date-like objects exposing ``month``

    Returns
    -------
    list of str, one label per break, in the same order.
    """
    return [
        date.strftime(stamp, '%Y' if stamp.month == 1 else '%b')
        for stamp in breaks
    ]


def custom_date_breaks(width=None):
    """
    Return a break function that keeps only breaks in odd-numbered months.

    The breaks themselves are produced by ``date_breaks(width)``; the
    returned callable takes the axis limits, asks ``date_breaks`` for the
    candidate breaks and drops every one whose month is even.

    Parameters
    ----------
    width : forwarded unchanged to ``date_breaks``

    Returns
    -------
    callable mapping ``limits`` to a list of breaks.
    """
    def filter_func(limits):
        make_breaks = date_breaks(width)
        kept = []
        for candidate in make_breaks(limits):
            # Odd month numbers: Jan, Mar, May, ...
            if candidate.month % 2 != 0:
                kept.append(candidate)
        return kept

    return filter_func

In [None]:
# Engine handed to every plsql.query call below; created from '../config.yaml'.
engine = plsql.create_engine('../config.yaml')
# Role name substituted into the `set role` statement of the formatted queries.
role = 'direccion_trabajo_inspections_write'

In [None]:
# Per agno: number of distinct idfiscalizacion values in
# cleaned.inspections_complete.
qry = """set role {}; 
        select agno, count( distinct idfiscalizacion)
        from cleaned.inspections_complete
        group by agno;""".format(role)
tab = plsql.query(qry, engine)

In [None]:
# Mean of the per-agno counts: over every agno, then only for agno < 2016.
print(tab['count'].mean())
print(tab.loc[tab['agno'] < 2016, 'count'].mean())

In [None]:
# Rows per idfiscalizacion in cleaned.inspected_matters, then summary
# statistics of those per-inspection row counts.
qry = """set role {}; 
        select idfiscalizacion, count( * )
        from cleaned.inspected_matters
        group by idfiscalizacion;""".format(role)
tab = plsql.query(qry, engine)
print( tab['count'].describe() )

In [None]:
# Rows per idfiscalizacion in cleaned.infracted_matters_detailed, then
# summary statistics of those per-inspection row counts.
qry = """set role {}; 
        select idfiscalizacion, count( * )
        from cleaned.infracted_matters_detailed
        group by idfiscalizacion;""".format(role)
tab = plsql.query(qry, engine)
print( tab['count'].describe() )

In [None]:
# Rows per idfiscalizacion in cleaned.infracted_matters_updatedbook, then
# summary statistics of those per-inspection row counts.
qry = """set role {}; 
        select idfiscalizacion, count( * )
        from cleaned.infracted_matters_updatedbook
        group by idfiscalizacion;""".format(role)
tab = plsql.query(qry, engine)
print( tab['count'].describe() )

In [None]:
# Average of num_materias over all rows of cleaned.inspections_complete.
qry = """set role {}; 
        select avg(num_materias)
        from cleaned.inspections_complete;""".format(role)
plsql.query(qry, engine)

In [None]:
# Average of infra over all rows of cleaned.inspections_complete.
qry = """set role {}; 
        select avg(infra)
        from cleaned.inspections_complete;""".format(role)
plsql.query(qry, engine)

In [None]:
# Average of the per-row ratio infra / num_materias.
# NOTE(review): if both columns are integer-typed the database may perform
# integer division, and rows with num_materias = 0 would raise a
# division-by-zero error -- confirm the column types and value range.
qry = """set role {}; 
        select avg(infra/num_materias)
        from cleaned.inspections_complete;""".format(role)
plsql.query(qry, engine)

In [None]:
# Peek at four rows of raw.taxes to see which columns it has.
qry = """set role {}; 
        select *
        from raw.taxes
        limit 4;""".format(role)
plsql.query(qry, engine)

In [None]:
# Per agno: number of distinct rutempresamask values in
# cleaned.inspections_complete, returned as num_facility.
qry = """set role {}; 
        select agno, count( distinct rutempresamask ) as num_facility
        from cleaned.inspections_complete
        group by agno;""".format(role)
tab_insp = plsql.query(qry, engine)

In [None]:
# Display the per-agno num_facility table.
tab_insp

In [None]:
# Per agno: number of distinct rutmask values in raw.taxes, returned as
# num_companies.
qry = """set role {}; 
        select agno, count( distinct rutmask ) as num_companies
        from raw.taxes
        group by agno;""".format(role)
tab_tax = plsql.query(qry, engine)

In [None]:
# Summary statistics of the per-agno num_companies counts.
tab_tax['num_companies'].describe()

In [None]:
# Join the per-agno num_facility counts with the per-agno num_companies counts.
tab = tab_insp.merge(tab_tax, on='agno')
# The 'prop' column was read below without ever being created (KeyError).
# Presumably it is the share of companies in the tax table that were
# inspected in the same agno -- TODO confirm this is the intended ratio.
tab['prop'] = tab['num_facility'] / tab['num_companies']
# Mean proportion over agno < 2016.
tab[tab['agno'] < 2016]['prop'].mean()

### Number of infractions/inspections

In [None]:
# Per agno: inspections with infra = 0 (num_infra_zero), all inspections
# (num_insp) and inspections with infra > 0 (num_infra_pos).
# The role is taken from the shared `role` variable, like the earlier queries,
# instead of being repeated as a literal.
qry = """set role {};
    select agno, sum( case when infra = 0 then 1 else 0 end) as num_infra_zero,
    count (*) as num_insp, 
    sum(case when infra > 0 then 1 else 0 end) as num_infra_pos
    from cleaned.inspections_complete
    group by agno;""".format(role)
tab = plsql.query(qry, engine)
tab

In [None]:
# Mean over agno of the share of inspections with infra = 0, as a rounded
# percentage.
print( round(100*(tab['num_infra_zero']/tab['num_insp']).mean() ))
# Mean over agno of the share of inspections with infra > 0, as a raw
# fraction (not scaled to percent, unlike the line above).
print( (tab['num_infra_pos']/tab['num_insp']).mean() )

In [None]:
# Per agno: number of inspections with infra = 0.
# The role is taken from the shared `role` variable, like the earlier queries,
# instead of being repeated as a literal.
qry = """set role {};
    select agno, count (*) as num_insp_zero
    from cleaned.inspections_complete
    where infra = 0
    group by agno;""".format(role)
tab_zero = plsql.query(qry, engine)
tab_zero

In [None]:
# Per agno, restricted to inspections with infra > 0: total infra (num_infra)
# and number of inspections (num_insp), ordered by num_infra descending.
# The role is taken from the shared `role` variable, like the earlier queries,
# instead of being repeated as a literal.
qry = """set role {};
    select agno, sum(infra) as num_infra, count (*) as num_insp
    from cleaned.inspections_complete
    where infra > 0
    group by agno
    order by num_infra desc;""".format(role)
tab = plsql.query(qry, engine)
tab

In [None]:
# Infractions per inspection for each agno, plus a one-decimal copy of the
# same ratio.
tab['ratio'] = tab['num_infra'].div(tab['num_insp'])
tab['ratio_lab'] = tab['ratio'].round(1)
tab

In [None]:
# Mean of the per-agno infractions-per-inspection ratio.
tab.ratio.mean()

In [None]:
# One vertical segment per agno running from 1 up to that agno's ratio, with
# a horizontal reference line at ratio = 1.
(ggplot(tab, aes(x = 'agno')) + 
 geom_linerange(aes(ymin = 1, ymax = 'ratio'), size = 4) + 
 geom_hline(yintercept = 1, color = "#e91d63", size = 1) + 
 ylab('infractions / inspections') + 
 xlab('year') + 
 theme(axis_text_x = element_text(rotation = 90, vjust = 1, hjust = 1),
       figure_size = (4, 3)))

In [None]:
# Number of rows of cleaned.inspections_complete per region.
# The role is taken from the shared `role` variable, like the earlier queries,
# instead of being repeated as a literal.
qry = """set role {};
    select region, count(*)
    from cleaned.inspections_complete
    group by region;""".format(role)
tab = plsql.query(qry, engine)

In [None]:
# Cast region to integer so it can be compared and plotted numerically.
tab['region'] = tab['region'].astype('int')
# Boolean flag for region 13, used to highlight it in the plot below.
tab['santiago'] = tab['region'] == 13
tab

In [None]:
# One vertical segment per region up to its inspection count; the flagged
# region (santiago == True) is drawn in pink, the others in gray, and the
# colour legend is suppressed.
# NOTE(review): the segments start at ymin = 1 rather than 0 -- presumably
# carried over from the ratio plot; confirm the intended baseline for counts.
(ggplot(tab, aes(x = 'region')) + 
 geom_linerange(aes(ymin = 1, 
                    ymax = 'count', 
                    color = 'santiago'), size = 4) + 
 ylab('inspections') + 
 xlab('region') + 
 scale_color_manual(values = ("gray", "#e91d63"),
                   guide = False) +
 scale_x_continuous(breaks = range(0, 17)) + 
 theme(figure_size = (4, 3)))

In [None]:
# Monthly aggregates keyed by the first day of each (agno, mesreg) month:
# number of inspections, total infra, total num_materias, and inspections
# split by whether solicitante is 'Por Programa' (proactive) or not (reactive).
# Fixes the misspelled column `num_mateias` (the column is `num_materias` in
# the other queries) and takes the role from the shared `role` variable.
# NOTE(review): this query reads raw.inspections_complete while every other
# query uses cleaned.inspections_complete -- confirm the intended schema.
qry = """set role {};
    select date(agno || '-' || mesreg || '-01') as date,
    count(*) as inspections, 
    sum(infra)  as infractions,
    sum(num_materias) as matters,
    sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
    sum(case when solicitante = 'Por Programa' then 0 else 1 end) as reactive
    from raw.inspections_complete
    group by date;""".format(role)

In [None]:
# Run the most recently defined query and keep the result in `tab`.
tab = plsql.query(qry, engine)

In [None]:
# Inspect the column dtypes of the query result.
tab.dtypes

In [None]:
# Convert the date column to pandas datetimes for the datetime axis below.
tab['date'] = pd.to_datetime(tab.date)
tab.head()

In [None]:
# Boolean flag for the 2014-10-01 row, used as the fill aesthetic to
# highlight that month in the bar chart.
tab['october'] = tab['date'] == '2014-10-01'

In [None]:
# Monthly inspection counts as bars, with the flagged month (october == True)
# filled pink and the rest gray; the fill legend is suppressed.
# Axis labels come from custom_date_format2 on monthly breaks.
# The x tick labels are rotated with `rotation`, the keyword already used by
# the ratio plot earlier in this notebook; `angle` is not a matplotlib text
# property.
(ggplot(tab, aes(x = 'date', y = 'inspections', fill = 'october')) +
    geom_bar(stat = 'identity') +
    scale_x_datetime(
         breaks=date_breaks('1 months'),
         labels=custom_date_format2) +
    scale_fill_manual(values = ("gray", "#e91d63"),
                   guide = False) +
    xlab("month year") +
    theme(axis_text_x = element_text(rotation = 90, size= 6),
         figure_size = (11, 3)))

In [None]:
# Monthly proactive-inspection counts as bars, labelled by
# custom_date_format2 on monthly breaks.
# The x tick labels are rotated with `rotation`, the keyword already used by
# the ratio plot earlier in this notebook; `angle` is not a matplotlib text
# property.
(ggplot(tab, aes(x = 'date', y = 'proactive')) +
    geom_bar(stat = 'identity') +
    scale_x_datetime(
         breaks=date_breaks('1 months'),
         labels=custom_date_format2) +
    xlab("month year") +
    theme(axis_text_x = element_text(rotation = 90, size= 6),
         figure_size = (11, 3)))

In [None]:
# Mean number of proactive inspections per month.
tab.proactive.mean()

In [None]:
import datetime

In [None]:
# Split each date into separate calendar year and month columns.
tab['year'] = tab['date'].map(lambda stamp: stamp.year)
tab['month'] = tab['date'].map(lambda stamp: stamp.month)
tab.head()

In [None]:
def my_agg(x):
    """
    Summarise one group by the means of its inspection-count columns.

    Parameters
    ----------
    x : DataFrame-like with 'proactive', 'reactive' and 'inspections' columns

    Returns
    -------
    pandas.Series indexed by 'mean proactive', 'mean reactive' and
    'mean inspections', in that order.
    """
    columns = ('proactive', 'reactive', 'inspections')
    labels = ['mean ' + column for column in columns]
    means = {label: x[column].mean() for label, column in zip(labels, columns)}
    return pd.Series(means, index=labels)

In [None]:
# Mean monthly proactive / reactive / total inspections per year, sorted by
# the mean proactive count (ascending).
tab.groupby('year').apply(my_agg).reset_index().sort_values('mean proactive')

In [None]:
# Same aggregation per calendar month (across all years), then a bar chart of
# the mean proactive count with one tick per month 1..12.
tt = tab.groupby('month').apply(my_agg).reset_index()
(ggplot(tt, aes(x = 'month', y = 'mean proactive')) + 
        geom_bar(stat = 'identity') + 
        scale_x_continuous(breaks = range(1, 13)))

In [None]:
# Proactive inspections per calendar month as bars, filled by year.
# NOTE(review): `year` holds plain numbers here, so the fill is presumably
# treated as a continuous scale -- confirm whether a categorical fill
# (one colour per year) was intended.
(ggplot(tab, aes(x = 'month', y = 'proactive', fill = 'year')) + 
        geom_bar(stat = 'identity') + 
        scale_x_continuous(breaks = range(1, 13)))

In [None]:
# Display the per-month aggregate table.
tt