# Data Story: Company `a26f86c6ef...05c0750`

In [None]:
import sys
sys.path.append('../')

import pipeline.sql as plsql
import eda.functions_datastory as funsds

import pandas as pd
from plotnine import *

In [None]:
# Build the engine from the project config file (one directory up).
engine = plsql.create_engine("../config.yaml")

# SQL statement prepended to every query in the first part of this notebook
# so that it runs under the project role.
role = "set role direccion_trabajo_inspections_write;"

Let's look at the company with the most inspections.

In [None]:
# Count rows of raw.inspections_complete per masked company id and show the
# five ids with the highest counts.
plsql.query(
    """{}
select rutempresamask, count(*) from raw.inspections_complete group by rutempresamask
order by count desc limit 5;""".format(role),
    engine,
)

The top company has had **7,639 inspections** in **ten years**.

In [None]:
# Masked company id that every query below filters on.
company = "a26f86c6ef9fbb212858c435ae79a9a54546b9043def5e12907852f6c05c0750"

In [None]:
# One-row summary for the selected company: summed infra, summed infractor
# flag, number of 'Por Programa' (labelled proactive) inspections, summed
# urgencia flag and summed num_materias.
plsql.query(
    """{}
select sum(infra) as infractions, sum(cast(infractor as int)) as infractor, 
sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
sum(cast(urgencia as int)) as urgencia,
sum(num_materias) as matters
from raw.inspections_complete
where rutempresamask = '{}';
""".format(role, company),
    engine,
)

In [None]:
# Frequency of each (crae, grae, ccae, gcae, num_sind) combination recorded
# for the company, most frequent first.
plsql.query(
    """{} select crae, grae, ccae, gcae, num_sind, count(*)
from raw.inspections_complete where rutempresamask = '{}' group by crae, grae, ccae, gcae, num_sind
order by count desc;""".format(role, company),
    engine,
)

There's some variation in the industry codes used to describe the economic activities of this company, but the most common code is "TRANSPORTE INTERURBANO DE PASAJEROS VÍA AUTOBUS" or "INTERCITY TRANSPORTATION OF PASSENGERS VIA BUS". Other common codes include "URBAN TRANSPORTATION OF PASSENGERS VIA BUS (COLLECTIVE LOCOMOTION)", "OTHER TYPES OF REGULAR TRANSPORTATION OF PASSENGERS BY TERRESTRIAL ROAD N.C.P.", and "CARGO FREIGHT TRANSPORTATION". 

So, it appears that this company is a major corporation involved in **transportation of passengers and goods** and it is involved with many **unions**.

One other thing I noticed is that many inspections are labeled as "Activities not specified". Is this a result of laziness by inspectors, or versatility by this company?

In [None]:
# Inspection count per region (region cast to int) for the company.
df = plsql.query(
    """{} select cast(region as int), count(*)
from raw.inspections_complete where rutempresamask = '{}' group by region order by region;""".format(role, company),
    engine,
)

In [None]:
# Bar chart of the per-region counts held in df; one tick per integer 0..15.
(
    ggplot(df, aes(x='region', y='count'))
    + geom_bar(stat='identity', fill='purple')
    + scale_x_continuous(breaks=range(16))
    + theme_bw()
    + ggtitle("Number of Inspections by Region")
)

This company has locations all over Chile except in regions XI and XII, which are in the far south and the least populous of all of Chile's regions. There are the greatest number of inspections in regions II, V, XIII, and RM. Regions V, XIII, and RM are the most populous in Chile.

In [None]:
# Per-year (agno) aggregates for the company: summed infra, summed
# num_materias, summed infractor flag and the row count.
df = plsql.query(
    """{} select agno, sum(infra) as infra, sum(num_materias) as num_matters, 
sum(cast(infractor as int)) as infractor, count(*)
from raw.inspections_complete where rutempresamask = '{}' group by agno order by agno;""".format(role, company),
    engine,
)

In [None]:
# Yearly row count (default colour) with the yearly 'infractor' sum overlaid
# in red.
(
    ggplot(df, aes(x='agno', y='count'))
    + geom_point()
    + geom_line()
    + geom_point(aes(x='agno', y='infractor'), color='red')
    + geom_line(aes(x='agno', y='infractor'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + ggtitle("Number of inspections over time")
)

In [None]:
# Same aggregates as the yearly query, additionally broken down by region.
df = plsql.query(
    """{} select agno, region, sum(infra) as infra, sum(num_materias) as num_matters, 
sum(cast(infractor as int)) as infractor, count(*)
from raw.inspections_complete where rutempresamask = '{}' group by agno, region
order by agno, region;""".format(role, company),
    engine,
)

In [None]:
# One panel per region: yearly row count (default colour) and yearly
# 'infractor' sum (red), lines only.
(
    ggplot(df, aes(x='agno', y='count'))
    + geom_line()
    + geom_line(aes(x='agno', y='infractor'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + facet_wrap('~region')
    + ggtitle("Number of inspections over time")
)

In [None]:
# Per-year aggregates restricted to rows where infractor = 'true'.
df = plsql.query(
    """{} select agno, sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' and infractor = 'true' 
group by agno order by agno;""".format(role, company),
    engine,
)

In [None]:
# Yearly num_matters (default colour) with yearly infra overlaid in red.
(
    ggplot(df, aes(x='agno', y='num_matters'))
    + geom_point()
    + geom_line()
    + geom_point(aes(x='agno', y='infra'), color='red')
    + geom_line(aes(x='agno', y='infra'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + ggtitle("Number of matters over time for inspections that result in infractions")
)

We see that the number of inspections (and infractions) has decreased over time. Perhaps this company isn't as prevalent as it once was? Also, many inspections do not result in infractions. Maybe because this company transports passengers, it has many complaints.

In [None]:
# Per-year, per-region aggregates restricted to inspections flagged as
# infractor. The plot built from this frame is titled "...for inspections
# that result in infractions by region", so the same infractor = 'true'
# filter used by the non-regional query is applied here; without it this was
# an exact duplicate of the unfiltered per-year/region query.
df = plsql.query(
    """{} select agno, region, sum(infra) as infra, sum(num_materias) as num_matters, 
sum(cast(infractor as int)) as infractor, count(*)
from raw.inspections_complete where rutempresamask = '{}' and infractor = 'true'
group by agno, region
order by agno, region;""".format(role, company),
    engine,
)

In [None]:
# One panel per region: yearly num_matters (default colour) and yearly infra
# (red), lines only.
(
    ggplot(df, aes(x='agno', y='num_matters'))
    + geom_line()
    + geom_line(aes(x='agno', y='infra'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + facet_wrap('~region')
    + ggtitle("Number of matters over time for inspections that result in infractions by region")
)

In [None]:
# Per-year split of inspections into 'Por Programa' (proactive) and all
# other solicitante values (reactive), plus infra, num_matters and row count.
df = plsql.query(
    """{} select agno, sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
sum(case when solicitante = 'Por Programa' then 0 else 1 end) as reactive,
sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' 
group by agno order by agno;""".format(role, company),
    engine,
)

In [None]:
# Yearly totals together with the reactive and proactive breakdown.
# The colours are fixed per layer, so no legend is generated; the title
# therefore names all three series and their colours (the previous title
# mentioned only "total and proactive" although reactive is drawn too).
(
    ggplot(df, aes('agno', 'count'))
    + geom_point()
    + geom_line()
    + geom_point(aes('agno', 'reactive'), color='red')
    + geom_line(aes('agno', 'reactive'), color='red')
    + geom_point(aes('agno', 'proactive'), color='blue')
    + geom_line(aes('agno', 'proactive'), color='blue')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + ggtitle("Number of total (black), reactive (red), and proactive (blue) inspections over time")
)

This is really surprising! I expected that the large number of inspections would have been caused by customer complaints, because this seems to be a passenger transportation company. However, we see that the vast majority of inspections have been **proactive, not reactive** (with the exception of 2010). So, something else is happening here regarding the relationship between this company and DT.

In [None]:
# Per-region counts split by a 'proactive' / 'reactive' label derived from
# solicitante ('Por Programa' -> proactive, anything else -> reactive).
df = plsql.query(
    """{} select cast(region as int), 
(case when solicitante = 'Por Programa' then 'proactive' else 'reactive' end) as proactive,
sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' 
group by region, proactive order by region;""".format(role, company),
    engine,
)

In [None]:
# Side-by-side bars per region, filled by the proactive/reactive label.
(
    ggplot(df, aes(x='region', y='count', fill='proactive'))
    + geom_col(stat='identity', position='dodge')
    + scale_x_continuous(breaks=range(16))
    + theme_bw()
    + ggtitle("Number of Inspections by Region")
)

In [None]:
# Per-year, per-region proactive / reactive counts plus infra, num_matters
# and the row count.
df = plsql.query(
    """{} select agno, region, sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
sum(case when solicitante = 'Por Programa' then 0 else 1 end) as reactive,
sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' 
group by agno, region order by agno, region;""".format(role, company),
    engine,
)

In [None]:
# One panel per region with the yearly total, reactive and proactive counts.
# The colours are fixed per layer, so no legend is generated; the title
# therefore names all three series and their colours (the previous title
# mentioned only "total and proactive" although reactive is drawn too).
(
    ggplot(df, aes('agno', 'count'))
    + geom_line()
    + geom_line(aes('agno', 'reactive'), color='red')
    + geom_line(aes('agno', 'proactive'), color='blue')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + facet_wrap('~region')
    + ggtitle("Number of total (black), reactive (red), and proactive (blue) inspections over time by region")
)

In [None]:
# Per-year sums of the urgencia flag and derechofund, plus infra,
# num_matters and the row count.
df = plsql.query(
    """{} select agno, sum(cast(urgencia as int)) as urgencia, sum(derechofund) as derechofund,
sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' 
group by agno order by agno;""".format(role, company),
    engine,
)

In [None]:
# Yearly row count (default colour) with the yearly urgencia sum in red.
(
    ggplot(df, aes(x='agno', y='count'))
    + geom_point()
    + geom_line()
    + geom_point(aes(x='agno', y='urgencia'), color='red')
    + geom_line(aes(x='agno', y='urgencia'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + ggtitle("Inspections that are urgent over time")
)

In [None]:
# Per-year, per-region urgency aggregates. Rows are ordered by year only.
df = plsql.query(
    """{} select agno, region, sum(cast(urgencia as int)) as urgencia, sum(derechofund) as derechofund,
sum(infra) as infra, sum(num_materias) as num_matters, count(*)
from raw.inspections_complete where rutempresamask = '{}' 
group by agno, region order by agno;""".format(role, company),
    engine,
)

In [None]:
# One panel per region: yearly row count (default colour) and yearly
# urgencia sum (red).
(
    ggplot(df, aes(x='agno', y='count'))
    + geom_line()
    + geom_line(aes(x='agno', y='urgencia'), color='red')
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + facet_wrap('~region')
    + ggtitle("Inspections that are urgent over time")
)

In [None]:
# Frequency of each matter_code in cleaned.violations for the company,
# excluding code '99', most frequent first.
plsql.query(
    """{} select cast(matter_code as int), count(*)
from cleaned.violations where rutempresamask = '{}' and matter_code != '99' group by matter_code
order by count desc;""".format(role, company),
    engine,
)

The most common non-null violation code is 10, which corresponds with "JORNADA Y DESCANSOS" or "DAY AND BREAKS", which presumably is related to workers getting appropriate hours. Other common violations include "REMUNERATIONS" and "HYGIENE AND SECURITY". 

In [None]:
# Frequency of each matter_code in cleaned.complaints for the company,
# excluding code '99', most frequent first.
plsql.query(
    """{} select cast(matter_code as int), count(*)
from cleaned.complaints where rutempresamask = '{}' and matter_code != '99' group by matter_code
order by count desc;""".format(role, company),
    engine,
)

The matters most often complained about are "DAY AND BREAKS", "HYGIENE AND SECURITY" and "REMUNERATIONS". These are the same as the matters violated, but in a different order.

In [None]:
# Frequency of each matter_code in cleaned.inspected_matters for the
# company, excluding code '99', most frequent first.
plsql.query(
    """{} select cast(matter_code as int), count(*)
from cleaned.inspected_matters where rutempresamask = '{}' and matter_code != '99' group by matter_code
order by count desc;""".format(role, company),
    engine,
)

The most common inspected matters are "REMUNERATIONS", "THE WORKING DAY", "INDIVIDUAL WORK CONTRACT", and "PROTECTION OF LIFE AND HEALTH OF WORKERS". Across the violated, complained, and inspected matters, there is a focus on worker rights and conditions. This is consistent with my idea of a bus company, as I'd expect bus drivers to work odd hours with possibly short-term contracts.

In [None]:
# Year, ntrabajadores and actividadeconomica rows from raw.taxes for the
# company (matched on rutmask).
df = plsql.query(
    """{} select agno, ntrabajadores, actividadeconomica 
from raw.taxes where rutmask = '{}'""".format(role, company),
    engine,
)

In [None]:
# ntrabajadores per year, with the y axis pinned to 0..8700 so the series is
# shown against a zero baseline.
(
    ggplot(df, aes(x='agno', y='ntrabajadores'))
    + geom_point()
    + geom_line()
    + theme_bw()
    + theme(axis_text_x=element_text(angle=90))
    + ggtitle("Number of employees over time")
    + ylim((0, 8700))
)

The number of employees over time for this company gives results consistent with previous assumptions: it's a large company that employs many people, and it seems like its prevalence has been declining slightly over the past ten years (the company has shrunk by around 1000 employees).

In [None]:
# Time-series graphs for the company. The helper module is imported as
# `funsds` (the name `eds` is never defined in this notebook), and the call
# uses the same keyword arguments as the later time_graphs call.
funsds.time_graphs(engine=engine, id_company=company)

In [None]:
# calle and numero recorded in raw.taxes for the company.
plsql.query(
    "{} select calle, numero from raw.taxes where rutmask = '{}';".format(role, company),
    engine,
)

## Functions Data Story

In [None]:
# NOTE(review): this rebinds `role` from the full "set role ...;" statement
# used by the queries above to the bare role name. Re-running any earlier
# query cell after this one will build invalid SQL; consider a separate
# variable name for the bare role.
role = "direccion_trabajo_inspections_write"

# Masked company id passed to the funsds helpers below.
id_company = "a26f86c6ef9fbb212858c435ae79a9a54546b9043def5e12907852f6c05c0750"
id_company

In [None]:
# Facility information for the selected company via the data-story helper.
funsds.facility_info(
    engine=engine,
    id_company=id_company,
)

In [None]:
# Time-series graphs for the selected company via the data-story helper.
funsds.time_graphs(
    engine=engine,
    id_company=id_company,
)

In [None]:
# Fetch the matter table for the company through the helper; `role` here is
# the bare role name assigned in this section, not the "set role ...;"
# statement used earlier.
# NOTE(review): presumably returns a pandas DataFrame (.head() is called on
# it) — confirm against eda.functions_datastory.
tab_result = funsds.freq_matters_bookupdated(engine, role, id_company)
# Preview the first rows of the result.
tab_result.head()

In [None]:
# Pass the matter table from the previous cell to the word-cloud helper.
funsds.text_wordcloud_matters(tab_result)