Social Security 

In [None]:
import sys
sys.path.append('../')

import pipeline.sql as plsql
import pipeline.eda as pleda

import eda.functions_datastory as edads

import numpy as np
import pandas as pd
from plotnine import *

# Database engine built by the project's SQL helper from the config file one directory up.
engine = plsql.create_engine('../config.yaml')
# Role name handed to every `pleda` helper call in this notebook.
role = 'direccion_trabajo_inspections_write'

In [None]:
# Pull a three-row sample of cleaned.social only to learn which columns exist;
# `df.columns` is iterated over later in the notebook.
# The role is taken from the `role` variable of the setup cell instead of being
# repeated in the SQL text, so the role is switched in a single place.
qry = """set role {role};
    select *
    from cleaned.social
    limit 3;""".format(role=role)
df = plsql.query(qry, engine)
df.columns

In [None]:
# Row total for cleaned.social (presumably a count; whatever `pleda.total_rows` returns).
# The value is kept in `tt` and also left as the cell's last expression: an
# assignment on its own produces no cell output, so the result was never shown.
tt = pleda.total_rows(engine, role, 'cleaned', 'social')
tt

In [None]:
# Distinct values
# Run `pleda.count_distinct` on each selected *mask column of cleaned.social.
# `vars_selec` is read again by the null-count cell.
vars_selec = [
    'razonsocialmask',
    'rutempleadormask',
    'nombredelafiliadomask',
]
for column_name in vars_selec:
    distinct_result = pleda.count_distinct(engine, role, 'cleaned', 'social', column_name)
    print(distinct_result)

In [None]:
# Null check on the same columns; relies on `vars_selec` from the distinct-values cell.
for column_name in vars_selec:
    null_result = pleda.count_nulls(engine, role, 'cleaned', 'social', column_name)
    print(null_result)

In [None]:
# Compared to taxes
# Same distinct-value helper, applied to the two mask columns of raw.taxes.
for tax_column in ('razonsocialmask', 'rutmask'):
    print(pleda.count_distinct(engine, role, 'raw', 'taxes', tax_column))

In [None]:
# Compared to inspections
# Distinct-value helper applied to the company mask column of raw.inspections_complete.
inspections_distinct = pleda.count_distinct(
    engine, role, 'raw', 'inspections_complete', 'rutempresamask'
)
print(inspections_distinct)

In [None]:
# Null-proportion helper applied to the `administradora` column of the social table.
# NOTE(review): this reads schema 'raw', whereas the other social security cells
# read 'cleaned' — confirm that the raw table is the intended one here.
pleda.proportion_nulls(engine, role, 'raw', 'social', 'administradora')

In [None]:
# One `rows_by_group` call per column of the sample frame `df` (cleaned.social);
# for each column, print the dtypes of the result followed by the result itself.
for column_name in df.columns:
    group_rows = pleda.rows_by_group(engine, role, 'cleaned', 'social', column_name)
    print(group_rows.dtypes)
    print(group_rows)

In [None]:
# Single-row heatmap for cleaned.social: one tile per `column_name`, coloured by
# the `proportion` column returned by `proportion_nulls_all_columns`.
df_propnull = pleda.proportion_nulls_all_columns(engine, role, 'cleaned', 'social')
# y is the constant '1' so that every tile lands on the same row.
tile_mapping = aes(x='column_name', y='1', fill='proportion')
# Vertical tick labels keep the column names readable on a wide, flat figure.
tile_theme = theme(axis_text_x=element_text(angle=90), figure_size=(7, 2))
(
    ggplot(df_propnull, tile_mapping)
    + geom_tile()
    + tile_theme
    + ggtitle("Social Security Dataset")
)

In [None]:
# Single-row heatmap for raw.taxes: one tile per `column_name`, coloured by
# the `proportion` column returned by `proportion_nulls_all_columns`.
df_propnull = pleda.proportion_nulls_all_columns(engine, role, 'raw', 'taxes')
# y is the constant '1' so that every tile lands on the same row.
tile_mapping = aes(x='column_name', y='1', fill='proportion')
# Vertical tick labels keep the column names readable on a wide, flat figure.
tile_theme = theme(axis_text_x=element_text(angle=90), figure_size=(6, 2))
(
    ggplot(df_propnull, tile_mapping)
    + geom_tile()
    + tile_theme
    + ggtitle("Taxes Dataset")
)

In [None]:
# Single-row heatmap for raw.inspections_complete: one tile per `column_name`,
# coloured by the `proportion` column returned by `proportion_nulls_all_columns`.
df_propnull = pleda.proportion_nulls_all_columns(engine, role, 'raw', 'inspections_complete')
# y is the constant '1' so that every tile lands on the same row.
tile_mapping = aes(x='column_name', y='1', fill='proportion')
# Vertical tick labels keep the column names readable on a wide, flat figure.
tile_theme = theme(axis_text_x=element_text(angle=90), figure_size=(10, 2))
(
    ggplot(df_propnull, tile_mapping)
    + geom_tile()
    + tile_theme
    + ggtitle("Inspections Complete Dataset")
)

In [None]:
# Show the null-proportion row for the `fechapago` column.
# NOTE(review): when the notebook runs top to bottom, `df_propnull` holds the
# raw.inspections_complete result at this point; if `fechapago` belongs to a
# different dataset this filter returns no rows — confirm which table was meant.
is_fechapago = df_propnull['column_name'] == 'fechapago'
df_propnull.loc[is_fechapago]

Joining tables

In [None]:
def rutmask_fun(mask, schema, table):
    """Count the rows of ``schema.table`` for each distinct value of ``mask``.

    Runs ``select <mask>, count(*) from <schema>.<table> group by <mask>``
    under the notebook's database role, using the module-level ``engine``.

    Parameters
    ----------
    mask : str
        Name of the column to group by.
    schema : str
        Schema that holds the table.
    table : str
        Table to aggregate.

    Returns
    -------
    Whatever ``plsql.query`` returns for the statement; the rest of the
    notebook uses it as a DataFrame (``.shape``, ``.merge``) with the
    ``mask`` column and a count per group.

    Notes
    -----
    The three arguments are interpolated into the SQL text as identifiers,
    so only pass trusted, hard-coded names — never user-supplied input.
    """
    # Use the `role` variable of the setup cell rather than repeating the role
    # name, so changing the role there also applies to these queries.
    qry = """set role {role};
    select {mask}, count(*)
    from {schema}.{table}
    group by {mask};""".format(role=role, mask=mask, schema=schema, table=table)
    return plsql.query(qry, engine)

In [None]:
# Group-by counts for every candidate join key, one result per (table, column).
# The queries run in the order the columns are listed.

# Social
massoc_soc, masemp_soc, masafi_soc, masnom_soc = (
    rutmask_fun(social_key, 'cleaned', 'social')
    for social_key in (
        'razonsocialmask',
        'rutempleadormask',
        'rutafiliadomask',
        'nombredelafiliadomask',
    )
)

# Taxes
massoc_tax, masrut_tax = (
    rutmask_fun(tax_key, 'raw', 'taxes')
    for tax_key in ('razonsocialmask', 'rutmask')
)

# Inspections
masemp_insp = rutmask_fun(mask='rutempresamask', schema='raw', table='inspections_complete')

In [None]:
# Shape of each social security group-by result; the row count is the number of
# distinct key values in that column (a NULL group, if present, counts as one).
for key_counts in (massoc_soc, masemp_soc, masafi_soc, masnom_soc):
    print(key_counts.shape)

Three datasets joined

In [None]:
# Keys found in all three sources: social security employers matched to taxes,
# then that result matched to inspections. Both joins are inner joins, which is
# pandas' default `how`, spelled out here.
# NOTE(review): pandas merge matches null keys with each other (unlike a SQL
# join); if the group-by results hold a NULL group on both sides it will appear
# as a match — confirm whether these columns contain nulls.
tt = pd.merge(
    masemp_soc,
    masrut_tax,
    how='inner',
    left_on='rutempleadormask',
    right_on='rutmask',
)
tt2 = pd.merge(
    tt,
    masemp_insp,
    how='inner',
    left_on='rutmask',
    right_on='rutempresamask',
)

In [None]:
# Size of the three-way inner join built in the previous cell (rows = keys matched in all three tables).
tt2.shape

Pairs merged

In [None]:
# Social - Taxes
# Inner-join every social security key column against the taxes
# `razonsocialmask` values and print the shape of each overlap.
social_frames_and_keys = (
    (massoc_soc, 'razonsocialmask'),
    (masemp_soc, 'rutempleadormask'),
    (masafi_soc, 'rutafiliadomask'),
    (masnom_soc, 'nombredelafiliadomask'),
)
for social_counts, social_key in social_frames_and_keys:
    overlap = social_counts.merge(massoc_tax, left_on=social_key, right_on='razonsocialmask')
    print(overlap.shape)

In [None]:
# Same comparison against the taxes `rutmask` values: inner-join each social
# security key column and print the shape of the overlap.
social_frames_and_keys = (
    (massoc_soc, 'razonsocialmask'),
    (masemp_soc, 'rutempleadormask'),
    (masafi_soc, 'rutafiliadomask'),
    (masnom_soc, 'nombredelafiliadomask'),
)
for social_counts, social_key in social_frames_and_keys:
    overlap = social_counts.merge(masrut_tax, left_on=social_key, right_on='rutmask')
    print(overlap.shape)

In [None]:
# Social - Inspections
# Inner-join every social security key column against the inspections
# `rutempresamask` values and print the shape of each overlap.
social_frames_and_keys = (
    (massoc_soc, 'razonsocialmask'),
    (masemp_soc, 'rutempleadormask'),
    (masafi_soc, 'rutafiliadomask'),
    (masnom_soc, 'nombredelafiliadomask'),
)
for social_counts, social_key in social_frames_and_keys:
    overlap = social_counts.merge(
        masemp_insp,
        left_on=social_key,
        right_on='rutempresamask',
        how='inner',
    )
    print(overlap.shape)

In [None]:
# Inspections - Taxes: size of each side first, then of the two inner joins
# (inspections company mask against the taxes RUT mask and against the taxes
# business-name mask).
print(masemp_insp.shape)
print(masrut_tax.shape)

insp_vs_tax_rut = masemp_insp.merge(
    masrut_tax, left_on='rutempresamask', right_on='rutmask', how='inner'
)
print(insp_vs_tax_rut.shape)

insp_vs_tax_name = masemp_insp.merge(
    massoc_tax, left_on='rutempresamask', right_on='razonsocialmask', how='inner'
)
print(insp_vs_tax_name.shape)

Story of one company

In [None]:
# Masked identifier of the company examined in the cells that follow.
# Other candidates, left commented out (two are looked up in the three-way join `tt2`):
#   tt2["rutmask"][1802]
#   id_company = 'ab19b49ba5d811743b7c2239014ef41c9164fa0b931e950f6358a00925791597'
#   id_company = tt2["rutmask"][9262]
id_company = '4b2b6b965944e652b08af06b031e020ce5dc4b66598a18c53e0ca9e8a9fa1322'
# Echo the chosen identifier as the cell output.
id_company

In [None]:
# Data-story helper `facility_info` for the selected company; its result is the cell output.
edads.facility_info(engine, id_company)

In [None]:
# Data-story helper `time_graphs` for the selected company (presumably plots over time — see eda/functions_datastory).
edads.time_graphs(engine, id_company)

In [None]:
# Data-story helper `economicactivity_info` for the selected company; its result is the cell output.
edads.economicactivity_info(engine, id_company)