In [None]:
import sys
sys.path.append('../')

import pipeline.sql as plsql
import pipeline.eda as pleda

from plotnine import *
import pandas as pd

# Make plotnine's black-and-white theme the default for every plot below.
theme_set(theme_bw())

In [None]:
# Database engine created by the project helper from '../config.yaml'.
engine = plsql.create_engine('../config.yaml')
# Role name interpolated into the `set role {};` prefix of the templated queries.
role = 'direccion_trabajo_inspections_write'

## Infractions by inspection

In [None]:
# Per-year row count and number of rows with at least one infraction, taken
# from the cleaned table and restricted to rows with a non-negative `infra`.
# The WHERE clause has to precede GROUP BY, and the boolean `infra > 0` has to
# be cast to int before it can be summed.
qry = """set role direccion_trabajo_inspections_write;
    select agno, count(*) as count,
        sum( cast(infra > 0 as int) ) as inspections
    from cleaned.inspections_se
    where infra >= 0
    group by agno
    order by inspections desc;"""

In [None]:
# Per-year number of inspections and number of inspections with at least one
# infraction (`infra > 0`), read from raw.inspections_complete.
qry = """set role direccion_trabajo_inspections_write;
    select agno, count(*) as inspections,
        sum( cast(infra > 0 as int)  ) as infractions
    from raw.inspections_complete
    group by agno;"""

In [None]:
# Share of inspections per year that found at least one infraction.
tab = plsql.query(qry, engine)
tab['prop'] = tab['infractions']/tab['inspections']
# Scale to percent first and round afterwards. Rounding to two decimals and
# then multiplying leaves float artefacts (100 * 0.29 == 28.999...), which the
# integer cast would truncate to the wrong label ("28%").
tab['prop_rnd'] = (100*tab['prop']).round().astype('int').map(str) + "%"

tab

In [None]:
# Bar chart of the yearly violation rate, with the rounded percentage printed
# slightly above each bar.
(ggplot(tab, aes('agno', 'prop')) +
    geom_bar(stat = 'identity', alpha = .5) +
    geom_text(aes(y = 'prop + .04',
                  label = 'prop_rnd'),
              size = 10) +
    scale_x_continuous(breaks = range(2005, 2017)) +
    ylab('Violations/Inspections') +
    xlab('Year') +
    theme(figure_size = (6, 3.5)))

In [None]:
# Overall (all years together) number of inspections and number of inspections
# with at least one infraction, read from raw.inspections_complete.
qry = """set role direccion_trabajo_inspections_write;
    select count(*) as inspections,
        sum( cast(infra > 0 as int)  ) as infractions
    from raw.inspections_complete;"""

In [None]:
# Run the overall query and attach the infraction rate as a `prop` column.
tab = plsql.query(qry, engine).assign(
    prop=lambda counts: counts['infractions']/counts['inspections']
)
tab

## Precision and Recall Functions

In [None]:
def avg_fun(x):
    """Return the mean of the `value` column as a one-element Series.

    Meant to be passed to `DataFrame.groupby(...).apply`, so that each group
    collapses to a single entry labelled 'average'.
    """
    mean_value = x['value'].mean()
    return pd.Series([mean_value], index=['average'])

In [None]:
def gg_tab_modelid(tab_gg, type_gg, type_model, label_name,
                   metric_label='Precision'):
    """Build four line plots of a metric against the population cutoff.

    Parameters
    ----------
    tab_gg : DataFrame
        Must provide the columns used below: 'popul', 'value', 'model_id',
        'last_model' (boolean mask), 'metric' and 'type'.
    type_gg : str
        Text shown in parentheses on the x axis (e.g. 'Percentage').
    type_model : str
        First line of every plot title.
    label_name : str
        Second line of every plot title.
    metric_label : str, optional
        Name of the plotted metric, used for the y-axis labels. Defaults to
        'Precision', which reproduces the labels that used to be hard-coded;
        pass e.g. 'Recall' when `tab_gg` holds recall values.

    Returns
    -------
    tuple
        (gg1, gg2, gg3, gg4): one line per model coloured by model id; the
        same lines with the last model highlighted; the last model only; and
        the average over models per ('metric', 'popul', 'type') group.
    """
    def _decorate(plot, y_label):
        # Axis labels, title and figure size shared by all four plots.
        return (plot +
                ylab(y_label) +
                xlab('Population\n({})'.format(type_gg)) +
                ggtitle(type_model + '\n' + label_name) +
                theme(figure_size = (5, 3)))

    gg1 = _decorate(
        ggplot(tab_gg,
               aes('popul', 'value',
                   color = 'model_id',
                   group = 'model_id')) +
        geom_line(alpha = .9),
        metric_label)

    # Grey for every model except the last one, which is drawn in pink.
    gg2 = _decorate(
        ggplot(tab_gg,
               aes('popul', 'value',
                   color = 'last_model',
                   group = 'model_id')) +
        geom_line() +
        scale_color_manual(values = ("#d8d8d8", '#e91d63'),
                           guide = False),
        metric_label)

    gg3 = _decorate(
        ggplot(tab_gg[tab_gg['last_model']],
               aes('popul', 'value',
                   group = 'model_id')) +
        geom_line(color = "#2c3571", size = 2),
        metric_label + ' Last ')

    # Average the metric over all models for each cutoff.
    tab_avg = tab_gg.groupby(['metric', 'popul', 'type']).apply(avg_fun).reset_index()
    gg4 = _decorate(
        ggplot(tab_avg,
               aes('popul', 'average',
                   group = 'type')) +
        geom_line(color = "#2c3571", size = 2),
        metric_label + ' Avg')

    return gg1, gg2, gg3, gg4

In [None]:
def describe_model(model_id, label_name):
    """Print a summary of a model group and its recall-vs-population plots.

    Parameters
    ----------
    model_id : int or numeric str
        Matched against `results.models.model_group_id`, so despite its name
        it selects a model *group*.
    label_name : str
        Second title line passed on to `gg_tab_modelid`.

    Raises
    ------
    ValueError
        If `model_id` cannot be converted to an integer.

    Prints the model type(s), the span between the first and last `run_time`,
    the number of models, and the four plots returned by `gg_tab_modelid` for
    the 'recall_one' metric at percentage cutoffs.
    """
    # Only a plain integer is ever interpolated into the SQL text.
    group_id = int(model_id)

    qry = """set role {};
        select *
        from results.models
        where model_group_id = {};""".format(role, group_id)
    tab_gpmod = plsql.query(qry, engine)

    model_types = str(tab_gpmod.model_type.unique())
    print('Model type: ' + model_types)
    print('Running time: ' + str(tab_gpmod.run_time.max() - tab_gpmod.run_time.min()))
    print('Models: ' + str(len(tab_gpmod.model_id)))

    qry = """set role {};
        select *
        from results.evaluations;""".format(role)
    df = plsql.query(qry, engine)
    # The inner merge keeps only evaluations of models in this group and
    # already attaches the model columns. A second merge with `tab_gpmod`
    # would only duplicate them under `_x`/`_y` suffixes, so it is not done.
    tab_met = df.merge(tab_gpmod, how='inner', on='model_id').reset_index(drop = True)

    # Metric keys look like '<metric>|<cutoff>_<type>'; split them into columns.
    tab_met['metric'], tab_met['popul'] = zip(*tab_met['metric'].map(lambda x: x.split('|')))
    tab_met['popul'], tab_met['type'] = zip(*tab_met['popul'].map(lambda x: x.split('_')))
    tab_met['popul'] = tab_met['popul'].astype('float')
    tab_met['last_model'] = (tab_met['model_id'] == tab_met['model_id'].max())

    # Only the percentage-cutoff plots are displayed, so the absolute-cutoff
    # plots (previously built and then discarded) are no longer constructed.
    # NOTE(review): the plots are labelled 'Precision' by gg_tab_modelid
    # although the metric selected here is 'recall_one'.
    recall_pct = tab_met[(tab_met['type'] == 'pct') &
                         (tab_met['metric'] == 'recall_one')]
    plots = gg_tab_modelid(tab_gg = recall_pct,
                           type_gg = 'Percentage',
                           type_model = model_types,
                           label_name = label_name)
    for plot in plots:
        print(plot)

In [None]:
# Load every row of results.model_groups under the configured role and display it.
qry = """set role {}; 
        select * from results.model_groups;
        """.format(role)
tab_model_group = plsql.query(engine=engine, qry=qry)
tab_model_group

In [None]:
# Inspect one cell of the model-groups table by position (third row, third column).
tab_model_group.iloc[2,2]

In [None]:
# Summary and plots for model group 8 (`model_id` is matched against model_group_id).
describe_model(model_id = '8', label_name = 'P(violation|inspection)')

In [None]:
# Summary and plots for model group 4 (`model_id` is matched against model_group_id).
describe_model(model_id = '4', label_name = 'P(violation|inspection)')

In [None]:
# Summary and plots for model group 1 (`model_id` is matched against model_group_id).
describe_model(model_id = '1', label_name = 'P(inspection)')

In [None]:
# Summary and plots for model group 6 (`model_id` is matched against model_group_id).
describe_model(model_id = '6', label_name = 'P(inspection)')

In [None]:
# NOTE(review): as far as this notebook shows, `df` is only assigned at
# notebook level in later cells (inside describe_model it is a local), so a
# fresh top-to-bottom run would raise NameError here.
df.dtypes

In [None]:
# NOTE(review): relies on a notebook-level `df` that, as far as this notebook
# shows, is only created by later cells.
df.head()

In [None]:
# Rows for the 'precision' metric at absolute cutoffs.
# NOTE(review): `tab_met` is only assigned at notebook level in a later cell
# (inside describe_model it is a local), so this fails on a fresh run.
tab_met[(tab_met['type'] == 'abs') & 
        (tab_met['metric'] == 'precision')]

In [None]:
# Notebook-level version of the loading done inside describe_model, for model
# group 3, so that `df` and `tab_met` can be inspected interactively.
model_id = '3'
qry = """set role {};
        select *
        from results.models
        where model_group_id = {};""".format(role, model_id)
tab_gpmod = plsql.query(qry, engine)

qry = """set role {};
    select *
    from results.evaluations;""".format(role)
df = plsql.query(qry, engine)
# The inner merge keeps only evaluations of models in this group and attaches
# the model columns.
df = df.merge(tab_gpmod, how='inner', on='model_id').reset_index(drop = True)

# Metric keys look like '<metric>|<cutoff>_<type>'; split them into columns.
df['metric'], df['popul'] = zip(*df['metric'].map(lambda x: x.split('|')))
df['popul'], df['type'] = zip(*df['popul'].map(lambda x: x.split('_')))
df['popul'] = df['popul'].astype('float')

# `df` already carries the model columns, so merging with `tab_gpmod` a second
# time would only duplicate them under `_x`/`_y` suffixes. Flag the rows of
# the highest model id instead of re-merging.
tab_met = df.assign(last_model = (df['model_id'] == df['model_id'].max()))

In [None]:
# Preview the merged evaluations/models table.
tab_met.head()

In [None]:
# Mean `value` for every (metric, cutoff, cutoff type) combination.
grouped_metrics = tab_met.groupby(['metric', 'popul', 'type'])
tab_avg = grouped_metrics.apply(avg_fun).reset_index()

# One line through all averages; every row is placed in the same group.
avg_mapping = aes('popul', 'average', group = '1')
avg_line = geom_line(color = "#2c3571", size = 2)
gg4 = ggplot(tab_avg, avg_mapping) + avg_line + ylab('Precision Avg')

In [None]:
# Display the average plot built in the previous cell.
gg4

In [None]:
import pipeline.metrics as plmet

# Pull the score and label columns out of `tab` for the metric helpers below.
# NOTE(review): on a top-to-bottom run `tab` is still the overall
# inspections/infractions summary at this point, which has no `score` or
# `label_value` column; the predictions table is only loaded into `tab` near
# the end of the notebook. Presumably this cell was run after that one --
# confirm and reorder.
scores = tab.score
y_true = tab.label_value

In [None]:
# Pair each score with its label and order the rows from highest to lowest score.
df = pd.DataFrame(data = {'scores' : scores, 'y_true' : y_true})
df = df.sort_values('scores', ascending=False)
df = df.reset_index(drop = True)
df.head()

In [None]:
# Result of plmet.precision for a cutoff of 9000 with 'threshold_k' /
# 'pessimist' settings. Note that this rebinds `tab`, replacing the table it
# held before.
tab = plmet.precision(scores, y_true, 9000,'threshold_k', 'pessimist')

In [None]:
# Show the value returned by plmet.precision in the previous cell.
tab

In [None]:
import numpy as np

threshold = 4000
class_type = "threshold_k"

# Step for an absolute-cutoff range (not used by the loop below).
step = np.floor(threshold/100)

# Percentage cutoffs from 0.1 to 1.0.
pct_rng = list(np.arange(.1, 1.1, 0.1))

# Cutoff values keyed by cutoff type.
cuts = { 'pct': pct_rng}

# Metric values keyed by '<metric>|<cutoff>_<type>'.
all_metrics = dict()

# (key prefix, metric function, mode) in the order the keys are inserted.
metric_specs = (
    ("precision_one", plmet.precision, 'optimist'),
    ("recall_one", plmet.recall, 'optimist'),
    ("precision_zero", plmet.precision, 'pessimist'),
    ("recall_zero", plmet.recall, 'pessimist'),
    ("fallout_one", plmet.fallout, 'optimist'),
    ("fallout_zero", plmet.fallout, 'pessimist'),
)

for x_type, x_values in cuts.items():
    for x_value in x_values:
        # Translate the cutoff value into the number used by the metric helpers.
        cutoff = plmet.generate_cutoff_at_x(scores, x_value, unit = x_type)
        print(cutoff)

        key_suffix = "{}_{}".format(str(x_value), x_type)
        for metric_name, metric_fn, mode in metric_specs:
            metric_key = "{}|{}".format(metric_name, key_suffix)
            all_metrics[metric_key] = metric_fn(scores, y_true, cutoff, class_type, mode)
        

In [None]:
import pandas as pd
# One row per computed metric: its key and its value.
metric_keys = list(all_metrics)
df = pd.DataFrame({ "metric" : metric_keys,
                    "scores" : [all_metrics[key] for key in metric_keys]})

In [None]:
# Preview the metric table before its keys are split into columns.
df.head()

In [None]:
# Keys look like '<metric>|<cutoff>_<type>': peel off the metric name first,
# then separate the cutoff value from its type.
metric_parts = df['metric'].map(lambda key: key.split('|'))
df['metric'], df['popul'] = zip(*metric_parts)
cutoff_parts = df['popul'].map(lambda cut: cut.split('_'))
df['popul'], df['type'] = zip(*cutoff_parts)
df['popul'] = df['popul'].astype('float')

In [None]:
# Preview the metric table after the keys were split into metric / popul / type.
df.head()

In [None]:
# One line per metric across the percentage cutoffs. The table mixes
# precision, recall and fallout, so the y axis gets a neutral label. The x
# label names the cutoff unit directly instead of showing a literal,
# never-formatted '({})' placeholder.
(ggplot(df, aes('popul', 'scores',
                group = 'metric',
                color = 'metric')) +
    geom_line() +
    ylab('Metric value') +
    xlab('Population\n(Percentage)') +
    theme(figure_size = (5, 3)))

In [None]:
# Load the predictions stored for one specific matrix and preview them.
# The statement now ends with a single semicolon (the original had a stray
# second one).
qry = """set role direccion_trabajo_inspections_write;
    select * from results.predictions where
    matrix_uuid = 'a19c668d-573f-4a45-3768-eaeba20ab3cb';"""
tab = plsql.query(qry, engine)
tab.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the predicted scores.
# The same histogram used to be drawn twice on one axes: once with plt.hist
# and once with the deprecated sns.distplot, using identical bins. A single
# draw on an explicit axes gives the same picture without the duplicate layer.
fig, ax = plt.subplots()
ax.hist(tab['score'], bins = int(180/5), edgecolor = 'black')
ax.set_title('Histogram of Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Score distribution split by true label. The predictions table exposes the
# column as `score` (the name used by the other cells that read it), not
# `scores`.
(ggplot(tab, aes(x = 'score') ) +
    geom_histogram(bins = 30) +
    facet_wrap('~label_value'))