# Orizon: Raio-X

Vamos entender como os serviços se relacionam entre si, computando a frequência relativa de cada um deles por ID conta e a frequência de agrupamentos de serviços por ID conta.

Serão utilizadas Regras de Associação, com o agrupamento feito por um algoritmo baseado no Eclat (Equivalence Class Transformation).

In [1]:
import pandas as pd
import numpy as np
import pickle
from itertools import combinations
from IPython.display import display
from collections import OrderedDict

Lendo os dados para a dataframe:

In [2]:
# Master service code of this extract. The extract file name embeds the same
# code, so the path is derived from `master` to keep the two from drifting apart.
master = 30735068
# NOTE(review): `error_bad_lines` / `warn_bad_lines` were deprecated in
# pandas 1.3 and removed in pandas 2.0; on newer pandas use
# `on_bad_lines="skip"` instead.
df = pd.read_csv("../full_extracts/data_{}.csv".format(master), delimiter="\t", encoding="ISO-8859-1", error_bad_lines=False, warn_bad_lines=False)

  interactivity=interactivity, compiler=compiler, result=result)


Pré-Processamento do dataset completo e seleção dos dados que iremos utilizar.

In [3]:
# First pass: turn unparseable service codes into NaN so they can be dropped.
df["servico"] = pd.to_numeric(df["servico"], downcast="unsigned", errors="coerce")
df.drop_duplicates(inplace=True)
df.dropna(subset=["servico"], axis=0, inplace=True)
# Second pass: with the NaN rows gone, the column can be downcast to an
# unsigned integer type.
df["servico"] = pd.to_numeric(df["servico"], downcast="unsigned", errors="coerce")
df["id_conta"] = pd.to_numeric(df["id_conta"], downcast="unsigned", errors="coerce")
df["qtde"] = pd.to_numeric(df["qtde"], downcast="unsigned", errors="coerce")
df["valor"] = pd.to_numeric(df["valor"], downcast="float", errors="coerce")
df["tipo_item"] = df["tipo_item"].astype("category")
df["descricao_despesa"] = df["descricao_despesa"].astype("category")
# Keep only the columns used by the analysis, then release the full extract.
selected_df = pd.concat(
    [df[column] for column in ["id_conta", "servico", "descricao_despesa", "tipo_item", "qtde", "valor"]],
    axis=1,
)
del df

In [4]:
# Display the dtype of each selected column.
selected_df.dtypes

id_conta               uint32
servico                uint64
descricao_despesa    category
tipo_item            category
qtde                  float64
valor                 float32
dtype: object

Vamos trabalhar apenas com os id_conta atrelados a mais de um serviço:

In [5]:
# Number of non-null `servico` rows per id_conta (a service that appears on
# several rows of the same account is counted once per row).
count = selected_df.groupby("id_conta")["servico"].count()

In [6]:
# Preview the first ten per-account counts.
count.head(10)

id_conta
1133618327     67
1133642991     86
1133672557      4
1133672799      4
1133674630     60
1133697412     78
1133710255     72
1133710421     65
1133762244     78
1133776771    106
Name: servico, dtype: int64

In [7]:
def get_count(acc_id):
    '''Look up the per-account service count for one account.

    Parameters:
    -----------
    Input:
        acc_id (integer): account id, used as a label into the global
            `count` Series
    Return:
        The entry of `count` stored under `acc_id`.
    '''
    services_for_account = count.loc[acc_id]
    return services_for_account

In [8]:
%time selected_df["count"] = selected_df["id_conta"].map(get_count)

CPU times: user 16.2 s, sys: 9.41 ms, total: 16.2 s
Wall time: 16.3 s


In [9]:
# Keep only the rows whose account has a count greater than 1, then drop the
# helper "count" column.
selected_df = selected_df[selected_df["count"] > 1].drop("count", axis=1)

### Início da implementação do algoritmo Eclat de Regra de Associação:

In [10]:
def support(service,dataframe):
    """ Count the accounts that contain every service of an itemset.

    The accounts having the first service are collected, then narrowed down
    service by service to those that also have the next one. The number of
    distinct accounts left is the (absolute) support of the itemset.

    Parameters:
    -------------------
    Inputs:
        service (sequence): the service codes forming the itemset
        dataframe (pd.DataFrame): frame with the columns "servico" and "id_conta"
    Returns:
        (int) number of distinct "id_conta" values that have a row for each
        one of the given services. An empty itemset is contained in every
        account, so the number of distinct accounts is returned for it.
    """
    services = list(service)
    if not services:
        return int(dataframe["id_conta"].nunique())

    accounts = None
    for item in services:
        mask = dataframe["servico"] == item
        if accounts is not None:
            # only accounts that survived the previous services stay eligible
            mask &= dataframe["id_conta"].isin(accounts)
        accounts = dataframe.loc[mask, "id_conta"].unique()
        if len(accounts) == 0:
            # no account has all services seen so far; it cannot recover
            return 0
    return int(len(accounts))

In [11]:
def drop_comb(elements,group):
    """ Tell whether a combination may be kept with respect to one unwanted set.

    Parameters:
    -------------------
    Inputs:
        elements (np.array): the unwanted elements
        group (tuple): combination of elements being checked
    Returns:
        False when the number of members of `group` found in `elements`
        equals the number of unwanted elements (the combination holds all of
        them), True otherwise.
    """
    matched = 0
    for member in group:
        if member in elements:
            matched += 1
    return matched != len(elements)

In [12]:
def combinations_test(iterable, r,drops):
    """ Generate the r-length combinations of `iterable`, skipping unwanted ones.

    Combinations are produced by `itertools.combinations` (same order as the
    hand-written copy of its recipe used previously). A combination is
    yielded only if `drop_comb` accepts it for every entry of `drops`, i.e.
    it does not hold all the elements of any unwanted group.

    Parameters:
    --------------------
    Inputs:
        iterable (list): elements to combine
        r (integer): size of each combination
        drops (list): groups of elements that must not appear together
    Returns:
        Generator of tuples with the accepted combinations. With an empty
        `drops` every combination is yielded (the previous implementation
        yielded nothing in that case). Nothing is yielded when `r` is larger
        than the number of elements.
    """
    pool = tuple(iterable)
    if r > len(pool):
        return
    for comb in combinations(pool, r):
        # all() stops at the first unwanted group the combination contains
        if all(drop_comb(drop, comb) for drop in drops):
            yield comb

In [13]:
def eclat(servicos,dataframe,r=1,ids_drop=None,min_sup=10):
    ''' Compute the support of the r-length combinations of services and keep
        those whose support is strictly greater than min_sup.
        Parameters:
        -----------
        Inputs:
            servicos (list): list of services to combine
            dataframe (pandas.DataFrame): frame handed to `support` to count accounts
            r (int): size of the combinations (1 evaluates each service alone)
            ids_drop (list): combinations of services not wanted together;
                None (the default) means no restriction
            min_sup (integer): minimum support
        Return:
            ids (list): services (r == 1) or tuples of services that passed min_sup
            serv_list (list): support of each entry of ids
            new_ids_drop (list): for r == 1 the received ids_drop, otherwise
                the combinations evaluated in this call that did not pass min_sup
    '''
    # A fresh list per call: a `[]` default would be shared between calls and
    # is handed back to the caller in the r == 1 case.
    if ids_drop is None:
        ids_drop = []
    serv_list = []
    ids = []
    # first case, calculating each service's support and keeping it if higher than min_sup
    if r == 1:
        for service in servicos:
            supp = support([service], dataframe)
            if supp > min_sup:
                serv_list.append(supp)
                ids.append(service)
        return ids, serv_list, ids_drop

    # other cases, combining the services while skipping the unwanted groups
    if len(ids_drop) > 0:
        candidates = combinations_test(servicos, r, ids_drop)
    else:
        candidates = combinations(servicos, r)

    new_ids_drop = []
    for service in candidates:
        supp = support(service, dataframe)
        if supp > min_sup:
            serv_list.append(supp)
            ids.append(service)
        else:
            new_ids_drop.append(service)
    return ids, serv_list, new_ids_drop
    

In [14]:
def eclat_iter(servicos,dataframe,min_sup):
    ''' Run eclat for growing combination sizes and concatenate the results.
        Parameters:
        -----------
        Inputs:
            servicos (list): list of all services to use eclat
            dataframe (pandas.DataFrame): dataframe containing the services and accounts
            min_sup (integer): minimum support
        Return:
            final_id (list): single services followed by the combinations kept at each size
            final_serv (list): supports matching the entries of final_id
    '''
    # size 1: every service on its own
    candidates, supports, rejected = eclat(servicos,dataframe,min_sup=min_sup)
    all_ids = candidates
    all_supports = supports

    size = 2
    while True:
        level_ids, level_supports, rejected = eclat(servicos=candidates,dataframe=dataframe,ids_drop=rejected,r=size,min_sup=min_sup)
        all_ids = all_ids + level_ids
        all_supports = all_supports + level_supports
        size += 1
        # flatten the kept combinations into the distinct services that feed the next size
        candidates = []
        for combo in level_ids:
            for item in combo:
                if item not in candidates:
                    candidates.append(item)
        # stop once a size produced only one or zero combinations
        if len(level_supports) < 2:
            break
    return all_ids, all_supports


    

Retornando o número de contas equivalente a 10% do total de contas, para utilizar como suporte mínimo:

In [15]:
# Minimum support: 10% of the number of accounts that contain the master
# service, truncated to an integer.
min_sup = int(support([master], selected_df) * 0.1)
min_sup

1297

### Eclat por grupo de itens:

Dividindo o dataframe por grupo de itens:

In [16]:
# One (tipo_item, sub-dataframe) pair per item type.
group_items = [(item_type, frame) for item_type, frame in selected_df.groupby('tipo_item')]

Realizando o eclat para cada grupo de itens e imprimindo o último agrupamento e seu suporte para se ter uma noção do resultado.

In [17]:
group_ids = list(np.zeros([len(group_items),1]))
group_basket = list(np.zeros([len(group_items),1]))
for i in range(len(group_items)):
    print(group_items[i][0])
    %time group_ids[i],group_basket[i] = eclat_iter(group_items[i][1].servico.unique(),group_items[i][1],min_sup)
    if len(group_ids[i])==0:
        print('Sem agrupamento')
    else:
        print(group_ids[i][len(group_ids[i])-1])
        print(group_basket[i][len(group_ids[i])-1]*(100/support([master],selected_df)))

DIARIAS
CPU times: user 475 ms, sys: 1.06 ms, total: 476 ms
Wall time: 481 ms
Sem agrupamento
GASES MEDICINAIS
CPU times: user 642 ms, sys: 46 µs, total: 642 ms
Wall time: 645 ms
Sem agrupamento
MATERIAIS
CPU times: user 40.8 s, sys: 370 ms, total: 41.2 s
Wall time: 42 s
70705348
14.998073217726397
MEDICAMENTOS
CPU times: user 11.6 s, sys: 61 ms, total: 11.6 s
Wall time: 11.8 s
(90008332, 90003551, 90196031)
11.421965317919074
OPME
CPU times: user 6.21 s, sys: 933 µs, total: 6.21 s
Wall time: 6.25 s
Sem agrupamento
PROCEDIMENTO
CPU times: user 1.88 s, sys: 9.23 ms, total: 1.89 s
Wall time: 1.92 s
(30735068, 30735033, 30735092, 30735084)
10.504816955684008
TAXAS DIVERSAS
CPU times: user 4.83 s, sys: 26.4 ms, total: 4.85 s
Wall time: 4.93 s
60024151
11.552986512524084


Criando um dataframe baskets contendo os agrupamentos de serviços finais por grupo de itens e seus devidos suportes.

In [18]:
# Number of accounts containing the master service; loop-invariant, so it is
# computed once instead of once per item type.
total_accounts = support([master], selected_df)
baskets = []
for i in range(len(group_items)):
    # NOTE: this overwrites the absolute supports in group_basket[i] with
    # percentages, so running this cell twice scales them twice.
    group_basket[i] = [raw_support * (100/total_accounts) for raw_support in group_basket[i]]
    basket_dict = OrderedDict([("Grupos Servicos", group_ids[i]),("Suporte", group_basket[i])])
    basket_df = pd.DataFrame(basket_dict)
    baskets.append(basket_df)

Criando um dataframe groups contendo os detalhes de cada serviço que aparece no Market Basket por grupo de item.

In [19]:
# Number of accounts containing the master service; loop-invariant.
total_accounts = support([master], selected_df)
groups = []
for i, (item_type, item_df) in enumerate(group_items):
    # single services only; tuples are the multi-service groupings
    servicos = [ids for ids in group_ids[i] if not isinstance(ids, tuple)]
    mean_vals, std_vals, mean_qtt, std_qtt = [], [], [], []
    descriptions, support_vals = [], []
    for service in servicos:
        # per-account means of this service, computed once and reused for
        # both the mean and the standard deviation
        per_account = item_df.loc[item_df['servico'] == service].groupby('id_conta')
        account_vals = np.array(per_account.valor.mean())
        account_qtt = np.array(per_account.qtde.mean())
        mean_vals.append(account_vals.mean())
        std_vals.append(account_vals.std())
        mean_qtt.append(account_qtt.mean())
        std_qtt.append(account_qtt.std())
        # The description is read from the 11th row of the service (index 10),
        # falling back to the last row when the service has fewer rows.
        labels = selected_df[selected_df["servico"] == service]["descricao_despesa"].values
        descriptions.append(labels[min(10, len(labels) - 1)])
        support_vals.append(support([service], item_df)*100/total_accounts)
    item_types = [item_type for service in servicos]
    group_dict = OrderedDict([("Servico", servicos), ("Descricao",descriptions), ("Tipo de Item",item_types), ("Quantidade média", mean_qtt), 
             ("Desvio Qtd", std_qtt), ("Valor médio", mean_vals), ("Desvio Valor", std_vals), ("Suporte",support_vals)])
    group_df = pd.DataFrame(group_dict)
    groups.append(group_df)

Salvando resultado em um pickle.

In [20]:
# Persist (item type names, baskets, groups) in a pickle named after the
# master service. The context manager closes the file even if dump() raises.
name = str(master)+'_items.pickle'
with open(name, 'wb') as output:
    pickle.dump(([group[0] for group in group_items],baskets,groups), output)

Exemplo basket para os tipos de item sendo PROCEDIMENTOS:

In [21]:
# Index 5 is the sixth item type in the order of group_items (PROCEDIMENTO in
# the run recorded in this notebook); it shifts if the set of item types changes.
baskets[5]

Unnamed: 0,Grupos Servicos,Suporte
0,30735017,14.851638
1,30735068,99.992293
2,30735092,29.071291
3,30717140,11.830443
4,30735033,70.011561
5,30735084,46.188825
6,30735041,10.489403
7,"(30735017, 30735068)",14.843931
8,"(30735017, 30735033)",11.229287
9,"(30735068, 30735092)",29.071291


Exemplo groups para os tipos de item sendo PROCEDIMENTOS:

In [22]:
# Per-service details for the item type at index 5, highest support first.
groups[5].sort_values('Suporte', ascending=False)

Unnamed: 0,Servico,Descricao,Tipo de Item,Quantidade média,Desvio Qtd,Valor médio,Desvio Valor,Suporte
1,30735068,RUPTURA DO MANGUITO ROTADOR,PROCEDIMENTO,1.003719,0.078468,598.377991,678.407471,99.992293
4,30735033,OMBRO (VIDEO) - ACROMIOPLASTIA,PROCEDIMENTO,1.003266,0.08203,383.398407,337.677246,70.011561
5,30735084,RESSECCAO LATERAL DA CLAVICULA - PROCEDIMENTO ...,PROCEDIMENTO,1.00317,0.073573,456.098816,375.615631,46.188825
2,30735092,TENOTOMIA PORCAO BICEPES,PROCEDIMENTO,1.003181,0.060839,437.606598,348.944977,29.071291
0,30735017,HONORARIO INSTRUMENTADORA,PROCEDIMENTO,1.008044,0.118644,442.19812,364.971497,14.851638
3,30717140,OMBRO - RESSECCAO PARCIAL OU TOTAL DE CLAVICULA,PROCEDIMENTO,1.001954,0.044165,151.113678,120.551918,11.830443
6,30735041,LESAO LABRAL,PROCEDIMENTO,1.002204,0.046898,714.197937,581.235779,10.489403


### Raio-X final geral:

Passando o resultado para um DataFrame e na ordem do maior para o menor:

In [23]:
lista = selected_df['servico'].unique()
%time final_id,final_serv = eclat_iter(lista,selected_df,min_sup)
final_result = pd.DataFrame(index=final_id,data=final_serv,columns=['Suporte']).sort_values('Suporte',ascending=False)
final_result.Suporte = final_result.Suporte * (100.0/support([master],selected_df))
final_result

CPU times: user 2min 50s, sys: 318 ms, total: 2min 51s
Wall time: 2min 53s


Unnamed: 0,Suporte
30735068,100.000000
"(30735068, 30735033)",70.019268
30735033,70.019268
30735084,46.196532
"(30735068, 30735084)",46.188825
"(30735033, 30735084)",33.132948
"(30735068, 30735033, 30735084)",33.132948
30735092,29.071291
"(30735068, 30735092)",29.071291
"(30735068, 30735033, 30735092)",20.963391


Vamos definir como raio X o grupo que possuir a maior quantidade de serviços que, em conjunto, possuem maior suporte:

In [24]:
def get_len(x):
    """Return the number of services represented by an index entry.

    Parameters:
    -----------
    Input:
        x: either a tuple of services (a grouping) or a single service
    Return:
        len(x) for a tuple (including tuple subclasses), 1 for anything else
    """
    if isinstance(x, tuple):
        return len(x)
    return 1

max_len = max([get_len(x) for x in list(final_result.index)]) # size of the largest groups
max_len_groups = [x for x in list(final_result.index) if get_len(x) == max_len] # groups with that size
# among the largest groups, take the one with the highest support
best_group = final_result.loc[max_len_groups].sort_values('Suporte', ascending=False).head(1).index[0]
# A single service is stored in the index as a bare value, not a tuple;
# list() on it would raise, so wrap it instead.
xray = list(best_group) if isinstance(best_group, tuple) else [best_group]

In [25]:
# Display the services selected as the x-ray group.
xray

[30735068, 90008332, 90003551, 90196031]

Agora, com o raio X definido, construímos o dataframe final, com as colunas serviço, descrição, tipo de item, valor médio e quantidade média.

In [26]:
def get_mean_value(service):
    """Return the mean of the usable "valor" entries of a service.

    Reads the rows of the global `selected_df` whose "servico" equals
    `service`. String entries have their first and last characters stripped
    before conversion (as the original code did); any other entry is
    converted with float() directly, so numeric columns work too. Entries
    that cannot be converted, and NaN entries, are ignored.

    Parameters:
    -----------
    Input:
        service: service code to look up
    Return:
        (float) mean of the usable values, or NaN when there is none
    """
    values = []
    for item in selected_df[selected_df["servico"] == service]["valor"].values:
        try:
            number = float(item[1:-1]) if isinstance(item, str) else float(item)
        except (TypeError, ValueError): # some values are wrong, ignore them
            continue
        if not np.isnan(number):
            values.append(number)
    if not values:
        return float("nan")
    return sum(values) / len(values)

In [27]:
def get_mean_quantity(service):
    """Return the mean of the usable "qtde" entries of a service.

    Reads the rows of the global `selected_df` whose "servico" equals
    `service`. String entries have their first and last characters stripped
    before conversion (as the original code did); any other entry is
    converted with float() directly, so numeric columns work too. Entries
    that cannot be converted, and NaN entries, are ignored.

    Parameters:
    -----------
    Input:
        service: service code to look up
    Return:
        (float) mean of the usable quantities, or NaN when there is none
    """
    values = []
    for item in selected_df[selected_df["servico"] == service]["qtde"].values:
        try:
            number = float(item[1:-1]) if isinstance(item, str) else float(item)
        except (TypeError, ValueError): # some values are wrong, ignore them
            continue
        if not np.isnan(number):
            values.append(number)
    if not values:
        return float("nan")
    return sum(values) / len(values)

In [28]:
def get_description(service, position=10):
    """Return a "descricao_despesa" entry of a service.

    Parameters:
    -----------
    Input:
        service: service code to look up in the global `selected_df`
        position (integer): row, among the rows of the service, to read the
            description from. Defaults to 10, the row the original code used;
            when the service has fewer rows the last one is used instead.
    Return:
        The description stored at that row.
    Raises:
        IndexError: if `selected_df` has no row for the service.
    """
    labels = selected_df[selected_df["servico"] == service]["descricao_despesa"].values
    if len(labels) == 0:
        raise IndexError("no rows found for service {!r}".format(service))
    return labels[min(position, len(labels) - 1)]

In [29]:
def get_item_type(service, position=10):
    """Return a "tipo_item" entry of a service.

    Parameters:
    -----------
    Input:
        service: service code to look up in the global `selected_df`
        position (integer): row, among the rows of the service, to read the
            item type from. Defaults to 10, the row the original code used;
            when the service has fewer rows the last one is used instead.
    Return:
        The item type stored at that row.
    Raises:
        IndexError: if `selected_df` has no row for the service.
    """
    item_types = selected_df[selected_df["servico"] == service]["tipo_item"].values
    if len(item_types) == 0:
        raise IndexError("no rows found for service {!r}".format(service))
    return item_types[min(position, len(item_types) - 1)]

In [30]:
def get_individual_support(service):
    """Return the first value of the `final_result` row labelled `service`."""
    row = final_result.loc[service]
    return row.values[0]

In [31]:
# Overall (row-level) mean value and mean quantity of each x-ray service.
mean_vals = [
    selected_df.loc[selected_df["servico"] == service, "valor"].astype(float).mean()
    for service in xray
]
mean_qtt = [
    selected_df.loc[selected_df["servico"] == service, "qtde"].astype(float).mean()
    for service in xray
]
# Descriptive fields and individual support come from the helper functions.
item_types = list(map(get_item_type, xray))
descriptions = list(map(get_description, xray))
support_vals = list(map(get_individual_support, xray))
xray_dict = {
    "Servico": xray,
    "Descricao": descriptions,
    "Tipo de Item": item_types,
    "Quantidade média": mean_qtt,
    "Valor médio": mean_vals,
    "Suporte": support_vals,
}
xray_df = pd.DataFrame(xray_dict)

In [32]:
# Display the final x-ray table.
xray_df

Unnamed: 0,Descricao,Quantidade média,Servico,Suporte,Tipo de Item,Valor médio
0,RUPTURA DO MANGUITO ROTADOR,1.002994,30735068,100.0,PROCEDIMENTO,502.800346
1,KEFAZOL,2.864761,90008332,20.947977,MEDICAMENTOS,53.626238
2,DECADRON INJETVEL,1.1417,90003551,17.063584,MEDICAMENTOS,12.588304
3,NOVALGINA; 500MG/ML AMPOLA 2ML INJETAVEL,3.349773,90196031,20.369942,MEDICAMENTOS,8.604235
