# Práctica: Evaluación Minería de Data Streams

- Asignatura: Datos temporales y complejos
- Autor: Mira Abad, Alejandro

In [5]:
import pandas as pd
from collections import Counter
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns.fpgrowth import fpgrowth

## Ejercicio 1

Sea la siguiente secuencia de 50 valores:

`abceebdabaeedbdbabdcecdeabdbacedddacbbeabbcdacbeab`

Aplique el algoritmo de Lossy Counting para epsilón igual a 0,1 hasta b=5 inclusive.

In [6]:
sec = ["a","b","c","e","e","b","d","a","b","a","e","e","d","b","d","b","a","b","d","c","e","c","d","e","a","b","d","b","a","c","e","d","d","d","a","c","b","b","e","a","b","b","c","d","a","c","b","e","a","b"]

epsiolon = 0.1
max_b = 5
w_intervalo = int(1/epsiolon)
n = len(sec)
b = int(n/w_intervalo)

In [7]:
# Convierte la lista a un conjunto para eliminar duplicados
conjunto = set(sec)

# Convierte el conjunto a un diccionario
# Como los diccionarios necesitan pares clave-valor, vamos a usar los elementos de la lista como claves y todos los valores serán None
freq_dict = {clave: [0, 0] for clave in conjunto}

print(freq_dict)

{'b': [0, 0], 'e': [0, 0], 'c': [0, 0], 'd': [0, 0], 'a': [0, 0]}


In [8]:
for i in range(1,max_b+1):
    sub_sec = sec[(i-1)*10:i*10]
    
    for item in sub_sec:
        freq_dict[item][0] += 1
    
    for key in conjunto:
        check = freq_dict[key][0] + freq_dict[key][1]

        if check <= i+1 and freq_dict[key][0] != 0:
            freq_dict[key][0] = 0
            freq_dict[key][1] += 1
    
    print(f"Intervalo {i} - dic: \n {freq_dict} \n")

print(f"Frequent items:")

for key in conjunto:
    if freq_dict[key][0] != 0:
        print(f"{key} - {freq_dict[key][1]}")

Intervalo 1 - dic: 
 {'b': [3, 0], 'e': [0, 1], 'c': [0, 1], 'd': [0, 1], 'a': [3, 0]} 

Intervalo 2 - dic: 
 {'b': [6, 0], 'e': [0, 2], 'c': [0, 2], 'd': [3, 1], 'a': [4, 0]} 

Intervalo 3 - dic: 
 {'b': [8, 0], 'e': [0, 3], 'c': [0, 3], 'd': [5, 1], 'a': [6, 0]} 

Intervalo 4 - dic: 
 {'b': [10, 0], 'e': [0, 4], 'c': [0, 4], 'd': [8, 1], 'a': [8, 0]} 

Intervalo 5 - dic: 
 {'b': [14, 0], 'e': [0, 5], 'c': [0, 5], 'd': [9, 1], 'a': [10, 0]} 

Frequent items:
b - 0
d - 1
a - 0


## Ejercicio 2

Dadas las siguientes 8 “compras” de productos

CDAB
BCA
DA
EBA
CDA
EBC
ABDE
BCE

Construya mediante FP trees los conjuntos frecuentes con soporte mínimo 50% (esto es, de frecuencia 4).

In [9]:
transactions = [["C","D","A","B"],["B","C","A"],["D","A"],["E","B","A"],["C","D","A"],["E","B","C"],["A","B","D","E"],["B","C","E"]]

# Cuenta la frecuencia de cada elemento
counter = Counter(element for transaction in transactions for element in transaction)

# Ordena cada transacción en función de la frecuencia de sus elementos
sorted_transactions = [sorted(transaction, key=lambda x: (-counter[x], x)) for transaction in transactions]

print(sorted_transactions)

[['A', 'B', 'C', 'D'], ['A', 'B', 'C'], ['A', 'D'], ['A', 'B', 'E'], ['A', 'C', 'D'], ['B', 'C', 'E'], ['A', 'B', 'D', 'E'], ['B', 'C', 'E']]


In [10]:
def get_freq_itemsets(transactions, min_support_abs=0):
    # our min support is 7, but it has to be expressed as a percentage for mlxtend
    min_support = min_support_abs/len(transactions) 
    # compute the frequent itemsets using fpgriowth from mlxtend
    frequent_itemsets = fpgrowth(transactions, min_support=min_support, use_colnames = True)
    # all unique support count
    su = frequent_itemsets.support.unique() 

    return frequent_itemsets, su

def convert_list_to_one_hot(list_df):
    # instantiate a transaction encoder
    transactionencoder = TransactionEncoder()
    # fit the transaction encoder using the list of transaction tuples
    transactionencoder.fit(list_df)
    # transform the list of transaction tuples into an array of encoded transactions
    encoded_transactions = transactionencoder.transform(list_df)
    # convert the array of encoded transactions into a dataframe
    encoded_transactions_df = pd.DataFrame(encoded_transactions, columns=transactionencoder.columns_)

    return encoded_transactions_df

In [11]:
encoded_transactions = convert_list_to_one_hot(transactions) # Convert to one hot encoding
freq_itemsets, su = get_freq_itemsets(encoded_transactions, 4) # Get frequent itemsets

print(f"Frequent itmesets: \n------------------------- \n {freq_itemsets} \n")

Frequent itmesets: 
------------------------- 
    support itemsets
0    0.750      (B)
1    0.750      (A)
2    0.625      (C)
3    0.500      (D)
4    0.500      (E)
5    0.500   (A, B)
6    0.500   (C, B)
7    0.500   (D, A)
8    0.500   (E, B) 

