In [1]:
import os
import re
import io
from utils.graph import Grafo

In [2]:
class Email:
    """Minimal record of one e-mail message: who sent it and who received it."""

    def __init__(self, sender: str, to: list):
        # sender: the sender's address; to: the list of recipient addresses.
        self.sender, self.to = sender, to

In [3]:
def extract_email_info(email) -> "Email | None":
    """
    Extract the sender and the recipients from the headers of an email file.

    Only the header section (everything before the first blank line) is
    searched, and the ``From:`` / ``To:`` fields must start at the beginning
    of a line. This keeps ``X-From:``, ``X-To:``, ``Reply-To:`` and quoted or
    forwarded text in the body from being mistaken for the real headers.

    Args:
        email: A file-like object whose ``read()`` returns the email as text.

    Returns:
        Email: An Email object holding the sender address and the list of
            recipient addresses (each stripped of surrounding whitespace),
            or None if either header cannot be found. On failure a short
            diagnostic is printed.
    """
    email_str = email.read()

    # The header section ends at the first blank line; everything after it is
    # the body and must not contribute sender/recipient information.
    header_text = re.split(
        r"\r?\n[ \t]*\r?\n", email_str.lstrip("\r\n"), maxsplit=1
    )[0]

    # Sender pattern:
    #   ^From:          - the literal "From:" at the start of a line
    #   \s*             - optional whitespace
    #   ([^\s]+@[^\s]+) - capture group: non-space chars, "@", non-space chars
    from_pattern = r"^From:\s*([^\s]+@[^\s]+)"
    from_match = re.search(from_pattern, header_text, re.MULTILINE)

    # Recipient pattern:
    #   ^To:                          - the literal "To:" at the start of a line
    #   \s*                           - optional whitespace
    #   (?:[^\s,]+@[^\s,]+)           - the first address
    #   (?:\s*,\s*[^\s,]+@[^\s,]+)*   - zero or more further addresses, each
    #                                   preceded by a comma; \s also matches the
    #                                   newline/tab of a folded header line
    to_pattern = r"^To:\s*((?:[^\s,]+@[^\s,]+)(?:\s*,\s*[^\s,]+@[^\s,]+)*)"
    to_match = re.search(to_pattern, header_text, re.MULTILINE)

    if from_match is None or to_match is None:
        print(f"Erro extraindo email: {email}")
        # A missing sender takes precedence in the report when both are absent.
        error_type = "No 'From'" if from_match is None else "No 'To'"
        print(f"{error_type} found.")
        return None

    from_email = from_match.group(1)
    # Splitting on "," leaves the whitespace/newlines that surrounded each
    # comma in a folded header; strip them so callers get clean addresses.
    to_emails = [address.strip() for address in to_match.group(1).split(",")]

    return Email(from_email, to_emails)



In [4]:
rootdir = "../database"
emails = []
errors = 0
files_num = 0

# Walk every file under rootdir, parse it, and collect the Email objects that
# could be extracted; files that fail to parse are only counted.
for subdir, _dirs, files in os.walk(rootdir):
    for file in files:
        files_num += 1
        path = os.path.join(subdir, file)
        # Explicit encoding makes the run independent of the platform default;
        # errors="replace" keeps a single undecodable byte from aborting the
        # whole walk with UnicodeDecodeError.
        with open(path, "r", encoding="utf-8", errors="replace") as email:
            email_obj = extract_email_info(email)
        # extract_email_info signals failure with None, so test for that
        # explicitly rather than relying on object truthiness.
        if email_obj is not None:
            emails.append(email_obj)
        else:
            errors += 1
print(f"Total errors: {errors}")
print(f"Total files: {files_num}")

Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/all_documents/1' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/all_documents/4' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/all_documents/57' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/deleted_items/10' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/deleted_items/100' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/deleted_items/105' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/deleted_items/111' mode='r' encoding='UTF-8'>
No 'To' found.
Erro extraindo email: <_io.TextIOWrapper name='../database/brawner-s/deleted_items/112' mode='r'

### Cria o grafo

In [5]:
grafo = Grafo()

# Register one sender -> recipient edge for every recipient of every message.
for email in emails:
    sender = email.sender
    for raw_receiver in email.to:
        # Drop newlines and backslashes, then trim surrounding whitespace.
        cleaned_receiver = raw_receiver.replace("\n", "").replace("\\", "").strip()
        # The trailing 1 is presumably the weight this message contributes
        # to the edge - confirm against Grafo.adiciona_aresta.
        grafo.adiciona_aresta(sender, cleaned_receiver, 1)

### 1. Salvar em arquivo txt  

In [6]:
# Show the graph's adjacency list in the cell output. Judging by the captured
# output, each line is "vertex: (neighbour, number) -> ..."; the number is
# presumably the edge weight - confirm in utils.graph.
grafo.imprime_lista_adjacencias()

jons@amerexenergy.com: (sandra.f.brawner@enron.com, 17) -> (sschnitz@jcpenney.com, 3) -> (paulla@maritz.com, 3) -> (robert.k.rodriguez@db.com, 3) -> (lmorris1@pclient.ml.com, 3) -> (gee_ruiz@hotmail.com, 3) -> (ddunnavant@reliantenergy.com, 3) -> (aertel@evomarkets.com, 3) -> (jrschnitzer@hotmail.com, 3) -> (lcjo@dynegy.com, 4) -> (f..brawner@enron.com, 3) -> (jordan.lance@enron.com, 1) -> (hollier.diana@enron.com, 2) -> (dunnavandt.david@enron.com, 1) -> (harry.schnitzer@enron.com, 1) -> (shaw.john@enron.com, 1) -> (davis.jeremy@enron.com, 1) -> (schnitzer.janet@enron.com, 1)
sandra.f.brawner@enron.com: 
enron.announcements@enron.com: (all.houston@enron.com, 3) -> (all.worldwide@enron.com, 52) -> (eligible.employees@enron.com, 12) -> (enron.states@enron.com, 3) -> (enron.mail@enron.com, 3) -> (all.employees@enron.com, 5) -> (houston.report@enron.com, 3) -> (enron.services@enron.com, 3) -> (the.mailout@enron.com, 8) -> (all.states@enron.com, 12) -> (enron.everyone_et&s_omaha@enron.com,

In [7]:
# Save the adjacency list (requested as a string via str_return=True) to a
# text file. Write-only mode is enough, the encoding is pinned so the result
# does not depend on the platform default, and the `with` block closes the
# file on exit, so no explicit close() is needed.
with open("../graph/grafo.txt", mode="w", encoding="utf-8") as file:
    file.write(grafo.imprime_lista_adjacencias(str_return=True))

In [8]:
# Persist the graph object to ../graph/graph.pkl through Grafo.pickle_graph
# (presumably a pickle dump - confirm in utils.graph).
grafo.pickle_graph("../graph/graph.pkl")
