In [1]:
import pandas as pd
import numpy as np
import sqlite3
import re
import json
import tldextract

import warnings
warnings.filterwarnings("ignore")

Some Variable Used for the analysis 

In [2]:
domain_name_associated_features = {
    "email-sender-1.example": {"label":"F4", "verbose": "RUA within the organizational domain"},
    "email-sender-2.example": {"label":"F6", "verbose": "RUA outside the organizational domain. EDV passes"},
    "email-sender-3.example": {"label":"F7", "verbose": "RUA outside the organizational domain. EDV fails."},
    "email-sender-4.example": {"label":"F8", "verbose": "RUA outside the organizational domain. EDV passes. Records contain new RUA"}
}

label_to_domain_name = {
    'F4': 'email-sender-1.example', 
    'F6': 'email-sender-2.example', 
    'F7': 'email-sender-3.example', 
    'F8': 'email-sender-4.example',
    'EDV_DOMAIN': 'email-sender-4.example',
    'EDV_RECEIVER' : "email-receiver-overwrite.example"
}


esp_subdomains = ['163com', 'inboxlv', 'control-domainexample', 'rediffcom', 'gmailcom', 'outlookcom', 'tutanotacom', 'fastmailcom', 'mailcom', 'heycom', 'gmxnet', 'yahoocom', 'oppl', 'mailfencecom', 'protonmailcom', 'interiapl', 'kolabnowcom', 'freemailhu', 'seznamcz', 'yandexcom', 'orangefr', 'lapostenet', 'navercom', 'zohomaileu', 'mailru', 'sapopt']



measurements_label = ['F7', 'F6', 'F4', 'F8']

# Extracting data from Rspamd Logs
Rspamd logs contains data about sent and receive email using the Mailcow suite


In [3]:
def extract_feature_from_sender_smtp(smtp_sender)->str:
    domain = ".".join(smtp_sender.split(".")[1:])
    feature = domain_name_associated_features.get(domain, {})
    return feature

def extract_target_esp_from_sender_smtp(smtp_sender)->str:
    return smtp_sender.split("@")[1].split(".")[0]

def extract_spf_alignement_info(smtp_sender)->bool:
    return True if smtp_sender[:8] == "goodrua@" else (False if smtp_sender[:7] == "badrua@" else None)

def extract_dkim_alignement_info(symbols)->bool:
    rspamd = symbols.get("DKIM_SIGNED", {}).get("options", None)
    return None if rspamd is None else not "dkimfalse" in rspamd[0]

def extract_fqdn_sendto(email_address:str)->str:
    fqdn = tldextract.extract(email_address).fqdn 
    
    # tldextract does not recognize .example TLD as defined in RFC 2606
    # https://www.rfc-editor.org/rfc/rfc2606.html#section-2
    if fqdn == "" and email_address == "testESP@control-domain.example":
        fqdn = "control-domain.example"
        
    return fqdn

with open("datasets/Rspamd/logs.json", 'r') as fp:
    df = pd.DataFrame([json.loads(line) for line in fp.readlines()])
    
df["aligned_spf"] = df['sender_smtp'].apply(lambda x:  extract_spf_alignement_info(x))
df["aligned_dkim"] = df["symbols"].apply(lambda x: extract_dkim_alignement_info(x))

df = df[df["aligned_spf"].notna()]
df["measurement_features_label"] = df['sender_smtp'].apply(lambda x:extract_feature_from_sender_smtp(x).get("label"))
df["measurement_features_verbose"] = df['sender_smtp'].apply(lambda x:extract_feature_from_sender_smtp(x).get("verbose"))
df["targeted_esp"] = df['sender_smtp'].apply(lambda x:extract_target_esp_from_sender_smtp(x))
df["email_sent_to"] = df['rcpt_smtp'].apply(lambda x:  extract_fqdn_sendto(x[0]) )


# Generad .csv file from the dataset for Table creation
# Removed the test domain for results 
df.to_csv("generated/rspamd.csv", index=False)


df.head()

Unnamed: 0,unix_time,message-id,time_real,sender_smtp,sender_mime,rcpt_smtp,rcpt_mime,action,ip,symbols,...,user,size,is_skipped,required_score,aligned_spf,aligned_dkim,measurement_features_label,measurement_features_verbose,targeted_esp,email_sent_to
12,1695652404,169565240353.19703.10350902303625113444@zohoma...,0.06768,badrua@zohomaileu.email-sender-3.example,badrua@zohomaileu.email-sender-3.example,[anonymized_user@zohomail.eu],[anonymized_user@zohomail.eu],no action,2001:660:5301:24:2be:43ff:fe2b:1978,"{'TO_MATCH_ENVRCPT_ALL': {'metric_score': 0, '...",...,anonymized_user@email-sender-3.example,2051,False,15,False,False,F7,RUA outside the organizational domain. EDV fails.,zohomaileu,zohomail.eu
13,1695652403,169565240294.19703.4884400885574083640@yandexc...,0.119373,badrua@yandexcom.email-sender-3.example,badrua@yandexcom.email-sender-3.example,[anonymized_user@yandex.com],[anonymized_user@yandex.com],no action,2001:660:5301:24:2be:43ff:fe2b:1978,"{'TO_MATCH_ENVRCPT_ALL': {'metric_score': 0, '...",...,anonymized_user@email-sender-3.example,1814,False,15,False,False,F7,RUA outside the organizational domain. EDV fails.,yandexcom,yandex.com
14,1695652402,169565240223.19703.11347189817033946450@yahooc...,0.081694,badrua@yahoocom.email-sender-3.example,badrua@yahoocom.email-sender-3.example,[anonymized_user@yahoo.com],[anonymized_user@yahoo.com],no action,2001:660:5301:24:2be:43ff:fe2b:1978,"{'TO_MATCH_ENVRCPT_ALL': {'metric_score': 0, '...",...,anonymized_user@email-sender-3.example,2342,False,15,False,False,F7,RUA outside the organizational domain. EDV fails.,yahoocom,yahoo.com
15,1695652402,169565240171.19703.12718450990756192606@seznam...,0.073125,badrua@seznamcz.email-sender-3.example,badrua@seznamcz.email-sender-3.example,[anonymized_user@seznam.cz],[anonymized_user@seznam.cz],no action,2001:660:5301:24:2be:43ff:fe2b:1978,"{'TO_MATCH_ENVRCPT_ALL': {'metric_score': 0, '...",...,anonymized_user@email-sender-3.example,1915,False,15,False,False,F7,RUA outside the organizational domain. EDV fails.,seznamcz,seznam.cz
16,1695652401,169565240111.19703.12802847786866253162@sapopt...,0.064959,badrua@sapopt.email-sender-3.example,badrua@sapopt.email-sender-3.example,[anonymized_user@sapo.pt],[anonymized_user@sapo.pt],no action,2001:660:5301:24:2be:43ff:fe2b:1978,"{'TO_MATCH_ENVRCPT_ALL': {'metric_score': 0, '...",...,anonymized_user@email-sender-3.example,1745,False,15,False,False,F7,RUA outside the organizational domain. EDV fails.,sapopt,sapo.pt


# Extracting data from the Bind log file.
The file contains the domain names and resource record querried to our authoritative nameserver.
We first extract the data from this file and create an .sqlite3 database storing the following data : "TIME", "IP_SOURCE", "PORT_SOURCE", "query", "TYPE"
Using the sqlite database we can build the 'generated/bind_results.csv' file containing the informations for the .tex table generation.



In [4]:
def create_and_populate_bind_database():

    queries = []
    
    # Define a regular expression pattern to match BIND9 query log entries
    #log_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) .*?client ([\d.]+)#\d+: query: ([\w.-]+) (\w+)'
    log_pattern = '([0-9]{1,2}-[A-z]{1,3}-[0-9]{4} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}).*client @[^ ]+ ([^# ]+)#([0-9]+) .*query: (.*) (IN|in) ([^ ]+)'
    
    # Sample BIND9 query log entry
    
    # Use the regular expression pattern to parse the log entry
    with open("datasets/DNS/query.log", "r") as fp:
        for line in fp.readlines():
            match = re.match(log_pattern, line.lower())
            try:
                timestamp = match.group(1)
                ip_source = match.group(2)
                port_source = match.group(3)
                query = match.group(4)
                type = match.group(6).upper()
                npa = np.array([timestamp, ip_source, port_source, query, type])
                queries.append(npa)
                
            except Exception as e:
                ok = False
                for value_to_ignore in ["hostname.bind", "version.bind", "id.server", "www.stage"]:
                    if value_to_ignore in line.lower():
                        ok = True
                if ok:
                    continue
                raise e
            
    df = pd.DataFrame(np.array(queries), columns=["TIME", "IP_SOURCE", "PORT_SOURCE", "query", "TYPE"])
    
    
    
    conn = sqlite3.connect('generated/bind.sqlite')
    df.to_sql('results', conn, if_exists='replace', index=False)
    conn.close()
    
    return

def is_there_spf_exist(domain):
    conn = sqlite3.connect('generated/bind.sqlite')
    cur = conn.cursor()
    cur.execute(f'select distinct query, TYPE from results where query =  "goodrua._spf.{domain}.{label_to_domain_name["F4"].lower()}";')
    res = cur.fetchone() is not None
    conn.close()
    return res

def is_there_spf_TXT(domain):
    conn = sqlite3.connect('generated/bind.sqlite')
    cur = conn.cursor()
    cur.execute(f'select * from results where query = "{domain}.{label_to_domain_name["F4"].lower()}" and TYPE = "TXT"')
    res = cur.fetchone() is not None
    conn.close()
    return res

def is_there_DKIM(domain):
    conn = sqlite3.connect('generated/bind.sqlite')
    cur = conn.cursor()
    cur.execute(f' select distinct query, TYPE from results where query =  "dkim._domainkey.{domain}.{label_to_domain_name["F4"].lower()}" and TYPE = "TXT";')
    res = cur.fetchone() is not None
    conn.close()
    return res

def is_there_DMARC(domain):
    conn = sqlite3.connect('generated/bind.sqlite')
    cur = conn.cursor()
    cur.execute(f'select * from results where query = "_dmarc.{domain}.{label_to_domain_name["F4"].lower()}" and TYPE = "TXT"')
    res = cur.fetchone() is not None
    conn.close()
    return res

def is_there_external_verif(domain):
    conn = sqlite3.connect('generated/bind.sqlite')
    cur = conn.cursor()
    cur.execute(f'select * from results where query = "{domain}.{label_to_domain_name["EDV_DOMAIN"].lower()}._report._dmarc.{label_to_domain_name["EDV_RECEIVER"].lower()}"')
    res = cur.fetchone() is not None
    conn.close()
    return res

def bind_results_for_esp(esp):
    return {
        "ESP":esp,
        "SPF_txt": is_there_spf_TXT(esp),
        "SPF_exsit": is_there_spf_exist(esp),
        "DKIM_txt": is_there_DKIM(esp),
        "DMARC_txt": is_there_DMARC(esp),
        "EDV_process":is_there_external_verif(esp)  
    }


create_and_populate_bind_database()
df = pd.DataFrame([bind_results_for_esp(esp) for esp in esp_subdomains ])
df.to_csv("generated/bind_results.csv", index=False)

df
    

Unnamed: 0,ESP,SPF_txt,SPF_exsit,DKIM_txt,DMARC_txt,EDV_process
0,163com,True,False,True,True,False
1,inboxlv,True,True,True,True,False
2,control-domainexample,True,True,True,True,False
3,rediffcom,True,True,True,True,False
4,gmailcom,True,True,True,True,False
5,outlookcom,True,True,True,True,False
6,tutanotacom,True,True,True,True,False
7,fastmailcom,True,True,True,True,True
8,mailcom,True,True,True,True,False
9,heycom,True,True,True,True,False


# Extract Information from Email Received in ESPs' mailbox

This cell extract the authentication information from all the emails we have retrieved in the ESP's registerd account inbox.

In [5]:
from email import policy
from email.parser import BytesParser
import json
import glob
from authres import AuthenticationResultsHeader


    
def extract_email_info(email_path, esp):
    with open(email_path, 'rb') as f:
        email_rb = f.read()

    msg = BytesParser(policy=policy.default).parsebytes(email_rb)
    auth_info = set()
  
    for header_type in ["Authentication-Results", "X-KLMS-AntiSpam-Auth", "X-DMARC-Verification"]:   
        for header in [x[1] for x in msg.items() if x[0] == header_type]:
            full_headers = f'{header_type}:{header}'
            auth_method = []
            if header_type == "Authentication-Results":     
                    # Microsoft is not respecting the RFC8601. The authserv-id is not provided
                    # The issue has allready been reported 
                    # https://answers.microsoft.com/en-us/outlook_com/forum/all/authentication-results-header-written-by-outlook/890b304c-3c81-48b6-b065-36fad3b551e4                   
                    if esp == "outlookcom": 
                        auth_method = ['spf', 'dmarc', 'dkim']       
                    # Yahoo is not respecting the RFC8601 as there is an additional semi-column character at the end of the headers. Which is not allowed.
                    # Thus, we remove the semi-column at the end
                    elif esp == "yahoocom":
                        parsed_auht_result = AuthenticationResultsHeader.parse(full_headers[:-1])
                        auth_method = [result.method for result in parsed_auht_result.results]
                    # 163 is not respecting the RFC8601 as the return 
                    # Thus, we remove the semi-column at the end
                    elif esp == "163com":
                        
                        full_headers = full_headers.replace("\n", "").replace("\t", "")
                        auth_method = auth_info.union(set([result.method for result in AuthenticationResultsHeader.parse(full_headers).results]))
                    else:                        
                        parsed_auth_result = AuthenticationResultsHeader.parse(full_headers)
                        auth_method = [result.method for result in parsed_auth_result.results]
  
            elif header_type == "Received-SPF":
                auth_method.append('spf')                
            # inbox.lv does not have an Authentication-Results headers but 
            # a "X-KLMS-AntiSpam-Auth" that is very similar.
            # We decided to add it manualy 
            # Same for rediff.com ! They do not have an Authentication-Results header
            # but an X-DMARC-Verification:            
            elif header_type == "X-KLMS-AntiSpam-Auth" or header_type == "X-Spam-Status" or header_type == "X-DMARC-Verification":               
                for auth_protocol in ["spf", "dkim", "dmarc"]:
                    if auth_protocol in header:
                        auth_method.append(auth_protocol)
            auth_info = auth_info.union(set(auth_method))            
            
    return (esp, list(auth_info), email_path)
    
result = [] 


for esp_folders in glob.glob("datasets/EmailReceived/*"):
    esp = esp_folders.split("/")[-1]       
    for email_path in glob.glob(f'{esp_folders}/*.eml'):
        result.append(extract_email_info(email_path, esp))

# Store all the information gathered
with open("generated/parsed_email_in_esp_inbox.json", 'w') as fp:
    json.dump(result, fp)
    
# Produce the data for the Table Generation.
# For each ESP, which informations are provided in the Authentication-Results header

esp_auth_resuts_info = {}

for r in result:
    esp = r[0]
    if esp not in esp_auth_resuts_info:
        esp_auth_resuts_info[esp] = set(r[1])
    diff_set = esp_auth_resuts_info[esp].difference(set(r[1]))
 
    
for k in esp_auth_resuts_info.keys():
    esp_auth_resuts_info[k] = list(esp_auth_resuts_info[k])
print(esp_auth_resuts_info)

with open("generated/auth_info.json", 'w') as fp :
    json.dump(esp_auth_resuts_info, fp)

{'lapostenet': ['bimi', 'dmarc', 'dkim', 'spf', 'arc'], 'tutanotacom': ['dmarc', 'dkim'], 'mailcom': ['dkim'], 'rediffcom': ['dmarc', 'dkim', 'spf'], 'heycom': ['dmarc', 'dkim', 'spf'], 'gmxnet': ['dkim'], 'mailfencecom': [], 'orangefr': ['dkim', 'arc'], 'freemailhu': [], 'kolabnowcom': ['dkim'], 'gmailcom': ['dmarc', 'dkim', 'spf'], 'yandexcom': ['dkim', 'spf'], 'navercom': ['dmarc', 'dkim', 'spf'], 'sapopt': [], 'seznamcz': [], 'inboxlv': ['dmarc', 'dkim', 'spf'], 'oppl': ['bimi', 'x-ptr', 'x-tls', 'iprev', 'dmarc', 'dkim', 'spf'], 'outlookcom': ['dmarc', 'dkim', 'spf'], 'mailru': ['dmarc', 'dkim', 'spf'], 'control-domainexample': ['dmarc', 'dkim', 'spf'], 'protonmailcom': ['arc', 'dmarc', 'dkim', 'spf'], 'yahoocom': ['dmarc', 'dkim', 'spf'], 'zohomaileu': ['dmarc', 'dkim', 'spf'], '163com': ['dkim', 'spf'], 'fastmailcom': ['x-csa', 'bimi', 'x-ptr', 'iprev', 'arc', 'dmarc', 'dkim', 'spf', 'x-me-sender'], 'interiapl': []}


# Extract information for DMARC aggregate report received

In [6]:
import re 
from email import policy
from email.parser import BytesParser

regex_rule = re.compile("domain:([\s]*[^\s]+).*submitter:([\s]*[^\s]+).*Report-ID:([\s]*[^\s]+)", re.IGNORECASE) 

def get_dmarc_report_subject_info(subject):
    # Lowercase the subject 
    email_subject = subject.lower()
    # remove the following characters: [{, }, ;],
    email_subject = email_subject.replace("{", "").replace("}", "").replace(";", "")
    # replace the tab character (\t) by a whitespace
    email_subject = email_subject.replace("\t", " ")
    ''' use the following regular expression: : https://regex101.com/r/2VDIyo/2
    domain:([\s]*[^\s]+).*submitter:([\s]*[^\s]+).*  report-id:([\s]*[^\s]+) '''
    
    res = regex_rule.findall(email_subject)
    
    return [{"domain":match[0].strip(), "submitter": match[1].strip(), "report-id":match[2].strip()}  for match in res]

# Return the measured ESP If submitter is within an organization 
# and the submitter domain name does not correspond to the measured ESP
# e.g: gmail.com report as 'google.com' submitter
def esp_name_from_submitter_info(sumbitter):
    return {"protection.outlook.com": 'outlook.com', 'google.com': 'gmail.com'}.get(sumbitter, sumbitter)


def extract_informations_from_email(email_path):
    with open(email_path, 'rb') as fp:
        email_rb = fp.read()
        msg = BytesParser(policy=policy.default).parsebytes(email_rb)
        email_subject = msg.get("subject")
        parsed_dmarc_report_info = get_dmarc_report_subject_info(email_subject)[0]
        msg_to = msg.get("to")
        msg_from = msg.get("from")
    
        return {"subject": email_subject, 
                    "parsed_domain": parsed_dmarc_report_info["domain"],
                    "parsed_submitter": parsed_dmarc_report_info["submitter"],
                    "parsed_report_id":parsed_dmarc_report_info["report-id"],
                    "msg_to":msg_to, 
                    "msg_from":msg_from,
                    "feature_domain":features.split("@")[-1],
                    "local_part":msg_to.split("@")[0]}
    
res = []
directories = []

for dir in glob.glob("datasets/ReportReceived/*"):
    directories.append({"path":dir, "domain": dir.split("@")[-1]})

for features in [x["path"] for x in directories]:
    for file in glob.glob(f'{features}/*') :         
        res.append(extract_informations_from_email(file))

df = pd.DataFrame(res)
df["esp_label"] = df["parsed_submitter"].apply(lambda x: esp_name_from_submitter_info(x) )
df.to_csv('generated/report_received.csv', index=False)

df


Unnamed: 0,subject,parsed_domain,parsed_submitter,parsed_report_id,msg_to,msg_from,feature_domain,local_part,esp_label
0,Report Domain: seznamcz.email-sender-3.example...,seznamcz.email-sender-3.example,seznam.cz,szn_seznamcz.email-sender-3.example-2023-09-25,anonymized_user@email-recevier-invalid-edv.exa...,"""Seznam.cz"" <abuse@seznam.cz>",email-recevier-invalid-edv.example,anonymized_user,seznam.cz
1,Report domain: control-domainexample.email-sen...,control-domainexample.email-sender-3.example,google.com,12320246433988280711,anonymized_user@email-recevier-invalid-edv.exa...,noreply-dmarc-support@google.com,email-recevier-invalid-edv.example,anonymized_user,gmail.com
2,Report domain: gmailcom.email-sender-3.example...,gmailcom.email-sender-3.example,google.com,13365075012726356696,anonymized_user@email-recevier-invalid-edv.exa...,noreply-dmarc-support@google.com,email-recevier-invalid-edv.example,anonymized_user,gmail.com
3,Report Domain: outlookcom.email-sender-3.examp...,outlookcom.email-sender-3.example,protection.outlook.com,6cf08044894b4373817d086a63a527b8,anonymized_user@email-recevier-invalid-edv.exa...,DMARC Aggregate Report <dmarcreport@microsoft....,email-recevier-invalid-edv.example,anonymized_user,outlook.com
4,Report Domain: yahoocom.email-sender-2.example...,yahoocom.email-sender-2.example,yahoo.com,<1695691562.122520>,rua@control-domain.example,noreply@dmarc.yahoo.com,email-recevier-invalid-edv.example,rua,yahoo.com
5,Report Domain: outlookcom.email-sender-1.examp...,outlookcom.email-sender-1.example,protection.outlook.com,99c8f8ef55e648718924195f0f2f3b41,rua@outlookcom.email-sender-1.example,DMARC Aggregate Report <dmarcreport@microsoft....,email-sender-1.example,rua,outlook.com
6,Report domain: gmailcom.email-sender-1.example...,gmailcom.email-sender-1.example,google.com,10338441280247014226,rua@gmailcom.email-sender-1.example,noreply-dmarc-support@google.com,email-sender-1.example,rua,gmail.com
7,Report Domain: seznamcz.email-sender-1.example...,seznamcz.email-sender-1.example,seznam.cz,szn_seznamcz.email-sender-1.example-2023-09-25,rua@seznamcz.email-sender-1.example,"""Seznam.cz"" <abuse@seznam.cz>",email-sender-1.example,rua,seznam.cz
8,Report Domain: fastmailcom.email-sender-1.exam...,fastmailcom.email-sender-1.example,fastmail.com,2023.09.25.983356728,rua@fastmailcom.email-sender-1.example,reports@fastmaildmarc.com,email-sender-1.example,rua,fastmail.com
9,Report Domain: mailru.email-sender-1.example; ...,mailru.email-sender-1.example,mail.ru,17381045686284013781695600000,rua@mailru.email-sender-1.example,dmarc_support@corp.mail.ru,email-sender-1.example,rua,mail.ru


# Generate the result analysis from DMARC aggregate repot data extracted previously

In [7]:
are_sending_rua = df[df.feature_domain == "email-sender-1.example"]["esp_label"][df["local_part"] == "rua"].drop_duplicates()
are_sending_ruf = df[df.feature_domain == "email-sender-1.example"]["esp_label"][df["local_part"] == "ruf"].drop_duplicates()
have_received_rua_when_edv_fails = df[df.feature_domain == "email-recevier-invalid-edv.example"]["esp_label"].drop_duplicates()
have_received_rua_external_edv_suceed = df[df.feature_domain == "control-domain.example"]["esp_label"].drop_duplicates()
have_received_rua_external_edv_overwrite= df[df.feature_domain == "email-receiver-overwrite.example"][df.msg_to == "redirectionRUA@email-receiver-overwrite.example"]["esp_label"].drop_duplicates()

features_evaluation = {"esp_sending_rua": list(are_sending_rua),
                    "esp_sending_ruf": list(are_sending_ruf),
       "esp_sending_rua_edv_succeed": list(have_received_rua_external_edv_suceed),
       "esp_sending_rua_edv_failed": list(have_received_rua_when_edv_fails),
       "edv_rewritting_rua": list(have_received_rua_external_edv_overwrite)
       }

print(features_evaluation)

with open("generated/reporting_feature", 'w') as fp:
    json.dump(features_evaluation, fp)




{'esp_sending_rua': ['outlook.com', 'gmail.com', 'seznam.cz', 'fastmail.com', 'mail.ru', 'yahoo.com'], 'esp_sending_ruf': [], 'esp_sending_rua_edv_succeed': ['gmail.com', 'yahoo.com', 'fastmail.com', 'seznam.cz', 'mail.ru', 'outlook.com'], 'esp_sending_rua_edv_failed': ['seznam.cz', 'gmail.com', 'outlook.com', 'yahoo.com'], 'edv_rewritting_rua': ['fastmail.com']}
