In [None]:
import csv
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Set

def load_json(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        The value produced by ``json.load`` for the file's contents.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [52]:
# Show the complete contents of every column when a DataFrame is displayed.
# pandas is imported in this cell because it appears before the notebook's
# first `import pandas as pd`; without it a fresh kernel raises NameError here.
import pandas as pd

pd.set_option('display.max_colwidth', None)  # no per-cell character limit
pd.set_option('display.width', None)         # let pandas auto-detect the display width
pd.set_option('display.max_columns', None)   # never elide columns

In [None]:
def collect_entity_names(nodes: List[Dict]) -> Set[str]:
    """Gather the distinct ``name`` values of every node whose ``type`` is "Entity".

    Nodes of any other type (or without a ``type`` key) are ignored. An entity
    node that lacks a ``name`` key raises ``KeyError``.
    """
    names: Set[str] = set()
    for node in nodes:
        if node.get("type") == "Entity":
            names.add(node["name"])
    return names

def extract_time_slot(raw_ts: str) -> str:
    """Reduce an ISO-8601 timestamp string to its calendar date (``YYYY-MM-DD``).

    Returns:
        - ``"unknown"`` when ``raw_ts`` is empty or otherwise falsy;
        - the ISO date when ``raw_ts`` parses with ``datetime.fromisoformat``;
        - ``raw_ts`` unchanged when it cannot be parsed.
    """
    if raw_ts:
        try:
            parsed = datetime.fromisoformat(raw_ts)
        except ValueError:
            # Not an ISO timestamp: hand the raw value back untouched.
            return raw_ts
        return parsed.date().isoformat()
    return "unknown"
    
def detect_participants(content: str, entity_names: Set[str]) -> Set[str]:
    """Return the entity names that are mentioned in ``content`` as whole words.

    A plain substring test reports false positives whenever one name is a
    prefix or fragment of another word (e.g. "Sam" inside "Samantha"), so each
    name must be delimited by non-word characters or the string boundaries.

    Args:
        content: Message text to scan. Empty or ``None`` yields an empty set.
        entity_names: Candidate names to look for. Empty names are ignored.

    Returns:
        The subset of ``entity_names`` found in ``content``.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    if not content:
        return set()

    participants = set()
    for name in entity_names:
        if not name:
            continue
        # Lookarounds instead of \b so names ending in punctuation still match.
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        if re.search(pattern, content):
            participants.add(name)
    return participants

In [None]:
# Load the graph and its schema from the project's data directory.
# NOTE: a later cell rebinds `data` to the DataFrame returned by pd.read_csv,
# so cells that expect this JSON object must run before that one.
data = load_json('../data/MC3_graph.json')
schema = load_json('../data/MC3_schema.json')

In [None]:
data

In [None]:
nodes_type = schema['schema']['nodes'].keys()

In [None]:
nodes_type

In [None]:
nodes = data.get("nodes", [])
entity_names = collect_entity_names(nodes)

In [None]:
entity_names

In [None]:
############
import pandas as pd

In [None]:
data = pd.read_csv("../data/MC3_data_parsed.csv")

In [None]:
data[data["edge_id"]=="Event_Communication_2"]

In [None]:
data[data["edge_id"]=="Event_Communication_1"]

In [None]:
(data["edge_name_description"]).unique()

In [None]:
sentences = list(data["edge_name_description"].unique())


In [None]:
sentences

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 2.1 Vectorize.
# Keep only real strings: `Series.unique()` also returns NaN when a description
# is missing, and TfidfVectorizer rejects non-string documents.
sentences = [s for s in sentences if isinstance(s, str)]
# "english" is the only stop-word list name scikit-learn accepts; for another
# language pass an explicit list of words instead.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(sentences)

# 2.2 Cosine-similarity matrix between every pair of sentences
sim_matrix = cosine_similarity(tfidf)

# 2.3 Collect the candidate pairs
#  - the diagonal is skipped (sim(self) = 1)
#  - only the upper triangle is read, so each pair appears once
idxs, jdxs = np.triu_indices_from(sim_matrix, k=1)
pairs = list(zip(idxs, jdxs, sim_matrix[idxs, jdxs]))

# 2.4 Sort by score and show the 10 most similar pairs
top10 = sorted(pairs, key=lambda x: x[2], reverse=True)[:10]
for i, j, score in top10:
    print(f"{score:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")


In [None]:
!pip install scikit-learn


In [None]:
pd.set_option('display.max_colwidth', None)


In [None]:
# Count the descriptions that end with the truncation character "…".
# The check covers every sentence instead of depending on the loop index `i`
# left behind by whichever cell happened to run last.
truncated = [s for s in sentences if isinstance(s, str) and s.endswith("…")]
print(f"{len(truncated)} / {len(sentences)} sentences end with '…'")

In [None]:
import re

# 1) Very basic tokenization
def tokenize(s):
    """Return the set of lower-cased word tokens (``\\w+`` runs) found in ``s``.

    Non-string input (e.g. the NaN that ``Series.unique()`` yields for missing
    descriptions) produces an empty set instead of raising ``AttributeError``.
    """
    if not isinstance(s, str):
        return set()
    return set(re.findall(r'\w+', s.lower()))

# Unique edge descriptions and their token sets
sentences = list(data["edge_name_description"].unique())
token_sets = list(map(tokenize, sentences))

# 2) Jaccard similarity for every unordered pair of sentences
n = len(sentences)
scores = []
for i, A in enumerate(token_sets):
    for j in range(i + 1, n):
        B = token_sets[j]
        # Pairs with an empty token set are skipped (the ratio would be meaningless).
        if A and B:
            jacc = len(A & B) / len(A | B)
            scores.append((i, j, jacc))

# 3) The 10 most similar pairs
ranked = sorted(scores, key=lambda triple: triple[2], reverse=True)
top10 = ranked[:10]
for i, j, sc in top10:
    print(f"{sc:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")


In [None]:
# Same ranking as the previous cell, widened to the 20 best-scoring pairs.
ranked = sorted(scores, key=lambda triple: triple[2], reverse=True)
top20 = ranked[:20]

for i, j, sc in top20:
    print(f"{sc:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")

In [None]:
import json
from pathlib import Path
from typing import List, Dict, Any

def load_data(path: str) -> List[Dict[str,Any]]:
    """
    Load records from a JSON file, chosen by file extension (case-insensitive).

    - ``.json``: the whole file is parsed as one JSON document and returned as is.
    - ``.jsonl`` / ``.ndjson``: every non-blank line is parsed as its own JSON
      value and the values are returned as a list.

    Raises:
        ValueError: for any other extension.
    """
    source = Path(path)
    suffix = source.suffix.lower()

    if suffix == ".json":
        with source.open("r", encoding="utf-8") as fh:
            return json.load(fh)

    if suffix in (".jsonl", ".ndjson"):
        with source.open("r", encoding="utf-8") as fh:
            stripped = (raw.strip() for raw in fh)
            return [json.loads(entry) for entry in stripped if entry]

    raise ValueError(f"Formato no soportado: {source.suffix}")

if __name__ == "__main__":
    # 1) Path to the JSON or JSONL file
    archivo = '../data/MC3_graph.json'

    # 2) Load the records
    registros = load_data(archivo)

    # 3) Collect the full sentences. Only dict records are inspected: when the
    #    file's root is a JSON object, iterating it yields its string keys, and
    #    a key that happened to contain the field name would crash on `.get`.
    sentences = [
        rec.get("edge_name_description", "")
        for rec in registros
        if isinstance(rec, dict) and "edge_name_description" in rec
    ]
    if not sentences:
        # Make the empty result visible instead of silently printing nothing.
        print("No se encontró ningún registro con 'edge_name_description'")

    # 4) Eyeball the first few to confirm they no longer end with the "…" truncation mark
    for s in sentences[:5]:
        print(s)


In [None]:
sentences

In [None]:
import json
from pathlib import Path

def load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the resulting object."""
    handle = open(path, 'r', encoding='utf-8')
    with handle:
        parsed = json.load(handle)
    return parsed

if __name__ == "__main__":
    archivo = Path("../data/MC3_graph.json")
    graph = load_json(archivo)

    # 1) What kind of object came back (dict, list, ...)?
    print(type(graph))

    # 2) For a dict, list its top-level keys
    if isinstance(graph, dict):
        print("Claves en graph:", graph.keys())

    # 3) Peek at the first three records: "edges" wins over "links" for a
    #    dict, a list is sliced directly, anything else leaves sample as None.
    sample = None
    if isinstance(graph, dict):
        for key in ("edges", "links"):
            if key in graph:
                sample = graph[key][:3]
                break
    elif isinstance(graph, list):
        sample = graph[:3]
    print("Muestra de registros:", sample)


In [None]:
registros

In [None]:
# 1) Collect the `content` of every edge whose sub_type is "Communication"
communications = []
for edge in graph.get("edges", []):
    if "content" in edge and edge.get("sub_type") == "Communication":
        communications.append(edge["content"])

# 2) Print them with a 1-based, zero-padded counter
for idx, text in enumerate(communications, start=1):
    print(f"{idx:03d}: {text}")


In [None]:
# Print the content of every "Communication" record found under graph["data"]["links"].
# NOTE(review): other cells read this file through the top-level "nodes" and
# "edges" keys; confirm that a "data" -> "links" path exists, otherwise this
# cell raises KeyError.
for edge in graph["data"]["links"]:
    if edge.get("sub_type") == "Communication":
        print(edge["content"])

In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Union

def load_json(path: Path) -> Union[Dict[str, Any], List[Any]]:
    """Open ``path`` as UTF-8 text and return the JSON value it contains."""
    with path.open("r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def main():
    """Explore ../data/MC3_graph.json and print the Communication texts found among its edges.

    Prints the root type and keys, reports which of the candidate containers
    ("edges", "links", "graph") exist, then lists the ``content`` of every
    record under ``edges`` (top-level or nested under ``graph``) whose
    ``sub_type`` is "Communication".
    """
    graph = load_json(Path("../data/MC3_graph.json"))
    graph_is_dict = isinstance(graph, dict)

    # 1) Structure of the loaded object
    print("Tipo de graph:", type(graph))
    if graph_is_dict:
        print("Claves en root:", list(graph.keys()))
    print()

    # 2) Which candidate containers are present?
    if graph_is_dict:
        for candidate in ("edges", "links", "graph"):
            if candidate in graph:
                print(f"Encontré lista '{candidate}' con {len(graph[candidate])} elementos")
    print()

    # 3) Locate the edge records: top-level "edges" first, then graph["graph"]["edges"]
    records = []
    if graph_is_dict:
        if "edges" in graph:
            records = graph["edges"]
        elif isinstance(graph.get("graph"), dict):
            records = graph["graph"].get("edges", [])

    print("Número total de registros que vamos a filtrar:", len(records))

    # 4) Keep the content of the Communication records
    communications = []
    for rec in records:
        if "content" in rec and rec.get("sub_type") == "Communication":
            communications.append(rec["content"])

    print("Comunications encontradas:", len(communications))
    for idx, text in enumerate(communications, start=1):
        print(f"{idx:03d}: {text}")

if __name__ == "__main__":
    main()



In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Union

def load_json(path: Path) -> Union[Dict[str, Any], List[Any]]:
    """Load a standard JSON file and return its contents (a dict or a list)."""
    with path.open("r", encoding="utf-8") as stream:
        contents = json.load(stream)
    return contents

# 1) Load the graph file
archivo = Path("../data/MC3_graph.json")
graph = load_json(archivo)

# 2) Quick look at the structure
print("Claves raíz:", list(graph.keys()))
print("→ #nodes:", len(graph.get("nodes", [])))
print("→ #edges:", len(graph.get("edges", [])))
print()

# 3) The events are read from `nodes`
eventos = graph.get("nodes", [])

# 4) One pass over the nodes: Communication nodes contribute their `content`,
#    Monitoring nodes their `findings`; nodes missing the field are skipped.
communications = []
monitorings = []
for ev in eventos:
    kind = ev.get("sub_type")
    if kind == "Communication" and "content" in ev:
        communications.append(ev["content"])
    elif kind == "Monitoring" and "findings" in ev:
        monitorings.append(ev["findings"])

# 5) Show both lists with a 1-based, zero-padded counter
print(f"Encontré {len(communications)} comunicaciones:")
for idx, txt in enumerate(communications, start=1):
    print(f"{idx:03d}: {txt}")
print()
print(f"Encontré {len(monitorings)} informes de Monitoring:")
for idx, txt in enumerate(monitorings, start=1):
    print(f"{idx:03d}: {txt}")


In [None]:
communications

In [None]:
len(communications)

In [None]:
#!pip install sentence-transformers
import sys
!{sys.executable} -m pip install sentence-transformers --quiet


In [None]:
import torch
from sentence_transformers import SentenceTransformer, util

# Embed every communication and compute all pairwise cosine similarities.
model      = SentenceTransformer('all-MiniLM-L6-v2')
emb        = model.encode(communications, convert_to_tensor=True)
sim_matrix = util.cos_sim(emb, emb)

n = sim_matrix.size(0)
sim_np = sim_matrix.cpu().numpy()

# Upper-triangle index pairs (diagonal excluded), converted to plain Python
# ints up front: iterating the index tensors directly yields one 0-d tensor
# per element, which is slow and leaves tensors inside `scores`.
idxs, jdxs = torch.triu_indices(n, n, offset=1).tolist()
scores = [(i, j, sim_np[i, j]) for i, j in zip(idxs, jdxs)]

# Ten most similar pairs
top10  = sorted(scores, key=lambda x: x[2], reverse=True)[:10]
for i, j, sc in top10:
    print(f"{sc:.3f}: {communications[i]} ↔ {communications[j]}")


In [5]:
import pandas as pd

# Lift pandas' display limits: all columns, full cell contents,
# auto-detected total width, and every row.
for option in (
    'display.max_columns',
    'display.max_colwidth',
    'display.width',
    'display.max_rows',
):
    pd.set_option(option, None)

In [None]:
# NOTE: despite its name, `top10` holds the 20 best-scoring pairs here; the
# name is kept because the following cell reads it.
ranked = sorted(scores, key=lambda triple: triple[2], reverse=True)
top10 = ranked[:20]
for i, j, sc in top10:
    print(f"{sc:.3f}: {communications[i]} ↔ {communications[j]}")

In [None]:
# Tabulate the selected pairs: similarity score plus both message texts.
puntuaciones, primeros, segundos = [], [], []
for i, j, sc in top10:
    puntuaciones.append(sc)
    primeros.append(communications[i])
    segundos.append(communications[j])

df_similitudes = pd.DataFrame({
    'score': puntuaciones,
    'mensaje_1': primeros,
    'mensaje_2': segundos,
})



In [None]:
df_similitudes

In [None]:
import json
from pathlib import Path

import pandas as pd

# Relative to the notebook, like the other cells that read ../data; the
# previous absolute path only existed on one user's machine.
file_path = Path("../data/MC3_graph.json")

# Load the JSON graph
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per node
nodes = data.get('nodes', [])
df = pd.DataFrame(nodes)

# Split entities from events
entities = df[df['type'] == 'Entity']
events = df[df['type'] == 'Event']

# Number of nodes per sub_type within each group
entity_counts = entities['sub_type'].value_counts()
event_counts = events['sub_type'].value_counts()

# Totals first, then one line per sub_type
summary = {
    'Total nodos': len(df),
    'Total entidades': len(entities),
    'Total eventos': len(events),
}
summary.update({f'Entidades: {sub_type}': count for sub_type, count in entity_counts.items()})
summary.update({f'Eventos: {sub_type}': count for sub_type, count in event_counts.items()})

# Single-column summary table indexed by description
summary_df = pd.DataFrame.from_dict(summary, orient='index', columns=['Conteo'])


In [None]:
summary_df

In [None]:
import json
from pathlib import Path

import pandas as pd

# Relative to the notebook, like the other cells that read ../data; the
# previous absolute path only existed on one user's machine.
file_path = Path("../data/MC3_graph.json")

# Load the JSON graph
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Node lookup by id, plus the edge list
nodes = data['nodes']
edges = data['edges']
node_map = {n['id']: n for n in nodes}

# Index the edges by endpoint once, so each event does a dictionary lookup
# instead of rescanning the whole edge list several times.
incoming_edges = {}
outgoing_edges = {}
for e in edges:
    incoming_edges.setdefault(e.get('target'), []).append(e)
    outgoing_edges.setdefault(e.get('source'), []).append(e)

# 3. Keep only the event nodes
event_nodes = [n for n in nodes if n['type'] == 'Event']

# 4. For each event: timestamp, type, text, sources and targets
rows = []
for ev in event_nodes:
    ev_id = ev['id']
    timestamp = ev.get('timestamp')
    etype = ev.get('label', ev.get('sub_type', ''))
    # The text may live under different keys
    text = ev.get('content') or ev.get('text') or ev.get('findings', '')

    # Sources: incoming edges of type 'sent'; if there are none, every incoming edge
    incoming = incoming_edges.get(ev_id, [])
    src_edges = [e for e in incoming if e.get('type') == 'sent'] or incoming
    sources = [
        f"{node_map[e['source']]['label']} ({node_map[e['source']]['sub_type']})"
        for e in src_edges if e['source'] in node_map
    ]

    # Targets: outgoing edges of type 'received'; if there are none, every
    # outgoing edge that points at an Entity node
    outgoing = outgoing_edges.get(ev_id, [])
    tgt_edges = [e for e in outgoing if e.get('type') == 'received']
    if not tgt_edges:
        tgt_edges = [
            e for e in outgoing
            if node_map.get(e['target'], {}).get('type') == 'Entity'
        ]
    targets = [
        f"{node_map[e['target']]['label']} ({node_map[e['target']]['sub_type']})"
        for e in tgt_edges if e['target'] in node_map
    ]

    rows.append({
        'Date & Time': timestamp,
        'Type': etype,
        'Text': text,
        'Sources': ', '.join(sources),
        'Targets': ', '.join(targets)
    })

# 5. Build the DataFrame and order it chronologically
df_events = pd.DataFrame(rows)
df_events['Date & Time'] = pd.to_datetime(df_events['Date & Time'])
df_events = df_events.sort_values('Date & Time')


In [None]:
import json
from pathlib import Path

import pandas as pd

# Relative to the notebook, like the other cells that read ../data; the
# previous absolute path only existed on one user's machine.
file_path = Path("../data/MC3_graph.json")
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

nodes = data['nodes']
edges = data['edges']
node_map = {n['id']: n for n in nodes}

# 1) evidence_for lookup: { source id: target id }.
# NOTE(review): a source with several evidence_for edges keeps only the last
# one seen — confirm that one-to-one is the intended reading.
evidence_map = {
    e['source']: e['target']
    for e in edges
    if e.get('type') == 'evidence_for'
}

# Index the edges by endpoint once so each event is a dictionary lookup.
incoming_edges = {}
outgoing_edges = {}
for e in edges:
    incoming_edges.setdefault(e.get('target'), []).append(e)
    outgoing_edges.setdefault(e.get('source'), []).append(e)

# 2) Keep only the event nodes
event_nodes = [n for n in nodes if n['type'] == 'Event']

rows = []
for ev in event_nodes:
    ev_id    = ev['id']
    timestamp= ev.get('timestamp')
    etype    = ev.get('label', ev.get('sub_type', ''))
    text     = ev.get('content') or ev.get('text') or ev.get('findings', '')

    # Sources and targets are computed here for every event. Previously this
    # cell used `sources`/`targets` without defining them, so each row reused
    # whatever values another cell had left in the kernel.
    # Sources: incoming 'sent' edges; if there are none, every incoming edge.
    incoming = incoming_edges.get(ev_id, [])
    src_edges = [e for e in incoming if e.get('type') == 'sent'] or incoming
    sources = [
        f"{node_map[e['source']]['label']} ({node_map[e['source']]['sub_type']})"
        for e in src_edges if e['source'] in node_map
    ]

    # Targets: outgoing 'received' edges; if there are none, every outgoing
    # edge that points at an Entity node.
    outgoing = outgoing_edges.get(ev_id, [])
    tgt_edges = [e for e in outgoing if e.get('type') == 'received']
    if not tgt_edges:
        tgt_edges = [
            e for e in outgoing
            if node_map.get(e['target'], {}).get('type') == 'Entity'
        ]
    targets = [
        f"{node_map[e['target']]['label']} ({node_map[e['target']]['sub_type']})"
        for e in tgt_edges if e['target'] in node_map
    ]

    # 3) Extra columns.
    # The event id is recorded only when the event is a Communication.
    comm_id = ev_id if etype.lower() == 'communication' else None

    # Id of the node this event is evidence for (if any)
    mon_id = evidence_map.get(ev_id)

    # `findings` field of that node, when it exists
    findings = None
    if mon_id and mon_id in node_map:
        findings = node_map[mon_id].get('findings')

    rows.append({
        'Date & Time':       timestamp,
        'Type':              etype,
        'Text':              text,
        'Sources':           ', '.join(sources),
        'Targets':           ', '.join(targets),
        # extra columns
        'Event_Communication': comm_id,
        'Event_Monitoring':    mon_id,
        'Findings':            findings,
    })

df_events = pd.DataFrame(rows)
df_events['Date & Time'] = pd.to_datetime(df_events['Date & Time'])
df_events = df_events.sort_values('Date & Time')

# Ya puedes ver todas las columnas:
#print(df_events.head())


In [None]:
df_events[(df_events['Type'] == 'Communication') ]

In [None]:
edge_id = "338"

# Find the first top-level key of `data` whose value is a list containing a
# record with this id; `parent` stays None when nothing matches.
parent = None
for key, value in data.items():
    if isinstance(value, list) and any(d.get("id") == edge_id for d in value):
        parent = key
        break

mensaje = (
    f"El objeto con id={edge_id} está en la clave: '{parent}'"
    if parent
    else "No se encontró ningún objeto con ese id."
)
print(mensaje)

In [2]:
import json
from pathlib import Path

import pandas as pd

# Relative to the notebook, like the other cells that read ../data; the
# previous absolute path only existed on one user's machine.
file_path = Path("../data/MC3_graph.json")
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

nodes = data['nodes']
edges = data['edges']
node_map = {n['id']: n for n in nodes}

# evidence_for lookup: { source id: target id }.
# NOTE(review): a source with several evidence_for edges keeps only the last one.
evidence_map = {
    e['source']: e['target']
    for e in edges
    if e.get('type') == 'evidence_for'
}

# Index the edges by endpoint once, so each event does a dictionary lookup
# instead of scanning the full edge list twice.
incoming_edges = {}
outgoing_edges = {}
for e in edges:
    incoming_edges.setdefault(e.get('target'), []).append(e)
    outgoing_edges.setdefault(e.get('source'), []).append(e)

# Keep only the event nodes
event_nodes = [n for n in nodes if n['type'] == 'Event']

rows = []
for ev in event_nodes:
    ev_id   = ev['id']
    timestamp = ev.get('timestamp')
    etype     = ev.get('label', ev.get('sub_type', ''))
    text      = ev.get('content') or ev.get('text') or ev.get('findings', '')

    # SOURCES: incoming edges that start at an Entity node
    src_edges = [
        e for e in incoming_edges.get(ev_id, [])
        if node_map.get(e['source'], {}).get('type') == 'Entity'
    ]
    sources = [
        f"{node_map[e['source']]['label']} ({node_map[e['source']]['sub_type']})"
        for e in src_edges
    ]
    # drop duplicates while keeping first-seen order
    sources = list(dict.fromkeys(sources))

    # TARGETS: outgoing edges that end at an Entity node
    tgt_edges = [
        e for e in outgoing_edges.get(ev_id, [])
        if node_map.get(e['target'], {}).get('type') == 'Entity'
    ]
    targets = [
        f"{node_map[e['target']]['label']} ({node_map[e['target']]['sub_type']})"
        for e in tgt_edges
    ]
    targets = list(dict.fromkeys(targets))

    # Extra columns: the event id (Communication events only), the id this
    # event is evidence for, and that node's `findings` field when present.
    comm_id = ev_id if etype.lower() == 'communication' else None
    mon_id  = evidence_map.get(ev_id)
    findings = node_map.get(mon_id, {}).get('findings') if mon_id else None

    rows.append({
        'Date & Time':        timestamp,
        'Type':               etype,
        'Text':               text,
        'Sources':            ', '.join(sources),
        'Targets':            ', '.join(targets),
        'Event_Communication': comm_id,
        'Event_Monitoring':    mon_id,
        'Findings':            findings,
    })

df_events = pd.DataFrame(rows)
df_events['Date & Time'] = pd.to_datetime(df_events['Date & Time'])
# Reassign instead of sorting in place so re-running the cell stays idempotent.
df_events = df_events.sort_values('Date & Time')

# Ahora:
#print(df_events[df_events['Type'] == 'Communication'][['Sources','Targets']].head())


In [3]:
df_events[df_events['Type'] == 'Communication'][['Sources','Targets']]

Unnamed: 0,Sources,Targets
70,The Lookout (Person),The Intern (Person)
71,The Intern (Person),The Lookout (Person)
72,Kelly (Person),Sam (Person)
73,The Intern (Person),Mrs. Money (Person)
74,Mrs. Money (Person),Boss (Person)
...,...,...
649,Oceanus City Council (Organization),Green Guardians (Organization)
650,Green Guardians (Organization),EcoVigil (Vessel)
651,Defender (Vessel),Mako (Vessel)
652,Knowles (Vessel),Davis (Person)


In [48]:
df_events[df_events['Type'] == 'Communication'][['Sources','Targets']]["Sources"].unique()

array(['The Lookout (Person)', 'The Intern (Person)', 'Kelly (Person)',
       'Mrs. Money (Person)', 'Boss (Person)', 'The Middleman (Person)',
       'Serenity (Vessel)', 'Mako (Vessel)', 'Himark Harbor (Location)',
       'Davis (Person)', 'Reef Guardian (Vessel)',
       'Paackland Harbor (Location)', 'Remora (Vessel)',
       'Oceanus City Council (Organization)', 'Liam Thorne (Person)',
       'Marlin (Vessel)', 'Green Guardians (Organization)',
       'Samantha Blake (Person)', 'Glitters Team (Organization)',
       'Sentinel (Vessel)', 'Horizon (Vessel)',
       'Haacklee Harbor (Location)', 'Osprey (Vessel)',
       'EcoVigil (Vessel)', 'Miranda Jordan (Person)',
       'Clepper Jensen (Person)', 'Rodriguez (Person)',
       'Small Fry (Person)', 'Defender (Vessel)',
       'V. Miesel Shipping (Organization)', 'Northern Light (Vessel)',
       'Sailor Shifts Team (Organization)', 'Nadia Conti (Person)',
       'Sam (Person)', 'Neptune (Vessel)', 'Elise (Person)',
       'Seawa

In [12]:
a.shape

(27, 8)

In [10]:
# Rows whose Sources or Targets is exactly 'Nadia Conti (Person)', saved to a.csv.
involves_nadia = (
    (df_events["Sources"] == 'Nadia Conti (Person)')
    | (df_events["Targets"] == 'Nadia Conti (Person)')
)
a = df_events[involves_nadia]
a.to_csv("a.csv")

In [7]:
df_events[(df_events["Sources"] == 'Nadia Conti (Person)') | (df_events["Targets"] == 'Nadia Conti (Person)') ]#["Text"]

Unnamed: 0,Date & Time,Type,Text,Sources,Targets,Event_Communication,Event_Monitoring,Findings
254,2040-10-05 09:44:00,Communication,"Haacklee Harbor to Nadia Conti. Following your visit yesterday regarding the Nemo Reef event logistics, we've prepared the necessary documentation. Harbor staff is ready to facilitate the special access corridor arrangements as discussed. Please confirm timeline for implementation.",Haacklee Harbor (Location),Nadia Conti (Person),Event_Communication_330,Relationship_AccessPermission_181,
255,2040-10-05 09:45:00,Communication,"Haacklee Harbor, this is Nadia Conti. I need to cancel the special access corridor arrangements for Nemo Reef immediately. Plans have changed due to unforeseen circumstances. Destroy all related documentation. I'll contact you when we're ready to proceed with alternative locations.",Nadia Conti (Person),Haacklee Harbor (Location),Event_Communication_331,Relationship_AccessPermission_181,
257,2040-10-05 09:48:00,Communication,"Ms. Conti, this is Oceanus City Council. We need clarification regarding your canceled Nemo Reef event arrangements at Haacklee Harbor. Please explain your documentation destruction request immediately. This relates to our newly expedited permit approvals.",Oceanus City Council (Organization),Nadia Conti (Person),Event_Communication_333,,
258,2040-10-05 09:49:00,Communication,This is Nadia Conti. My cancellation was due to scheduling conflicts with our tourism development initiatives. I wasn't aware of any permit approvals. I'll submit revised documentation for alternative sustainable tourism proposals next week.,Nadia Conti (Person),Oceanus City Council (Organization),Event_Communication_334,,
369,2040-10-07 11:57:00,Communication,"Hi Nadia, this is the Sailor Shifts Team. Received your message about permit assistance - thank you! We urgently need to discuss tomorrow's staffing requirements. Can you confirm how many additional crew members we should bring for the setup?",Sailor Shifts Team (Organization),Nadia Conti (Person),Event_Communication_520,,
370,2040-10-07 12:00:00,Communication,"Davis, Nadia here. Let's meet at 7PM at the marina office to review documentation. I've been working with alternative channels for permits. Bring all shipping manifests - we'll need to create a clean paper trail immediately.",Davis (Person),Nadia Conti (Person),Event_Communication_521,,
373,2040-10-08 08:15:00,Communication,"Nadia, Elise here. Meeting at Nemo Reef 0500 tomorrow to establish payment protocols. Sam uncovered V. Miesel shipping lanes overlapping with Mako by 40%. Neptune mentioned 'underwater foundation work' - outside our original scope. Need your assessment.",Elise (Person),Nadia Conti (Person),Event_Communication_528,Relationship_Colleagues_321,
374,2040-10-08 08:18:00,Communication,"Liam, Nadia here. Need your services urgently. Investigation brewing around Nemo Reef permits. Double your usual fee if you can ensure Harbor Master remains cooperative through next week. Meet at the usual place tomorrow, 10PM.",Nadia Conti (Person),Liam Thorne (Person),Event_Communication_529,Relationship_AccessPermission_313,
377,2040-10-08 08:24:00,Communication,"Nadia, Liam here. Meeting confirmed for tomorrow at 10PM. I've redirected Harbor Master's attention and implemented new patrol schedules that work in our favor. Council suspects nothing about Nemo Reef. Bring payment as discussed.",Liam Thorne (Person),Nadia Conti (Person),Event_Communication_535,Relationship_Colleagues_494,
378,2040-10-08 08:25:00,Communication,"Neptune, this is Nadia. Need clarity on 'underwater foundation work' at Nemo Reef. This extends beyond our agreed scope. Meet me at the marina tomorrow at 6AM to discuss implications and additional resource requirements.",Nadia Conti (Person),Neptune (Vessel),Event_Communication_536,,


In [8]:
len(df_events[(df_events["Sources"] == 'Nadia Conti (Person)') | (df_events["Targets"] == 'Nadia Conti (Person)') ]["Text"])

27

In [None]:
#Relationship_Suspicious_219 Event_Monitoring
#df_events[(df_events["Event_Monitoring"] == "Relationship_Suspicious_219 Event_Monitoring") ]
df_events[df_events["Event_Monitoring"].str.startswith("Relationship_S", na=False)]

In [None]:
len(df_events[df_events["Event_Monitoring"].str.startswith("Relationship_S", na=False)])

In [None]:
df_events[(df_events["Type"] == "Communication") & df_events["Findings"].notna() ] #.head(20)

In [None]:
len(df_events[(df_events["Type"] == "Communication") & df_events["Findings"].notna() ]) #, df_events.shape # Relationship_Suspicious_71

In [None]:
df_events.isnull().sum()

In [None]:
df_events[(df_events['Event_Monitoring'] == 'Event_Communication_99') ]

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df_events["Event_Communication"].value_counts()

In [None]:
df_events[(df_events["Type"] == "Communication") ].head(3)

In [None]:
# Boolean mask of the rows whose Sources string is identical to the Targets string.
sources_equals_targets = df_events["Sources"].eq(df_events["Targets"])

hay_coincidencias = sources_equals_targets.any()
total_coincidencias = sources_equals_targets.sum()
print(f"¿Hay filas donde Sources = Targets? {hay_coincidencias}")
print(f"Número de filas donde Sources = Targets: {total_coincidencias}")

In [None]:
filas_repetidas = df_events[sources_equals_targets]

In [None]:
filas_repetidas

In [5]:
# a: communications sent by The Lookout.
# b: communications that neither come from nor go to The Lookout.
lookout = "The Lookout (Person)"
is_communication = df_events["Type"] == "Communication"
a = df_events[is_communication & (df_events["Sources"] == lookout)]
b = df_events[
    is_communication
    & (df_events["Sources"] != lookout)
    & (df_events["Targets"] != lookout)
]

In [6]:
a.shape , b.shape , df_events[(df_events["Type"] == "Communication")].shape

((33, 8), (543, 8), (584, 8))

In [None]:
a.shape , b.shape , df_events[(df_events["Type"] == "Communication")].shape

In [None]:
a.head()

In [None]:
b.head()

In [None]:
b["Targets"].unique()

# SIMILITUDES

In [1]:
#!pip install seaborn

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class TextSimilarityAnalyzer:
    """Compare one sender's messages with every other sender's via sentence embeddings.

    Texts are embedded with a SentenceTransformer model and compared with
    cosine similarity. The class also prints a text report and draws bar
    charts from the resulting per-person records.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Load the sentence-embedding model used for every comparison.

        Suggested models (the original author's guidance):
        - 'all-MiniLM-L6-v2': fast, good for general tasks
        - 'all-mpnet-base-v2': better quality but slower
        - 'paraphrase-MiniLM-L6-v2': aimed at paraphrases
        """
        print(f"Cargando modelo: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name

    def preprocess_text(self, text):
        """Return ``text`` as a stripped string, or "" when it is missing or empty."""
        if pd.isna(text) or text == "":
            return ""

        return str(text).strip()

    @staticmethod
    def _snippet(text, limit=200):
        """Cut ``text`` to ``limit`` characters, appending "..." only when something was cut."""
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    def _score_person(self, person, texts, target_texts, similarities,
                      top_k, similarity_threshold):
        """Summarise one person's similarity matrix into a result record.

        ``similarities`` has one row per target text and one column per text
        of ``person``; both lists must be non-empty.
        """
        flat_similarities = similarities.flatten()
        n_person = len(texts)

        # Best-scoring (target, person) pairs. Indices become plain ints so
        # they can address the Python lists directly.
        top_matches = []
        for flat_idx in flat_similarities.argsort(descending=True)[:top_k].tolist():
            similarity_score = float(flat_similarities[flat_idx])
            if similarity_score >= similarity_threshold:
                # Row-major flattening: row = target text, column = person text.
                target_idx, person_idx = divmod(flat_idx, n_person)
                top_matches.append({
                    'target_text': self._snippet(target_texts[target_idx]),
                    'person_text': self._snippet(texts[person_idx]),
                    'similarity': similarity_score
                })

        # The sample standard deviation of a single value is NaN; report 0.0 instead.
        if similarities.numel() > 1:
            std_similarity = float(similarities.std())
        else:
            std_similarity = 0.0

        return {
            'person': person,
            'message_count': n_person,
            'max_similarity': float(similarities.max()),
            'mean_similarity': float(similarities.mean()),
            'std_similarity': std_similarity,
            # Counted on the tensor rather than element by element in Python.
            'high_similarity_matches': int((flat_similarities >= similarity_threshold).sum()),
            'top_matches': top_matches
        }

    def analyze_person_similarity(self, target_person_df, other_persons_df,
                                 text_column='Text', source_column='Sources',
                                 top_k=10, similarity_threshold=0.7):
        """
        Score how similar each other sender's messages are to the target's messages.

        Args:
            target_person_df: DataFrame with the target person's messages
            other_persons_df: DataFrame with everyone else's messages
            text_column: Name of the column holding the text
            source_column: Name of the column holding the sender
            top_k: Number of best matches considered per person
            similarity_threshold: Minimum similarity for a match to be kept

        Returns:
            A list with one record per sender, sorted by ``max_similarity``
            (highest first), or ``None`` when the target has no usable text.
        """
        print("Preprocesando textos...")
        target_texts = [self.preprocess_text(text) for text in target_person_df[text_column]]
        target_texts = [text for text in target_texts if text]  # drop empty texts

        if not target_texts:
            print("No hay textos válidos en el dataset objetivo")
            return None

        # Group the non-empty texts by sender
        person_texts = defaultdict(list)
        for sender, raw_text in zip(other_persons_df[source_column],
                                    other_persons_df[text_column]):
            text = self.preprocess_text(raw_text)
            if text:
                person_texts[sender].append(text)

        # The target embeddings are computed once and reused for every person
        print(f"Generando embeddings para {len(target_texts)} textos objetivo...")
        target_embeddings = self.model.encode(target_texts, convert_to_tensor=True)

        results = []

        print(f"Analizando similitud con {len(person_texts)} personas...")

        for person, texts in person_texts.items():
            if not texts:
                continue

            print(f"Procesando: {person} ({len(texts)} mensajes)")

            person_embeddings = self.model.encode(texts, convert_to_tensor=True)
            similarities = util.cos_sim(target_embeddings, person_embeddings)

            results.append(self._score_person(
                person, texts, target_texts, similarities,
                top_k, similarity_threshold,
            ))

        # Most similar sender first
        results.sort(key=lambda x: x['max_similarity'], reverse=True)

        return results

    def create_similarity_report(self, results, target_person_name):
        """Print a text report for ``results`` and return ``results`` unchanged.

        The report cut-offs (0.8 high, 0.6-0.8 medium, 0.75 suspicious) and
        the ">0.7" column header are fixed here; they do not follow the
        threshold passed to ``analyze_person_similarity``.
        """

        print(f"\n{'='*80}")
        print(f"REPORTE DE SIMILITUD PARA: {target_person_name}")
        print(f"Modelo utilizado: {self.model_name}")
        print(f"{'='*80}")

        # Overall summary
        print(f"\nRESUMEN GENERAL:")
        print(f"Total de personas analizadas: {len(results)}")

        high_similarity_persons = [r for r in results if r['max_similarity'] > 0.8]
        medium_similarity_persons = [r for r in results if 0.6 <= r['max_similarity'] <= 0.8]

        print(f"Personas con alta similitud (>0.8): {len(high_similarity_persons)}")
        print(f"Personas con similitud media (0.6-0.8): {len(medium_similarity_persons)}")

        # Ten most similar senders
        print(f"\nTOP 10 PERSONAS MÁS SIMILARES:")
        print("-" * 100)
        print(f"{'Rank':<4} {'Persona':<30} {'Msgs':<6} {'Max Sim':<8} {'Avg Sim':<8} {'Matches >0.7':<12}")
        print("-" * 100)

        for i, result in enumerate(results[:10]):
            print(f"{i+1:<4} {result['person']:<30} {result['message_count']:<6} "
                  f"{result['max_similarity']:.3f}    {result['mean_similarity']:.3f}    "
                  f"{result['high_similarity_matches']:<12}")

        # Details for the most suspicious senders
        print(f"\nDETALLES DE CASOS SOSPECHOSOS (Similitud > 0.75):")
        print("=" * 100)

        suspicious_cases = [r for r in results if r['max_similarity'] > 0.75]

        for result in suspicious_cases:
            print(f"\n🚨 PERSONA: {result['person']}")
            print(f"   Similitud máxima: {result['max_similarity']:.3f}")
            print(f"   Similitud promedio: {result['mean_similarity']:.3f}")
            print(f"   Matches con alta similitud: {result['high_similarity_matches']}")

            if result['top_matches']:
                print(f"   Top matches:")
                for match in result['top_matches'][:3]:
                    print(f"     • Similitud: {match['similarity']:.3f}")
                    print(f"       {target_person_name}: {match['target_text']}")
                    print(f"       {result['person']}: {match['person_text']}")
                    print()

        return results

    def plot_similarity_distribution(self, results, target_person_name):
        """Draw two horizontal bar charts (max and mean similarity) for the first 15 results."""

        # Data for the charts: the first 15 records
        persons = [r['person'] for r in results[:15]]
        max_similarities = [r['max_similarity'] for r in results[:15]]
        mean_similarities = [r['mean_similarity'] for r in results[:15]]

        # One figure, two stacked panels
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

        # Panel 1: maximum similarity
        bars1 = ax1.barh(persons, max_similarities, color='red', alpha=0.7)
        ax1.set_xlabel('Similitud Máxima')
        ax1.set_title(f'Top 15 Personas - Similitud Máxima con {target_person_name}')
        ax1.set_xlim(0, 1)

        # Reference lines for the two thresholds
        ax1.axvline(x=0.8, color='red', linestyle='--', alpha=0.5, label='Umbral Alto (0.8)')
        ax1.axvline(x=0.7, color='orange', linestyle='--', alpha=0.5, label='Umbral Medio (0.7)')
        ax1.legend()

        # Value label next to each bar
        for bar, value in zip(bars1, max_similarities):
            ax1.text(value + 0.01, bar.get_y() + bar.get_height()/2,
                    f'{value:.3f}', va='center', fontsize=8)

        # Panel 2: mean similarity
        bars2 = ax2.barh(persons, mean_similarities, color='blue', alpha=0.7)
        ax2.set_xlabel('Similitud Promedio')
        ax2.set_title(f'Top 15 Personas - Similitud Promedio con {target_person_name}')
        ax2.set_xlim(0, 1)

        # Value label next to each bar
        for bar, value in zip(bars2, mean_similarities):
            ax2.text(value + 0.01, bar.get_y() + bar.get_height()/2,
                    f'{value:.3f}', va='center', fontsize=8)

        plt.tight_layout()
        plt.show()

# Función principal para ejecutar el análisis
def run_similarity_analysis(df_events, target_person="The Lookout (Person)"):
    """
    Run the full similarity pipeline for one sender.

    The communication events are split into the messages sent by
    ``target_person`` and the messages that neither originate from nor are
    addressed to that person. Scoring, reporting and plotting are delegated
    to a ``TextSimilarityAnalyzer``.

    Args:
        df_events: DataFrame providing the "Sources", "Targets" and "Type" columns.
        target_person: Value of "Sources" identifying the sender under study.

    Returns:
        The value returned by ``analyze_person_similarity`` when it is truthy;
        ``None`` when either subset is empty or the analyzer yields nothing.
    """
    print(f"Creando datasets para análisis de: {target_person}")

    # Row masks shared by the two subsets.
    sent_by_target = df_events["Sources"] == target_person
    is_communication = df_events["Type"] == "Communication"
    unrelated_to_target = (df_events["Sources"] != target_person) & (
        df_events["Targets"] != target_person
    )

    target_messages = df_events[sent_by_target & is_communication]
    other_messages = df_events[unrelated_to_target & is_communication]

    print(f"Dataset A ({target_person}): {len(target_messages)} mensajes")
    print(f"Dataset B (otras personas): {len(other_messages)} mensajes")
    print(f"Personas únicas en dataset B: {other_messages['Sources'].nunique()}")

    # Guard clauses: both sides of the comparison need at least one message.
    if target_messages.empty:
        print(f"No se encontraron mensajes para {target_person}")
        return None
    if other_messages.empty:
        print("No se encontraron mensajes de otras personas")
        return None

    analyzer = TextSimilarityAnalyzer('all-MiniLM-L6-v2')
    results = analyzer.analyze_person_similarity(
        target_person_df=target_messages,
        other_persons_df=other_messages,
        similarity_threshold=0.7,
        top_k=5
    )

    if not results:
        return None

    # Textual report first, then the charts.
    analyzer.create_similarity_report(results, target_person)
    analyzer.plot_similarity_distribution(results, target_person)
    return results

# Ejemplo de uso:
# results = run_similarity_analysis(df_events, "The Lookout (Person)")

ModuleNotFoundError: No module named 'seaborn'

In [39]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

def analyze_similarity_simple(df_events, target_person="The Lookout (Person)",
                              model_name='all-MiniLM-L6-v2', min_messages=2,
                              high_threshold=0.7, very_high_threshold=0.8,
                              top_n=15):
    """Rank other senders by how similar their messages are to the target's.

    Every non-blank "Text" sent by ``target_person`` is embedded and compared
    (cosine similarity) against every non-blank "Text" of each other sender
    whose communications are neither from nor to the target.

    Args:
        df_events: DataFrame providing the "Sources", "Targets", "Type" and
            "Text" columns.
        target_person: Value of "Sources" identifying the sender under study.
        model_name: Name passed to ``SentenceTransformer``.
        min_messages: Minimum number of usable messages a sender needs to be
            compared.
        high_threshold: Similarities strictly above this value are counted in
            ``'high_sim_matches'``.
        very_high_threshold: Similarities strictly above this value are counted
            in ``'very_high_sim_matches'``.
        top_n: Number of rows shown in the printed summary table.

    Returns:
        A list of per-sender dicts sorted by ``'max_similarity'`` (descending),
        or ``None`` when either subset is empty or the target has no usable
        text. The list may be empty when no sender reaches ``min_messages``.
    """
    print(f"Analizando similitud para: {target_person}")

    is_communication = df_events["Type"] == "Communication"
    a = df_events[(df_events["Sources"] == target_person) & is_communication]
    b = df_events[(df_events["Sources"] != target_person) &
                  (df_events["Targets"] != target_person) &
                  is_communication]

    print(f"Dataset A ({target_person}): {len(a)} mensajes")
    print(f"Dataset B (otras personas): {len(b)} mensajes")
    print(f"Personas únicas en dataset B: {b['Sources'].nunique()}")

    if len(a) == 0 or len(b) == 0:
        print("No hay suficientes datos para el análisis")
        return None

    # Validate the target texts before paying for the model load.
    target_texts = [str(text) for text in a["Text"].dropna() if str(text).strip()]
    if not target_texts:
        print("No hay textos válidos en dataset objetivo")
        return None

    print("Cargando modelo SentenceTransformer...")
    model = SentenceTransformer(model_name)

    print(f"Procesando {len(target_texts)} textos objetivo...")
    target_embeddings = model.encode(target_texts)

    # Group usable texts by sender; iterating the two columns directly avoids
    # building a Series per row as iterrows() does.
    person_texts = defaultdict(list)
    for sender, text in zip(b["Sources"], b["Text"]):
        if pd.notna(text) and str(text).strip():
            person_texts[sender].append(str(text))

    person_texts = {person: texts for person, texts in person_texts.items()
                    if len(texts) >= min_messages}

    print(f"Analizando {len(person_texts)} personas con suficientes mensajes...")

    results = []
    for person, texts in person_texts.items():
        print(f"Procesando: {person} ({len(texts)} mensajes)")

        person_embeddings = model.encode(texts)

        # Matrix of shape (len(target_texts), len(texts)) with pairwise scores.
        similarities = util.cos_sim(target_embeddings, person_embeddings)

        results.append({
            'person': person,
            'message_count': len(texts),
            'target_message_count': len(target_texts),
            'total_comparisons': len(target_texts) * len(texts),
            'max_similarity': float(similarities.max()),
            'mean_similarity': float(similarities.mean()),
            'std_similarity': float(similarities.std()),
            'high_sim_matches': int((similarities > high_threshold).sum()),
            'very_high_sim_matches': int((similarities > very_high_threshold).sum())
        })

    results.sort(key=lambda x: x['max_similarity'], reverse=True)

    # Printed summary table.
    high_label = f">{high_threshold}"
    very_high_label = f">{very_high_threshold}"

    print(f"\n{'='*80}")
    print(f"RESULTADOS DE SIMILITUD PARA: {target_person}")
    print(f"{'='*80}")

    print(f"\nTOP {top_n} PERSONAS MÁS SIMILARES:")
    print("-" * 90)
    print(f"{'#':<3} {'Persona':<25} {'Msgs':<5} {'Max':<6} {'Avg':<6} "
          f"{high_label:<5} {very_high_label:<5}")
    print("-" * 90)

    for rank, r in enumerate(results[:top_n], start=1):
        # str() keeps the report from crashing on non-string sender labels
        # (e.g. NaN or numeric ids, which pass the `!=` filter above).
        print(f"{rank:<3} {str(r['person'])[:24]:<25} {r['message_count']:<5} "
              f"{r['max_similarity']:.3f}  {r['mean_similarity']:.3f}  "
              f"{r['high_sim_matches']:<5} {r['very_high_sim_matches']:<5}")

    return results

# Run the simplified analysis for The Lookout; `results` keeps the function's return value.
results = analyze_similarity_simple(df_events, "The Lookout (Person)")

Analizando similitud para: The Lookout (Person)
Dataset A (The Lookout (Person)): 33 mensajes
Dataset B (otras personas): 543 mensajes
Personas únicas en dataset B: 38
Cargando modelo SentenceTransformer...
Procesando 33 textos objetivo...
Analizando 37 personas con suficientes mensajes...
Procesando: Kelly (Person) (2 mensajes)
Procesando: The Intern (Person) (16 mensajes)
Procesando: Mrs. Money (Person) (19 mensajes)
Procesando: Boss (Person) (9 mensajes)
Procesando: The Middleman (Person) (13 mensajes)
Procesando: Serenity (Vessel) (8 mensajes)
Procesando: Mako (Vessel) (35 mensajes)
Procesando: Himark Harbor (Location) (27 mensajes)
Procesando: Davis (Person) (25 mensajes)
Procesando: Reef Guardian (Vessel) (34 mensajes)
Procesando: Paackland Harbor (Location) (16 mensajes)
Procesando: Remora (Vessel) (25 mensajes)
Procesando: Oceanus City Council (Organization) (37 mensajes)
Procesando: Liam Thorne (Person) (11 mensajes)
Procesando: Marlin (Vessel) (7 mensajes)
Procesando: Green G

In [37]:
# Distinct recipients of the communications sent by The Lookout.
df_events.loc[
    (df_events["Type"] == "Communication")
    & (df_events["Sources"] == "The Lookout (Person)"),
    "Targets",
].unique()

array(['The Intern (Person)', 'Sam (Person)', 'Sentinel (Vessel)',
       'Horizon (Vessel)', 'Green Guardians (Organization)',
       'Reef Guardian (Vessel)'], dtype=object)

In [43]:
# (rows, columns) of the communications sent by The Lookout.
df_events.loc[
    (df_events["Type"] == "Communication")
    & (df_events["Sources"] == "The Lookout (Person)")
].shape

(33, 8)

In [None]:
# Communications addressed to Sentinel (Vessel) by any sender other than The Lookout.
# The sender exclusion was written twice in the original filter; once is enough.
df_events[
    (df_events["Type"] == "Communication")
    & (df_events["Sources"] != "The Lookout (Person)")
    & (df_events["Targets"] == "Sentinel (Vessel)")
]

In [None]:
# Display the records currently stored in `results`.
results


594

In [40]:
# Print every result record on its own line.
for record in results:
    print(record)

{'person': 'Sentinel (Vessel)', 'message_count': 18, 'target_message_count': 33, 'total_comparisons': 594, 'max_similarity': 0.830532968044281, 'mean_similarity': 0.4825107157230377, 'std_similarity': 0.12133296579122543, 'high_sim_matches': 17, 'very_high_sim_matches': 2}
{'person': 'Green Guardians (Organization)', 'message_count': 44, 'target_message_count': 33, 'total_comparisons': 1452, 'max_similarity': 0.812926173210144, 'mean_similarity': 0.48835813999176025, 'std_similarity': 0.12975458800792694, 'high_sim_matches': 46, 'very_high_sim_matches': 1}
{'person': 'Miranda Jordan (Person)', 'message_count': 18, 'target_message_count': 33, 'total_comparisons': 594, 'max_similarity': 0.783218502998352, 'mean_similarity': 0.39647233486175537, 'std_similarity': 0.13147194683551788, 'high_sim_matches': 8, 'very_high_sim_matches': 0}
{'person': 'Clepper Jensen (Person)', 'message_count': 20, 'target_message_count': 33, 'total_comparisons': 660, 'max_similarity': 0.7821876406669617, 'mean_

In [44]:
# Tabulate the result records held in `results` as a DataFrame.
df_results = pd.DataFrame(results)


In [45]:
# Show the tabulated similarity results.
df_results

Unnamed: 0,person,message_count,target_message_count,total_comparisons,max_similarity,mean_similarity,std_similarity,high_sim_matches,very_high_sim_matches
0,Sentinel (Vessel),18,33,594,0.830533,0.482511,0.121333,17,2
1,Green Guardians (Organization),44,33,1452,0.812926,0.488358,0.129755,46,1
2,Miranda Jordan (Person),18,33,594,0.783219,0.396472,0.131472,8,0
3,Clepper Jensen (Person),20,33,660,0.782188,0.341552,0.133321,4,0
4,The Intern (Person),16,33,528,0.778218,0.474534,0.12596,20,0
5,Liam Thorne (Person),11,33,363,0.767456,0.428653,0.137331,6,0
6,Paackland Harbor (Location),16,33,528,0.760566,0.441832,0.134317,6,0
7,Reef Guardian (Vessel),34,33,1122,0.757897,0.492351,0.113966,18,0
8,Oceanus City Council (Organization),37,33,1221,0.756792,0.428181,0.126599,19,0
9,Neptune (Vessel),34,33,1122,0.753725,0.313302,0.111998,2,0


In [11]:
# Peek at the first result record together with the total number of records.
results[0], len(results)

({'person': 'Sentinel (Vessel)',
  'message_count': 18,
  'max_similarity': 0.830532968044281,
  'mean_similarity': 0.4825107157230377,
  'std_similarity': 0.12133296579122543,
  'high_sim_matches': 17,
  'very_high_sim_matches': 2},
 37)

In [None]:
# Frequency of each distinct value in the "Findings" column.
df_events["Findings"].value_counts()

In [None]:
# Distinct values present in the "Findings" column.
df_events["Findings"].unique()

In [None]:
# Distinct values present in the "Targets" column.
df_events["Targets"].unique()