In [None]:
!pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, pyvis
Successfully installed jedi-0.19.2 pyvis-0.3.2


In [None]:
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m1.4/1.9 MB[0m [31m40.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
random.seed(42)
Faker.seed(42)

# create shared pool of users with real names
NUM_USERS = 150
users = []
for i in range(NUM_USERS):
    name = fake.name()
    user_id = f"user_{i+1}"
    users.append({"id": user_id, "name": name})

users_df = pd.DataFrame(users)

# helper: get fuzzy variants of names
def name_variant(name):
    if random.random() < 0.3:
        parts = name.split()
        return f"{parts[0][0]}. {parts[-1]}"  # e.g., J. Smith
    elif random.random() < 0.5:
        return name.replace("e", "a")  # small typo
    return name

# --- SOCIAL MEDIA DATASET ---
def generate_social_media_data(n=50):
    data = []
    for _ in range(n):
        sender = random.choice(users)
        receiver = random.choice([u for u in users if u != sender])
        timestamp = fake.date_time_between(start_date='-1y', end_date='now')
        message = fake.sentence(nb_words=6)
        data.append([
            sender["id"], name_variant(sender["name"]),
            receiver["id"], name_variant(receiver["name"]),
            timestamp, message
        ])
    return pd.DataFrame(data, columns=["sender_id", "sender_name", "receiver_id", "receiver_name", "timestamp", "message"])

# --- TELECOM LOGS DATASET ---
def generate_telecom_logs_data(n=50):
    data = []
    for _ in range(n):
        caller = random.choice(users)
        receiver = random.choice([u for u in users if u != caller])
        duration = random.randint(10, 600)
        timestamp = fake.date_time_between(start_date='-1y', end_date='now')
        call_type = random.choice(["voice", "sms", "video"])
        data.append([
            caller["id"], name_variant(caller["name"]),
            receiver["id"], name_variant(receiver["name"]),
            duration, timestamp, call_type
        ])
    return pd.DataFrame(data, columns=["caller_id", "caller_name", "receiver_id", "receiver_name", "duration", "timestamp", "call_type"])

# --- INCIDENT REPORTS DATASET ---
def generate_incident_reports_data(n=50):
    data = []
    for _ in range(n):
        reporter = random.choice(users)
        suspect = random.choice([u for u in users if u != reporter])
        incident_type = random.choice(["hoax call", "spam", "threat", "suspicious"])
        location = fake.city()
        timestamp = fake.date_time_between(start_date='-1y', end_date='now')
        data.append([
            reporter["id"], name_variant(reporter["name"]),
            suspect["id"], name_variant(suspect["name"]),
            incident_type, location, timestamp
        ])
    return pd.DataFrame(data, columns=["reporter_id", "reporter_name", "suspect_id", "suspect_name", "incident_type", "location", "timestamp"])


# Generate datasets
social_df = generate_social_media_data()
telecom_df = generate_telecom_logs_data()
incident_df = generate_incident_reports_data()


In [2]:


# Save to CSVs
social_df.to_csv("social_media.csv", index=False)
telecom_df.to_csv("telecom_logs.csv", index=False)
incident_df.to_csv("incident_reports.csv", index=False)

# Save full user mapping (optional)
# users_df.to_csv("user_master.csv", index=False)

print("✅ Enhanced datasets generated successfully.")


✅ Enhanced datasets generated successfully.


In [None]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
from IPython.display import display, HTML

# Load datasets
incident_df = pd.read_csv('incident_reports.csv')
social_df = pd.read_csv('social_media.csv')
telecom_df = pd.read_csv('telecom_logs.csv')

# Clean & standardize columns
incident_df = incident_df.rename(columns={
    'reporter_id': 'source', 'reporter_name': 'source_name',
    'suspect_id': 'target', 'suspect_name': 'target_name'
})

# suspect_id -> incident_type

suspect_type = {}
for _, row in incident_df.iterrows():
    suspect_type[row['target']] = row['incident_type']
print(suspect_type)

incident_df['layer'] = 'incident'

social_df = social_df.rename(columns={
    'sender_id': 'source', 'sender_name': 'source_name',
    'receiver_id': 'target', 'receiver_name': 'target_name'
})
social_df['layer'] = 'social_media'

telecom_df = telecom_df.rename(columns={
    'caller_id': 'source', 'caller_name': 'source_name',
    'receiver_id': 'target', 'receiver_name': 'target_name'
})
telecom_df['layer'] = 'telecom'

# Build incident graph
incident_graph = nx.DiGraph()
for _, row in incident_df.iterrows():
    incident_graph.add_node(row['source'], label=row['source_name'], layer='incident')
    incident_graph.add_node(row['target'], label=row['target_name'], layer='incident')
    incident_graph.add_edge(
        row['source'], row['target'],
        layer=row['layer'],
        incident_type=row['incident_type'],
        location=row['location'],
        timestamp=row['timestamp']
    )

# Build social media graph
social_graph = nx.DiGraph()
for _, row in social_df.iterrows():
    social_graph.add_node(row['source'], label=row['source_name'], layer='social_media')
    social_graph.add_node(row['target'], label=row['target_name'], layer='social_media')
    social_graph.add_edge(
        row['source'], row['target'],
        layer=row['layer'],
        message=row['message'],
        timestamp=row['timestamp']
    )

# Build telecom graph
telecom_graph = nx.DiGraph()
for _, row in telecom_df.iterrows():
    telecom_graph.add_node(row['source'], label=row['source_name'], layer='telecom')
    telecom_graph.add_node(row['target'], label=row['target_name'], layer='telecom')
    telecom_graph.add_edge(
        row['source'], row['target'],
        layer=row['layer'],
        duration=row['duration'],
        call_type=row['call_type'],
        timestamp=row['timestamp']
    )

# Print summary
print("Incident Graph:", incident_graph.number_of_nodes(), "nodes,", incident_graph.number_of_edges(), "edges")
print("Social Media Graph:", social_graph.number_of_nodes(), "nodes,", social_graph.number_of_edges(), "edges")
print("Telecom Graph:", telecom_graph.number_of_nodes(), "nodes,", telecom_graph.number_of_edges(), "edges")

# Function to visualize a graph using pyvis
def visualize_graph(graph, name):
    net = Network(height="500px", width="100%", notebook=True, directed=True)
    for node, data in graph.nodes(data=True):
        net.add_node(node, label=data.get("label", node), title=f"ID: {node}", group=data.get("layer"))
    for source, target, data in graph.edges(data=True):
        net.add_edge(source, target, title=str(data), color="gray")
    net.repulsion(node_distance=120, spring_length=200)
    net.show(f"{name}.html")
    display(HTML(f"{name}.html"))

# Visualize graphs
visualize_graph(incident_graph, "incident_graph")
visualize_graph(social_graph, "social_graph")
visualize_graph(telecom_graph, "telecom_graph")


{'user_84': 'spam', 'user_71': 'hoax call', 'user_127': 'spam', 'user_77': 'spam', 'user_136': 'threat', 'user_80': 'threat', 'user_48': 'spam', 'user_73': 'suspicious', 'user_33': 'suspicious', 'user_122': 'suspicious', 'user_30': 'hoax call', 'user_146': 'threat', 'user_107': 'spam', 'user_79': 'hoax call', 'user_69': 'hoax call', 'user_42': 'hoax call', 'user_51': 'suspicious', 'user_70': 'hoax call', 'user_147': 'threat', 'user_130': 'suspicious', 'user_7': 'hoax call', 'user_47': 'suspicious', 'user_106': 'threat', 'user_128': 'threat', 'user_82': 'threat', 'user_140': 'suspicious', 'user_14': 'spam', 'user_114': 'hoax call', 'user_41': 'threat', 'user_120': 'hoax call', 'user_108': 'suspicious', 'user_35': 'hoax call', 'user_126': 'spam', 'user_118': 'threat', 'user_60': 'suspicious', 'user_100': 'spam', 'user_20': 'suspicious', 'user_86': 'suspicious', 'user_10': 'hoax call', 'user_24': 'suspicious', 'user_78': 'hoax call', 'user_112': 'spam', 'user_44': 'spam', 'user_129': 'spa

social_graph.html


telecom_graph.html


In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
from fuzzywuzzy import fuzz
from collections import defaultdict

# Define a function for fuzzy name matching
def fuzzy_match(name1, name2, threshold=80):
    """Compares two names using fuzzywuzzy and returns True if match is above threshold."""
    score = fuzz.ratio(name1, name2)
    return score >= threshold

# Collect all user IDs from all sources
all_users = set(incident_df['source']) | set(incident_df['target']) | \
            set(social_df['source']) | set(social_df['target']) | \
            set(telecom_df['source']) | set(telecom_df['target'])

# Map user ID to name(s) from each dataset
id_to_names = defaultdict(set)

for _, row in incident_df.iterrows():
    id_to_names[row['source']].add(row['source_name'])
    id_to_names[row['target']].add(row['target_name'])

for _, row in social_df.iterrows():
    id_to_names[row['source']].add(row['source_name'])
    id_to_names[row['target']].add(row['target_name'])

for _, row in telecom_df.iterrows():
    id_to_names[row['source']].add(row['source_name'])
    id_to_names[row['target']].add(row['target_name'])

# Exact and fuzzy-match based resolution
resolved_entities = {}
confidence_scores = {}

for user in all_users:
    # Get names from all datasets for this user
    names_in_datasets = list(id_to_names[user])

    # Resolve entities by fuzzy matching
    resolved_name = names_in_datasets[0]  # Start with the first name
    match_found = False

    # Compare each name to see if we have a match (fuzzy matching)
    for other_name in names_in_datasets[1:]:
        if fuzzy_match(resolved_name, other_name):
            match_found = True
            resolved_name = resolved_name  # Keep the first name as the resolved name

    # If match_found is True, assign the unified ID as the resolved name
    resolved_entities[user] = resolved_name
    confidence_scores[user] = round(len(names_in_datasets) / 3, 2)  # max confidence is 1

# Print sample of resolved identities with confidence and names
print("Sample resolved entities with confidence and names:")
for i, (user, conf) in enumerate(confidence_scores.items()):
    names = ", ".join(id_to_names[user])
    print(f"{user} ({names}) -> unified_id: {resolved_entities[user]}, confidence: {conf}")
    if i > 10:
        break


Sample resolved entities with confidence and names:
user_97 (Aaron Wise) -> unified_id: Aaron Wise, confidence: 0.33
user_72 (Whitney Peters, W. Peters, Whitnay Patars) -> unified_id: Whitney Peters, confidence: 1.0
user_31 (Rebecca Henderson) -> unified_id: Rebecca Henderson, confidence: 0.33
user_26 (Judy Bakar, Judy Baker) -> unified_id: Judy Bakar, confidence: 0.67
user_100 (Staphan Mckaa, Stephen Mckee) -> unified_id: Staphan Mckaa, confidence: 0.67
user_67 (T. House) -> unified_id: T. House, confidence: 0.33
user_81 (K. Donovan, Kelly Donovan, Kally Donovan) -> unified_id: K. Donovan, confidence: 1.0
user_62 (Gerald Hensley, G. Hensley, Garald Hanslay) -> unified_id: Gerald Hensley, confidence: 1.0
user_34 (Nathan Maldonado) -> unified_id: Nathan Maldonado, confidence: 0.33
user_45 (Angalica Tuckar) -> unified_id: Angalica Tuckar, confidence: 0.33
user_42 (Frad Smith, F. Smith) -> unified_id: Frad Smith, confidence: 0.67
user_16 (B. Ramirez, Brian Ramirez) -> unified_id: B. Ramir

In [None]:
# Function to add confidence and layer info to nodes and edges
def add_confidence_and_layers_to_graph(graph, confidence_scores, layer_tag):
    for node in graph.nodes:
        graph.nodes[node]['confidence'] = confidence_scores.get(node, 0.0)

    for edge in graph.edges:
        graph.edges[edge]['layer'] = layer_tag

# Apply to all graphs
add_confidence_and_layers_to_graph(incident_graph, confidence_scores, 'incident')
add_confidence_and_layers_to_graph(social_graph, confidence_scores, 'social_media')
add_confidence_and_layers_to_graph(telecom_graph, confidence_scores, 'telecom')

# ✅ Updated visualization function
def visualize_graph_with_confidence_and_layers(graph, name, resolved_entities):
    net = Network(height="600px", width="100%", notebook=True, directed=True)

    for node in graph.nodes(data=True):
        node_id = node[0]
        attrs = node[1]
        resolved_name = resolved_entities.get(node_id, node_id)
        confidence = attrs.get('confidence', 0.0)

        label = f"{resolved_name} ({confidence})"
        title = f"Resolved Name: {resolved_name}, Confidence: {confidence}"

        net.add_node(node_id, label=label, title=title)

    for source, target, attrs in graph.edges(data=True):
        layer = attrs.get('layer', 'unknown')
        edge_label = f"{layer}"
        edge_title = ", ".join([f"{k}: {v}" for k, v in attrs.items()])
        net.add_edge(source, target, label=edge_label, title=edge_title)

    net.repulsion(node_distance=120, spring_length=200)
    net.show(f"{name}.html")
    display(HTML(f"{name}.html"))

# Call the visualization functions
visualize_graph_with_confidence_and_layers(incident_graph, "incident_graph_with_confidence_and_layers", resolved_entities)
visualize_graph_with_confidence_and_layers(social_graph, "social_graph_with_confidence_and_layers", resolved_entities)
visualize_graph_with_confidence_and_layers(telecom_graph, "telecom_graph_with_confidence_and_layers", resolved_entities)


incident_graph_with_confidence_and_layers.html


social_graph_with_confidence_and_layers.html


telecom_graph_with_confidence_and_layers.html


In [None]:
import networkx as nx
from pyvis.network import Network
from IPython.display import HTML, display
import pandas as pd
from collections import defaultdict

# Priority for resolving source conflicts
SOURCE_PRIORITY = {
    'incident': 3,
    'telecom': 2,
    'social_media': 1
}

# Behavior severity priority
BEHAVIOR_PRIORITY = {
    'suspicious': 2,
    'spam': 3,
    'threat': 4,
    'hoax call': 5
}

# Resolve incident type or behavior label based on severity
def resolve_behavior_label(values):
    if not values:
        return None
    values.sort(key=lambda x: -BEHAVIOR_PRIORITY.get(x[0], 0))
    return values[0][0]

# Resolve attributes based on source priority
def resolve_attribute(values):
    if not values:
        return None
    values.sort(key=lambda x: -SOURCE_PRIORITY.get(x[1], 0))
    return values[0][0]

# Generate a user ID → name mapping
def create_user_name_mapping(incident_df, social_df, telecom_df):
    mapping = {}
    for df, id_cols, name_cols in [
        (incident_df, ['reporter_id', 'suspect_id'], ['reporter_name', 'suspect_name']),
        (social_df, ['sender_id', 'receiver_id'], ['sender_name', 'receiver_name']),
        (telecom_df, ['caller_id', 'receiver_id'], ['caller_name', 'receiver_name'])
    ]:
        for id_col, name_col in zip(id_cols, name_cols):
            mapping.update(dict(zip(df[id_col], df[name_col])))
    return mapping

# Merge graphs with conflict resolution
def merge_graphs(incident_graph, telecom_graph, social_graph, incident_df, social_df, telecom_df):
    unified = nx.DiGraph()
    graphs = [(incident_graph, 'incident'), (telecom_graph, 'telecom'), (social_graph, 'social_media')]
    user_name_map = create_user_name_mapping(incident_df, social_df, telecom_df)

    # Collect node attributes from all graphs
    node_attrs = defaultdict(lambda: defaultdict(list))
    for graph, source in graphs:
        for node, attrs in graph.nodes(data=True):
            node_attrs[node]['confidence'].append((attrs.get('confidence', 0.0), source))
            node_attrs[node]['incident_type'].append((attrs.get('incident_type'), source))
            if 'behavior' in attrs:
                node_attrs[node]['behavior'].append((attrs['behavior'], source))

    # Add resolved nodes
    for node, attr_dict in node_attrs.items():
        unified.add_node(node)
        for attr, values in attr_dict.items():
            if attr == 'incident_type':
                resolved = resolve_behavior_label(values)
            else:
                resolved = resolve_attribute(values)
            if resolved is not None:
                unified.nodes[node][attr] = resolved

    # Merge edges and annotate layers
    color_map = {'incident': 'red', 'telecom': 'green', 'social_media': 'blue'}
    for graph, source in graphs:
        for u, v in graph.edges():
            if unified.has_edge(u, v):
                unified[u][v]['layers'].append(source)
            else:
                unified.add_edge(u, v, layers=[source])
            unified[u][v]['color'] = color_map[source]
            unified[u][v]['weight'] = {'incident': 3, 'telecom': 2, 'social_media': 1}[source]

    return unified

# Visualize the unified graph with PyVis
def visualize_unified_graph(graph, user_name_map, name="unified_graph"):
    net = Network(height="600px", width="100%", directed=True, notebook=True)
    color_map = {
        'hoax call': 'red',
        'spam': 'orange',
        'threat': 'purple',
        'suspicious': 'yellow'
    }

    for node, attrs in graph.nodes(data=True):
        print(node, attrs)
        label = user_name_map.get(node, node)
        title = f"Confidence: {attrs.get('confidence', 0.0)}"
        if 'incident_type' in attrs:
            title += f", Type: {attrs['incident_type']}"
        try:
            color = color_map.get(suspect_type[node], 'gray')
        except:
            color = "gray"
        value = attrs.get('confidence', 0.0) * 10
        net.add_node(node, label=label, title=title, color=color, value=value)

    for u, v, attrs in graph.edges(data=True):
        net.add_edge(u, v, title=", ".join(attrs['layers']), color=attrs['color'], width=attrs['weight'])

    net.repulsion(node_distance=120, spring_length=200)
    net.show(f"{name}.html")
    display(HTML(f"{name}.html"))

# Load datasets
incident_df = pd.read_csv('incident_reports.csv')
social_df = pd.read_csv('social_media.csv')
telecom_df = pd.read_csv('telecom_logs.csv')

# Build and merge graphs (assuming graphs exist)
# incident_graph = build_graph(incident_df, 'incident')
# telecom_graph = build_graph(telecom_df, 'telecom')
# social_graph = build_graph(social_df, 'social_media')

unified_graph = merge_graphs(incident_graph, telecom_graph, social_graph, incident_df, social_df, telecom_df)
user_name_mapping = create_user_name_mapping(incident_df, social_df, telecom_df)
visualize_unified_graph(unified_graph, user_name_mapping)


user_127 {'confidence': 0.67}
user_84 {'confidence': 0.67}
user_72 {'confidence': 1.0}
user_71 {'confidence': 0.67}
user_105 {'confidence': 0.67}
user_24 {'confidence': 0.67}
user_77 {'confidence': 0.67}
user_142 {'confidence': 0.33}
user_136 {'confidence': 0.67}
user_70 {'confidence': 1.0}
user_80 {'confidence': 0.67}
user_138 {'confidence': 1.0}
user_48 {'confidence': 0.33}
user_135 {'confidence': 0.33}
user_73 {'confidence': 0.67}
user_137 {'confidence': 0.33}
user_33 {'confidence': 0.67}
user_75 {'confidence': 0.67}
user_122 {'confidence': 1.0}
user_123 {'confidence': 0.33}
user_30 {'confidence': 0.67}
user_39 {'confidence': 0.67}
user_146 {'confidence': 0.33}
user_143 {'confidence': 1.0}
user_107 {'confidence': 0.67}
user_110 {'confidence': 0.67}
user_79 {'confidence': 0.67}
user_55 {'confidence': 0.67}
user_69 {'confidence': 0.33}
user_20 {'confidence': 0.33}
user_42 {'confidence': 0.67}
user_74 {'confidence': 0.67}
user_68 {'confidence': 1.0}
user_51 {'confidence': 0.67}
user_14

In [None]:
import networkx as nx
from pyvis.network import Network
from IPython.display import HTML, display
import pandas as pd
from collections import defaultdict


to_vis = ['incident','telecom', ]

mapping_fun = {
    'incident': incident_graph,
    'telecom': telecom_graph,
    'social_media': social_graph
}
# Priority for resolving source conflicts
SOURCE_PRIORITY = {
    'incident': 3,
    'telecom': 2,
    'social_media': 1
}

# Behavior severity priority
BEHAVIOR_PRIORITY = {
    'suspicious': 2,
    'spam': 3,
    'threat': 4,
    'hoax call': 5
}

# Resolve incident type or behavior label based on severity
def resolve_behavior_label(values):
    if not values:
        return None
    values.sort(key=lambda x: -BEHAVIOR_PRIORITY.get(x[0], 0))
    return values[0][0]

# Resolve attributes based on source priority
def resolve_attribute(values):
    if not values:
        return None
    values.sort(key=lambda x: -SOURCE_PRIORITY.get(x[1], 0))
    return values[0][0]

# Generate a user ID → name mapping
def create_user_name_mapping(incident_df, social_df, telecom_df):
    mapping = {}
    for df, id_cols, name_cols in [
        (incident_df, ['reporter_id', 'suspect_id'], ['reporter_name', 'suspect_name']),
        (social_df, ['sender_id', 'receiver_id'], ['sender_name', 'receiver_name']),
        (telecom_df, ['caller_id', 'receiver_id'], ['caller_name', 'receiver_name'])
    ]:
        for id_col, name_col in zip(id_cols, name_cols):
            mapping.update(dict(zip(df[id_col], df[name_col])))
    return mapping
user_name_map = create_user_name_mapping(incident_df, social_df, telecom_df)
# Merge graphs with conflict resolution
def merge_graphs():
    global user_name_map, mapping_fun, to_vis
    unified = nx.DiGraph()
    graphs = []
    for ele in to_vis:
        graphs.append((mapping_fun[ele], ele))


    # Collect node attributes from all graphs
    node_attrs = defaultdict(lambda: defaultdict(list))
    for graph, source in graphs:
        for node, attrs in graph.nodes(data=True):
            node_attrs[node]['confidence'].append((attrs.get('confidence', 0.0), source))
            node_attrs[node]['incident_type'].append((attrs.get('incident_type'), source))
            if 'behavior' in attrs:
                node_attrs[node]['behavior'].append((attrs['behavior'], source))

    # Add resolved nodes
    for node, attr_dict in node_attrs.items():
        unified.add_node(node)
        for attr, values in attr_dict.items():
            if attr == 'incident_type':
                resolved = resolve_behavior_label(values)
            else:
                resolved = resolve_attribute(values)
            if resolved is not None:
                unified.nodes[node][attr] = resolved

    # Merge edges and annotate layers
    color_map = {'incident': 'red', 'telecom': 'green', 'social_media': 'blue'}
    for graph, source in graphs:
        for u, v in graph.edges():
            if unified.has_edge(u, v):
                unified[u][v]['layers'].append(source)
            else:
                unified.add_edge(u, v, layers=[source])
            unified[u][v]['color'] = color_map[source]
            unified[u][v]['weight'] = {'incident': 3, 'telecom': 2, 'social_media': 1}[source]

    return unified

# Visualize the unified graph with PyVis
def visualize_unified_graph(graph, user_name_map, name="unified_graph"):
    net = Network(height="600px", width="100%", directed=True, notebook=True)
    color_map = {
        'hoax call': 'red',
        'spam': 'orange',
        'threat': 'purple',
        'suspicious': 'yellow'
    }

    for node, attrs in graph.nodes(data=True):
        print(node, attrs)
        label = user_name_map.get(node, node)
        title = f"Confidence: {attrs.get('confidence', 0.0)}"
        if 'incident_type' in attrs:
            title += f", Type: {attrs['incident_type']}"
        try:
            color = color_map.get(suspect_type[node], 'gray')
        except:
            color = "gray"
        value = attrs.get('confidence', 0.0) * 10
        net.add_node(node, label=label, title=title, color=color, value=value)

    for u, v, attrs in graph.edges(data=True):
        net.add_edge(u, v, title=", ".join(attrs['layers']), color=attrs['color'], width=attrs['weight'])

    net.repulsion(node_distance=120, spring_length=200)
    net.show(f"{name}.html")
    display(HTML(f"{name}.html"))

# Load datasets
incident_df = pd.read_csv('incident_reports.csv')
social_df = pd.read_csv('social_media.csv')
telecom_df = pd.read_csv('telecom_logs.csv')

# Build and merge graphs (assuming graphs exist)
# incident_graph = build_graph(incident_df, 'incident')
# telecom_graph = build_graph(telecom_df, 'telecom')
# social_graph = build_graph(social_df, 'social_media')

unified_graph = merge_graphs()
user_name_mapping = create_user_name_mapping(incident_df, social_df, telecom_df)
visualize_unified_graph(unified_graph, user_name_mapping)

user_127 {'confidence': 0.67}
user_84 {'confidence': 0.67}
user_72 {'confidence': 1.0}
user_71 {'confidence': 0.67}
user_105 {'confidence': 0.67}
user_24 {'confidence': 0.67}
user_77 {'confidence': 0.67}
user_142 {'confidence': 0.33}
user_136 {'confidence': 0.67}
user_70 {'confidence': 1.0}
user_80 {'confidence': 0.67}
user_138 {'confidence': 1.0}
user_48 {'confidence': 0.33}
user_135 {'confidence': 0.33}
user_73 {'confidence': 0.67}
user_137 {'confidence': 0.33}
user_33 {'confidence': 0.67}
user_75 {'confidence': 0.67}
user_122 {'confidence': 1.0}
user_123 {'confidence': 0.33}
user_30 {'confidence': 0.67}
user_39 {'confidence': 0.67}
user_146 {'confidence': 0.33}
user_143 {'confidence': 1.0}
user_107 {'confidence': 0.67}
user_110 {'confidence': 0.67}
user_79 {'confidence': 0.67}
user_55 {'confidence': 0.67}
user_69 {'confidence': 0.33}
user_20 {'confidence': 0.33}
user_42 {'confidence': 0.67}
user_74 {'confidence': 0.67}
user_68 {'confidence': 1.0}
user_51 {'confidence': 0.67}
user_14

In [None]:
def compute_centrality_measures(graph):
    centralities = {
        'degree': nx.degree_centrality(graph),
        'betweenness': nx.betweenness_centrality(graph),
        'closeness': nx.closeness_centrality(graph),
        'eigenvector': nx.eigenvector_centrality(graph, max_iter=500)
    }
    return centralities

In [None]:
centrality_scores = compute_centrality_measures(unified_graph)

# Print top 5 nodes by each centrality
for measure, scores in centrality_scores.items():
    print(f"\nTop 5 nodes by {measure} centrality:")
    top_nodes = sorted(scores.items(), key=lambda x: -x[1])[:5]
    for node, score in top_nodes:
        name = user_name_mapping.get(node, node)
        print(f"{name} ({node}): {score:.4f}")


Top 5 nodes by degree centrality:
P. Rodriguez (user_143): 0.0465
Margaret Hawkins DDS (user_21): 0.0465
William Bakar (user_70): 0.0388
Brittnay Phillips (user_73): 0.0388
C. Tucker (user_68): 0.0388

Top 5 nodes by betweenness centrality:
Kenneth Scott (user_122): 0.0213
John Pierce (user_47): 0.0169
Brittnay Phillips (user_73): 0.0159
Anthony Rodriguez (user_30): 0.0110
F. Smith (user_42): 0.0099

Top 5 nodes by closeness centrality:
Kenneth Scott (user_122): 0.0613
Lisa Jackson (user_19): 0.0603
Sharon Cherry (user_107): 0.0562
Anthony Rodriguez (user_30): 0.0554
B. Ramirez (user_16): 0.0554

Top 5 nodes by eigenvector centrality:
Sharon Cherry (user_107): 0.2982
D. May (user_116): 0.2982
B. Ramirez (user_16): 0.2981
Angalica Tuckar (user_45): 0.2981
Mary Marshall (user_133): 0.2981


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from collections import defaultdict
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import seaborn as sns

# The fuzzy matching function from your code
def fuzzy_match(name1, name2, threshold=80):
    """Compares two names using fuzzywuzzy and returns True if match is above threshold."""
    score = fuzz.ratio(name1, name2)
    return score >= threshold

# Return the actual score from fuzzy matching
def get_fuzzy_score(name1, name2):
    """Compares two names using fuzzywuzzy and returns the score."""
    return fuzz.ratio(name1, name2)

# Evaluate the fuzzy matching function with your datasets
def evaluate_fuzzy_matching(social_df, telecom_df, incident_df):
    """
    Evaluates the performance of fuzzy matching across datasets.
    Returns evaluation metrics and sample results for analysis.
    """
    # Create a user ID to name mapping
    user_id_to_name = create_user_name_mapping(social_df=social_df, telecom_df=telecom_df, incident_df=incident_df)

    # Create datasets for evaluation
    true_matches = []  # Known matching pairs (user_id, variant_name)
    test_pairs = []    # All name pairs to test

    # Function to extract evaluation data from a dataset
    def process_dataset(df, id_col1, name_col1, id_col2=None, name_col2=None):
        dataset_pairs = []
        dataset_matches = []

        if id_col2 is None:  # Single user case
            for _, row in df.iterrows():
                user_id = row[id_col1]
                variant_name = row[name_col1]
                true_name = user_id_to_name.get(user_id)

                if true_name:
                    # This is a known match (ground truth)
                    dataset_matches.append((true_name, variant_name, user_id))
                    # Add to test pairs
                    dataset_pairs.append((true_name, variant_name, True, user_id))

                    # Also add some negative examples (non-matches)
                    # Get 3 random different users
                    other_users = users_df[users_df['id'] != user_id].sample(min(3, len(users_df)-1))
                    for _, other_user in other_users.iterrows():
                        dataset_pairs.append((other_user['name'], variant_name, False, f"{user_id}_{other_user['id']}"))

        else:  # Dual user case (sender-receiver, caller-callee, etc.)
            for _, row in df.iterrows():
                # First user
                user1_id = row[id_col1]
                variant1_name = row[name_col1]
                true1_name = user_id_to_name.get(user1_id)

                # Second user
                user2_id = row[id_col2]
                variant2_name = row[name_col2]
                true2_name = user_id_to_name.get(user2_id)

                if true1_name:
                    # User 1 matches
                    dataset_matches.append((true1_name, variant1_name, user1_id))
                    dataset_pairs.append((true1_name, variant1_name, True, user1_id))

                if true2_name:
                    # User 2 matches
                    dataset_matches.append((true2_name, variant2_name, user2_id))
                    dataset_pairs.append((true2_name, variant2_name, True, user2_id))

                # Add some cross-match tests (should be negative)
                if true1_name and true2_name:
                    dataset_pairs.append((true1_name, variant2_name, False, f"{user1_id}_{user2_id}"))
                    dataset_pairs.append((true2_name, variant1_name, False, f"{user2_id}_{user1_id}"))

        return dataset_pairs, dataset_matches

    # Process each dataset
    social_pairs, social_matches = process_dataset(
        social_df, 'sender_id', 'sender_name', 'receiver_id', 'receiver_name')

    telecom_pairs, telecom_matches = process_dataset(
        telecom_df, 'caller_id', 'caller_name', 'receiver_id', 'receiver_name')

    incident_pairs, incident_matches = process_dataset(
        incident_df, 'reporter_id', 'reporter_name', 'suspect_id', 'suspect_name')

    # Combine all test data
    true_matches = social_matches + telecom_matches + incident_matches
    test_pairs = social_pairs + telecom_pairs + incident_pairs

    # Calculate scores for all test pairs
    scores = []
    for name1, name2, is_match, pair_id in test_pairs:
        score = get_fuzzy_score(name1, name2)
        scores.append({
            'name1': name1,
            'name2': name2,
            'true_match': is_match,
            'score': score,
            'pair_id': pair_id
        })

    scores_df = pd.DataFrame(scores)

    # Evaluate across different thresholds
    thresholds = range(30, 101, 5)  # 50, 55, 60, ..., 100
    results = {}

    for threshold in thresholds:
        # Predict matches based on threshold
        scores_df[f'predicted_{threshold}'] = scores_df['score'] >= threshold

        # Calculate metrics
        true_positives = sum((scores_df['true_match'] == True) & (scores_df[f'predicted_{threshold}'] == True))
        false_positives = sum((scores_df['true_match'] == False) & (scores_df[f'predicted_{threshold}'] == True))
        true_negatives = sum((scores_df['true_match'] == False) & (scores_df[f'predicted_{threshold}'] == False))
        false_negatives = sum((scores_df['true_match'] == True) & (scores_df[f'predicted_{threshold}'] == False))

        # Avoid division by zero
        total = true_positives + false_positives + true_negatives + false_negatives
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        accuracy = (true_positives + true_negatives) / total if total > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        results[threshold] = {
            'threshold': threshold,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'true_negatives': true_negatives,
            'false_negatives': false_negatives,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy,
            'f1_score': f1
        }

    # Find optimal threshold based on F1 score
    results_df = pd.DataFrame(results).T
    optimal_threshold = results_df['f1_score'].idxmax()

    # Get some example results for review
    interesting_examples = {
        'false_positives': scores_df[(scores_df['true_match'] == False) &
                                    (scores_df[f'predicted_{optimal_threshold}'] == True)].head(5),
        'false_negatives': scores_df[(scores_df['true_match'] == True) &
                                    (scores_df[f'predicted_{optimal_threshold}'] == False)].head(5),
        'edge_cases': scores_df[(scores_df['score'] >= optimal_threshold - 10) &
                               (scores_df['score'] <= optimal_threshold + 10)].sample(min(5, len(scores_df)))
    }

    return {
        'metrics': results_df,
        'optimal_threshold': optimal_threshold,
        'scores': scores_df,
        'examples': interesting_examples
    }

# Function to visualize the evaluation results
def visualize_evaluation(evaluation_results):
    """Creates visualizations for the fuzzy matching evaluation."""
    metrics = evaluation_results['metrics']
    scores = evaluation_results['scores']
    optimal_threshold = evaluation_results['optimal_threshold']

    # Create a figure with subplots
    fig, axs = plt.subplots(2, 2, figsize=(18, 12))

    # Plot 1: Precision, Recall, F1 Score vs Threshold
    axs[0, 0].plot(metrics.index, metrics['precision'], 'b-', label='Precision')
    axs[0, 0].plot(metrics.index, metrics['recall'], 'g-', label='Recall')
    axs[0, 0].plot(metrics.index, metrics['f1_score'], 'r-', label='F1 Score')
    axs[0, 0].plot(metrics.index, metrics['accuracy'], 'y-', label='Accuracy')
    axs[0, 0].axvline(x=optimal_threshold, color='k', linestyle='--',
                    label=f'Optimal Threshold: {optimal_threshold}')
    axs[0, 0].set_xlabel('Threshold')
    axs[0, 0].set_ylabel('Score')
    axs[0, 0].set_title('Precision, Recall, F1 Score vs Threshold')
    axs[0, 0].grid(True)
    axs[0, 0].legend()

    # Plot 2: Score Distribution
    axs[0, 1].hist(scores[scores['true_match'] == True]['score'], bins=20, alpha=0.5, label='True Matches')
    axs[0, 1].hist(scores[scores['true_match'] == False]['score'], bins=20, alpha=0.5, label='Non-Matches')
    axs[0, 1].axvline(x=optimal_threshold, color='k', linestyle='--',
                     label=f'Optimal Threshold: {optimal_threshold}')
    axs[0, 1].set_xlabel('Fuzzy Match Score')
    axs[0, 1].set_ylabel('Frequency')
    axs[0, 1].set_title('Distribution of Fuzzy Match Scores')
    axs[0, 1].legend()

    # Plot 3: Confusion Matrix at Optimal Threshold
    cm = confusion_matrix(scores['true_match'], scores[f'predicted_{optimal_threshold}'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=axs[1, 0])
    axs[1, 0].set_xlabel('Predicted')
    axs[1, 0].set_ylabel('Actual')
    axs[1, 0].set_title(f'Confusion Matrix at Threshold {optimal_threshold}')
    axs[1, 0].set_xticklabels(['Non-Match', 'Match'])
    axs[1, 0].set_yticklabels(['Non-Match', 'Match'])

    # Plot 4: ROC Curve
    fpr, tpr, _ = roc_curve(scores['true_match'], scores['score'])
    roc_auc = auc(fpr, tpr)
    axs[1, 1].plot(fpr, tpr, 'b-', label=f'ROC (AUC = {roc_auc:.2f})')
    axs[1, 1].plot([0, 1], [0, 1], 'k--')
    axs[1, 1].set_xlabel('False Positive Rate')
    axs[1, 1].set_ylabel('True Positive Rate')
    axs[1, 1].set_title('ROC Curve')
    axs[1, 1].legend()
    axs[1, 1].grid(True)

    plt.tight_layout()
    plt.savefig('fuzzy_match_evaluation.png')
    plt.close()

    # Additional plot: Performance metrics across thresholds
    plt.figure(figsize=(10, 6))
    plt.plot(metrics.index, metrics['true_positives'], 'g-', label='True Positives')
    plt.plot(metrics.index, metrics['false_positives'], 'r-', label='False Positives')
    plt.plot(metrics.index, metrics['false_negatives'], 'b-', label='False Negatives')
    plt.axvline(x=optimal_threshold, color='k', linestyle='--',
               label=f'Optimal Threshold: {optimal_threshold}')
    plt.xlabel('Threshold')
    plt.ylabel('Count')
    plt.title('Classification Counts vs Threshold')
    plt.grid(True)
    plt.legend()
    plt.savefig('fuzzy_match_counts.png')
    plt.close()

    return "Visualizations created and saved."

# Load datasets
incident_df = pd.read_csv('incident_reports.csv')
social_df = pd.read_csv('social_media.csv')
telecom_df = pd.read_csv('telecom_logs.csv')

evaluation_results = evaluate_fuzzy_matching(social_df, telecom_df, incident_df)
visualize_evaluation(evaluation_results)


'Visualizations created and saved.'