In [1]:
import networkx as nx
import json
import pandas as pd
import plotly.express as px

In [14]:
def _first_entity_neighbor(G, neighbor_ids):
    """
    Finds the first neighbour whose node attributes mark it as an Entity.

    Args:
        G (networkx graph): Graph holding the node attributes.
        neighbor_ids (iterable): Candidate node ids, examined in iteration order.

    Returns:
        tuple: (label, attrs) where label is the entity's 'label' attribute
               ('' if missing) and attrs is a shallow copy of its attribute
               dict; (None, {}) when no candidate has type 'Entity'.
    """
    for neighbor_id in neighbor_ids:
        attrs = G.nodes[neighbor_id]
        if attrs.get('type') == 'Entity':
            # Copy so callers cannot mutate the graph's own attribute dict
            return attrs.get('label', ''), attrs.copy()
    return None, {}


def load_and_organize_communications(graph_path):
    """
    Loads a node-link graph from a JSON file and returns one record per
    Communication event, enriched with its source and target entity.

    A node counts as a communication when its 'type' is 'Event' and its
    'sub_type' is 'Communication'. The source is the first predecessor of
    type 'Entity' and the target is the first successor of type 'Entity';
    only one of each is kept per communication.

    Args:
        graph_path (str): Path to the JSON file containing the graph data.

    Returns:
        list: A list of dictionaries, one per communication event, with keys
              'id', 'timestamp', 'content', 'source', 'target',
              'source_attrs' and 'target_attrs'. 'source'/'target' are None
              and the matching attrs dict is empty when no entity was found.
    """
    # Read the file as UTF-8 explicitly instead of relying on the platform's
    # default text encoding
    with open(graph_path, encoding="utf-8") as f:
        json_data = json.load(f)

    G = nx.json_graph.node_link_graph(json_data, edges="edges")
    communications = []

    for node_id, node_data in G.nodes(data=True):
        # Skip everything that is not a Communication event
        if node_data.get('type') != 'Event' or node_data.get('sub_type') != 'Communication':
            continue

        # Incoming edges give the sender, outgoing edges give the recipient
        source, source_attrs = _first_entity_neighbor(G, G.predecessors(node_id))
        target, target_attrs = _first_entity_neighbor(G, G.successors(node_id))

        communications.append({
            'id': node_id,
            'timestamp': node_data.get('timestamp'),
            'content': node_data.get('content', ''),
            # Basic source and target information
            'source': source,
            'target': target,
            # Full attribute dicts of the two entities
            'source_attrs': source_attrs,
            'target_attrs': target_attrs,
        })

    return communications

# Build the communication records from the MC3 graph and preview the first one
graph_path = "../data/MC3_graph.json"
communications_data = load_and_organize_communications(graph_path)
first_communication = communications_data[0]
print(first_communication)

{'id': 'Event_Communication_1', 'timestamp': '2040-10-01 08:09:00', 'content': "Hey The Intern, it's The Lookout! Just spotted a pod of dolphins near the eastern point this morning. They were so playful! If you're free this weekend, the migratory birds are starting to arrive too. Let me know if you want to join for some birdwatching!", 'source': 'The Lookout', 'target': 'The Intern', 'source_attrs': {'type': 'Entity', 'label': 'The Lookout', 'name': 'The Lookout', 'sub_type': 'Person'}, 'target_attrs': {'type': 'Entity', 'label': 'The Intern', 'name': 'The Intern', 'sub_type': 'Person'}}


In [11]:
# Number of communications each source sent per clock hour, over the whole period
df = pd.DataFrame(communications_data)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Truncate every timestamp to the start of its hour. Lowercase 'h' replaces
# the 'H' alias, which pandas reports as deprecated.
df['time_window'] = df['timestamp'].dt.floor('h')

# Count messages per (source, hour) pair
aggregated_df = df.groupby(['source', 'time_window']).size().reset_index(name='count')

# One column per source, one row per hour; a source that was silent in an
# hour gets 0 instead of NaN
pivot_df = aggregated_df.pivot(index='time_window', columns='source', values='count').fillna(0)

# Plotly Express looks labels up by column name. The index passed as x is
# named 'time_window', so that key is labelled as well; 'x' is kept in case
# Plotly falls back to the argument name.
fig = px.line(pivot_df, x=pivot_df.index, y=pivot_df.columns,
                labels={'x': 'Time', 'time_window': 'Time',
                        'value': 'Number of Communications'},
                title='Communications per Source over Time (1h)')

fig.update_layout(
    xaxis=dict(showgrid=True, gridwidth=1),
    yaxis=dict(showgrid=True, gridwidth=1),
    font=dict(size=12)
)

fig.show()


'H' is deprecated and will be removed in a future version, please use 'h' instead.



In [12]:
# Number of communications each source sent per hour of day, pooled over all dates
df = pd.DataFrame(communications_data)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Extract the hour of day for each timestamp
df['hour_of_day'] = df['timestamp'].dt.hour

# Group by source and hour of day, then count occurrences
aggregated_df = df.groupby(['source', 'hour_of_day']).size().reset_index(name='count')

# Pivot the DataFrame to have hours as index and sources as columns;
# source/hour combinations with no messages become 0
pivot_df = aggregated_df.pivot(index='hour_of_day', columns='source', values='count').fillna(0)

# Create a line plot using Plotly Express. Labels are looked up by column
# name, and the index passed as x is named 'hour_of_day', so that key is
# labelled as well; 'x' is kept in case Plotly falls back to the argument name.
fig = px.line(pivot_df, x=pivot_df.index, y=pivot_df.columns,
                labels={'x': 'Hour of Day', 'hour_of_day': 'Hour of Day',
                        'value': 'Number of Communications'},
                title='Communications per Source by Hour of Day (1H Windows)')

# Update the layout for better readability
fig.update_layout(
    xaxis=dict(showgrid=True, gridwidth=1),
    yaxis=dict(showgrid=True, gridwidth=1),
    font=dict(size=12)
)

# Display the plot
fig.show()

In [None]:
# Rebuild the frame, parse the timestamps, then derive the calendar date and
# the hour of day from them
df = (
    pd.DataFrame(communications_data)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .assign(
        date=lambda frame: frame['timestamp'].dt.date,
        hour_of_day=lambda frame: frame['timestamp'].dt.hour,
    )
)

# Message count for every (date, source, hour) combination that occurs
aggregated_df = (
    df.groupby(['date', 'source', 'hour_of_day'])
    .size()
    .reset_index(name='count')
)

# One facet row per date, one coloured line per source
fig = px.line(
    aggregated_df,
    x='hour_of_day',
    y='count',
    color='source',
    facet_row='date',
    labels=dict(hour_of_day='Hour of Day', count='Number of Communications'),
    title='Daily Communication Patterns by Source (1H Windows)',
)

# Fixed canvas size; the height leaves room for the stacked facet rows
fig.update_layout(height=1000, width=800, font={'size': 12})

# Render the figure
fig.show()

In [20]:
from plotly.validators.scatter.marker import SymbolValidator

# Create DataFrame from the communication records
df = pd.DataFrame(communications_data)

# Extract the sender's sub_type from source_attrs; records whose attribute
# dict has no 'sub_type' (including the empty dict) fall back to 'Unknown'
df['sub_type'] = df['source_attrs'].apply(lambda attrs: attrs.get('sub_type', 'Unknown'))

# Process timestamp to extract date, month_day ("MM-DD"), and hour_of_day
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.date
df['hour_of_day'] = df['timestamp'].dt.hour
df['month_day'] = df['timestamp'].dt.strftime('%m-%d')

# Generate available symbols. The validator's values are read in groups of
# three with the symbol name in the third slot, so take every third entry
# starting at index 2 to get the names.
raw_symbols = SymbolValidator().values
symbols_list = list(raw_symbols[2::3])

# Aggregate data
aggregated_df = df.groupby(['date', 'month_day', 'source', 'sub_type', 'hour_of_day']).size().reset_index(name='count')

# Assign symbols to sources within each subtype; numbering restarts for every
# subtype and wraps around if a subtype has more sources than there are symbols
unique_subtypes = aggregated_df['sub_type'].unique()

source_symbol_map = {}
for st in unique_subtypes:
    sources_in_st = aggregated_df.loc[aggregated_df['sub_type'] == st, 'source'].unique()
    for i, source in enumerate(sources_in_st):
        source_symbol_map[(st, source)] = symbols_list[i % len(symbols_list)]

# Map symbols to the DataFrame
aggregated_df['symbol'] = [
    source_symbol_map.get((st, source), 'circle')
    for st, source in zip(aggregated_df['sub_type'], aggregated_df['source'])
]

# Plotly Express expects a mapping keyed by the value of the symbol column,
# i.e. by source alone
symbol_by_source = {source: symbol for (_, source), symbol in source_symbol_map.items()}

# Create scatter plot with custom symbols and colors
subtype_colors = {subtype: color for subtype, color in zip(unique_subtypes, px.colors.qualitative.Alphabet[:len(unique_subtypes)])}

# Symbols are assigned through symbol/symbol_map so that each source gets its
# own marker. Calling fig.update_traces(marker=...) once per source without a
# selector rewrites every trace each time, leaving all markers with the
# symbol of the last source.
fig = px.scatter(aggregated_df, x='hour_of_day', y='count',
                    facet_row='month_day',
                    color='sub_type',
                    symbol='source',
                    symbol_map=symbol_by_source,
                    labels={'hour_of_day': 'Hour of Day',
                        'count': 'Number of Communications',
                        'sub_type': 'Source Type',
                        'source': 'Source'},
                    title='Daily Communication Patterns by Source Type and Entity',
                    opacity=0.8,
                    color_discrete_map=subtype_colors)

# Customize layout and appearance
fig.update_layout(
    showlegend=True,
    margin=dict(l=40, r=40, t=100, b=40),
    height=800,
    width=600
)

# Remove y-axis grid lines
fig.update_yaxes(showgrid=False)

# Show the plot
fig.show()