# Exploring SSIS Structure in Memgraph

This notebook focuses on understanding how SSIS components are represented in the graph and how to navigate the SSIS Northwind structure.

## What We'll Learn
- SSIS package structure in graph format
- How operations, tables, and connections are modeled
- Data flow analysis through the graph
- Package dependencies and execution order

In [None]:
# Setup - same as notebook 01
import mgclient
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Connect to Memgraph
# Opens the module-level connection `mg`; execute_query() creates its cursors
# from this object. Host, port and the empty username/password are hard-coded
# for a local instance.
mg = mgclient.connect(host='localhost', port=7687, username='', password='')
print("✅ Connected to Memgraph")

def execute_query(query, description=None):
    """Execute a Cypher query and return the results as a DataFrame.

    Args:
        query: Cypher query string, executed on the module-level ``mg``
            connection.
        description: Optional label. When given, the label and the query
            text are printed before the query runs.

    Returns:
        pandas.DataFrame with one row per returned record. Columns are named
        after ``cursor.description`` when the driver reports it; otherwise a
        single ``'result'`` column is used for non-empty results. When the
        query yields no rows the frame is empty but still carries the
        reported column names, so selecting columns on it does not raise
        ``KeyError``.
    """
    if description:
        print(f"\n🔍 {description}")
        print(f"Query: {query}")
        print("-" * 50)

    cursor = mg.cursor()
    cursor.execute(query)
    results = cursor.fetchall()

    # Read the column names before branching so the empty case can keep them.
    columns = [desc.name for desc in cursor.description] if cursor.description else None

    if not results:
        print("No results found.")
        # columns=None yields a plain empty frame, matching the old behaviour
        # when the driver exposes no column metadata.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(results, columns=columns if columns is not None else ['result'])
    print(f"Found {len(df)} results")
    return df

## Part 1: SSIS Package Analysis

In [None]:
# Get detailed information about SSIS packages
packages_df = execute_query(
    """MATCH (p:Node {node_type: 'pipeline'}) 
       RETURN p.name as package_name, 
              p.id as package_id,
              p.properties as properties
       ORDER BY p.name""",
    "Detailed SSIS package information"
)

print(f"\n📦 Found {len(packages_df)} SSIS packages:")
# An empty result may come back without any columns, so only select columns
# when at least one package was returned.
if not packages_df.empty:
    for package_name in packages_df['package_name']:
        print(f"  • {package_name}")

    display(packages_df[['package_name', 'package_id']].head(10))

In [None]:
# Inspect the first package's properties to see which metadata the graph stores
if not packages_df.empty:
    sample_package = packages_df.iloc[0]
    print(f"\n🔍 Sample Package Properties: {sample_package['package_name']}")
    print("=" * 60)

    try:
        raw_properties = sample_package['properties']
        # Properties may arrive as a JSON string or as an already-decoded mapping.
        if isinstance(raw_properties, str):
            properties = json.loads(raw_properties)
        else:
            properties = raw_properties

        for prop_name, prop_value in properties.items():
            # Long string values are cut to their first 100 characters.
            is_long_text = isinstance(prop_value, str) and len(prop_value) > 100
            shown = f"{prop_value[:100]}..." if is_long_text else prop_value
            print(f"{prop_name}: {shown}")
    except Exception as err:
        print(f"Could not parse properties: {err}")
        print(f"Raw properties: {sample_package['properties']}")

## Part 2: Operations Within Packages

In [None]:
# List every operation together with the package that contains it
operations_query = """MATCH (pkg:Node {node_type: 'pipeline'})-[:CONTAINS]->(op:Node {node_type: 'operation'})
       RETURN pkg.name as package_name, 
              op.name as operation_name,
              op.id as operation_id
       ORDER BY pkg.name, op.name"""
operations_df = execute_query(operations_query, "Operations within SSIS packages")

if not operations_df.empty:
    print("\n⚙️ Operations by Package:")
    for package in operations_df['package_name'].unique():
        belongs_to_package = operations_df['package_name'] == package
        ops = operations_df[belongs_to_package]
        print(f"\n📦 {package} ({len(ops)} operations):")
        # Preview at most the first 5 operations of each package
        for operation_name in ops['operation_name'].head(5):
            print(f"  • {operation_name}")
        hidden_count = len(ops) - 5
        if hidden_count > 0:
            print(f"  ... and {hidden_count} more operations")

display(operations_df.head(15))

In [None]:
# Bar chart and summary statistics of operation counts per package
if not operations_df.empty:
    ops_per_package = (
        operations_df
        .groupby('package_name')
        .size()
        .reset_index(name='operation_count')
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=ops_per_package, x='operation_count', y='package_name', ax=ax)
    ax.set_title('Number of Operations per SSIS Package')
    ax.set_xlabel('Number of Operations')
    ax.set_ylabel('Package Name')
    fig.tight_layout()
    plt.show()

    counts = ops_per_package['operation_count']
    print("\n📊 Operation Count Statistics:")
    print(f"Total Operations: {len(operations_df)}")
    print(f"Average Operations per Package: {counts.mean():.1f}")
    # Row label of the package with the highest operation count
    busiest_row = counts.idxmax()
    busiest_package = ops_per_package.loc[busiest_row, 'package_name']
    print(f"Package with Most Operations: {busiest_package} ({counts.max()} ops)")

## Part 3: Data Assets and Tables

In [None]:
# Explore tables and data assets
tables_df = execute_query(
    """MATCH (t:Node {node_type: 'table'})
       RETURN t.name as table_name, 
              t.id as table_id,
              t.properties as properties
       ORDER BY t.name""",
    "Data assets and tables in the SSIS graph"
)

print(f"\n🗃️ Found {len(tables_df)} tables/data assets:")
# An empty result may come back without any columns, so the column selection
# for display() has to stay inside the non-empty branch.
if not tables_df.empty:
    for table_name in tables_df['table_name'].head(10):
        print(f"  • {table_name}")
    if len(tables_df) > 10:
        print(f"  ... and {len(tables_df) - 10} more tables")

    display(tables_df[['table_name', 'table_id']].head(10))

## Part 4: Data Flow Analysis

In [None]:
# Collect every operation -> table edge that reads from or writes to a table
data_flows_query = """MATCH (op:Node {node_type: 'operation'})-[r:READS_FROM|WRITES_TO]->(table:Node {node_type: 'table'})
       RETURN op.name as operation_name,
              type(r) as flow_type,
              table.name as table_name
       ORDER BY table.name, op.name"""
data_flows_df = execute_query(data_flows_query, "Data flows between operations and tables")

if not data_flows_df.empty:
    print("\n💾 Data Flow Summary:")
    flow_summary = data_flows_df['flow_type'].value_counts()
    for flow_type, count in flow_summary.items():
        print(f"  • {flow_type}: {count} connections")

    # Preview the first 10 flows as "operation --[type]--> table"
    print("\n🔄 Sample Data Flows:")
    for flow in data_flows_df.head(10).itertuples(index=False):
        print(f"  • {flow.operation_name} --[{flow.flow_type}]--> {flow.table_name}")

display(data_flows_df.head(15))

In [None]:
# Tables touched by more than one distinct operation (shared resources)
shared_tables_query = """MATCH (op:Node {node_type: 'operation'})-[r:READS_FROM|WRITES_TO]->(table:Node {node_type: 'table'})
       WITH table, collect(DISTINCT op.name) as operations, count(DISTINCT op) as operation_count
       WHERE operation_count > 1
       RETURN table.name as table_name,
              operation_count,
              operations
       ORDER BY operation_count DESC"""
shared_tables_df = execute_query(
    shared_tables_query,
    "Tables shared by multiple operations (potential bottlenecks)"
)

if not shared_tables_df.empty:
    print("\n⚠️ Shared Tables (Potential Bottlenecks):")
    shared_rows = zip(
        shared_tables_df['table_name'],
        shared_tables_df['operation_count'],
        shared_tables_df['operations'],
    )
    for table_name, operation_count, operation_names in shared_rows:
        print(f"  • {table_name}: used by {operation_count} operations")
        # Name only the first 3 operations; mark the rest with an ellipsis
        preview = ', '.join(operation_names[:3])
        ellipsis = '...' if len(operation_names) > 3 else ''
        print(f"    Operations: {preview}{ellipsis}")
        print()

display(shared_tables_df)

## Part 5: Package Dependencies

In [None]:
# Explicit DEPENDS_ON edges between two pipeline (package) nodes
cross_package_query = """MATCH (pkg1:Node {node_type: 'pipeline'})-[r:DEPENDS_ON]->(pkg2:Node {node_type: 'pipeline'})
       RETURN pkg1.name as dependent_package,
              pkg2.name as dependency_package,
              type(r) as relationship_type
       ORDER BY pkg1.name"""
cross_package_deps_df = execute_query(cross_package_query, "Cross-package dependencies")

if cross_package_deps_df.empty:
    print("\n🔗 No direct cross-package dependencies found.")
    print("This could mean packages are independent or dependencies are implicit through shared resources.")
else:
    print("\n🔗 Cross-Package Dependencies:")
    dependency_pairs = zip(
        cross_package_deps_df['dependent_package'],
        cross_package_deps_df['dependency_package'],
    )
    for dependent, dependency in dependency_pairs:
        print(f"  • {dependent} depends on {dependency}")

    display(cross_package_deps_df)

In [None]:
# Implicit dependencies: one package writes a table that a differently named package reads
implicit_deps_query = """MATCH (pkg1:Node {node_type: 'pipeline'})-[:CONTAINS]->(op1:Node {node_type: 'operation'})-[:WRITES_TO]->(table:Node {node_type: 'table'})<-[:READS_FROM]-(op2:Node {node_type: 'operation'})<-[:CONTAINS]-(pkg2:Node {node_type: 'pipeline'})
       WHERE pkg1.name <> pkg2.name
       RETURN DISTINCT pkg1.name as writer_package,
              pkg2.name as reader_package,
              table.name as shared_table
       ORDER BY shared_table, writer_package"""
implicit_deps_df = execute_query(implicit_deps_query, "Implicit dependencies through shared tables")

if implicit_deps_df.empty:
    print("\n🔄 No implicit dependencies through shared tables found.")
else:
    print("\n🔄 Implicit Dependencies (via shared tables):")
    # Only the first 10 dependencies are listed and displayed
    for dep in implicit_deps_df.head(10).itertuples(index=False):
        print(f"  • {dep.writer_package} writes to {dep.shared_table}, read by {dep.reader_package}")

    display(implicit_deps_df.head(10))

## Part 6: Connection Analysis

In [None]:
# List the connection nodes stored in the graph
connections_query = """MATCH (c:Node {node_type: 'connection'})
       RETURN c.name as connection_name,
              c.id as connection_id,
              c.properties as properties
       ORDER BY c.name"""
connections_df = execute_query(connections_query, "Database connections in the SSIS graph")

print(f"\n🔌 Found {len(connections_df)} connections:")
if connections_df.empty:
    print("No explicit connection nodes found.")
    print("Connections might be embedded in operation properties.")
else:
    for connection_name in connections_df['connection_name']:
        print(f"  • {connection_name}")

    display(connections_df[['connection_name', 'connection_id']])

In [None]:
# Pair each operation with the connection it uses via USES_CONNECTION
op_connections_query = """MATCH (op:Node {node_type: 'operation'})-[r:USES_CONNECTION]->(conn:Node {node_type: 'connection'})
       RETURN op.name as operation_name,
              conn.name as connection_name,
              type(r) as relationship_type
       ORDER BY conn.name, op.name"""
op_connections_df = execute_query(op_connections_query, "Operations using database connections")

if op_connections_df.empty:
    print("\n🔗 No explicit operation-connection relationships found.")
    print("This is normal if connections are embedded within operations.")
else:
    print("\n🔗 Operation-Connection Relationships:")
    # Number of result rows per connection name
    conn_usage = (
        op_connections_df
        .groupby('connection_name')
        .size()
        .reset_index(name='usage_count')
    )

    for connection_name, usage_count in zip(conn_usage['connection_name'], conn_usage['usage_count']):
        print(f"  • {connection_name}: used by {usage_count} operations")

    display(op_connections_df.head(10))

## Part 7: Graph Structure Summary

In [None]:
# Create a comprehensive summary of the SSIS graph structure
print("📊 SSIS Northwind Graph Structure Summary")
print("=" * 50)

# node_type values treated as analytics nodes and left out of the SSIS totals
ANALYTICS_NODE_TYPES = ['materialized_view', 'graph_metadata']

# Node counts by type
node_summary_df = execute_query(
    "MATCH (n) RETURN n.node_type as node_type, count(n) as count ORDER BY count DESC"
)

# Filter once so the printed list, the total and the pie chart all use the
# same rows. An empty result may have no columns, so skip the column lookup.
if node_summary_df.empty:
    ssis_nodes = node_summary_df
else:
    ssis_nodes = node_summary_df[~node_summary_df['node_type'].isin(ANALYTICS_NODE_TYPES)]

print("\n📦 Node Types:")
total_nodes = 0
for _, row in ssis_nodes.iterrows():
    print(f"  • {row['node_type']}: {row['count']} nodes")
    total_nodes += row['count']

# Relationship counts
rel_summary_df = execute_query(
    "MATCH ()-[r]->() RETURN type(r) as relationship_type, count(r) as count ORDER BY count DESC"
)

print("\n🔗 Relationship Types:")
total_relationships = 0
for _, row in rel_summary_df.iterrows():
    print(f"  • {row['relationship_type']}: {row['count']} relationships")
    total_relationships += row['count']

print("\n📈 Totals:")
print(f"  • Total SSIS Nodes: {total_nodes}")
print(f"  • Total Relationships: {total_relationships}")
# Directed-graph density: edges / (n * (n - 1)); undefined for fewer than 2 nodes.
# NOTE(review): total_relationships counts every relationship in the graph,
# while total_nodes excludes the analytics node types. If relationships touch
# those excluded nodes the density is overstated - confirm against the data.
if total_nodes > 1:
    graph_density = total_relationships / (total_nodes * (total_nodes - 1))
    print(f"  • Graph Density: {graph_density:.4f}")
else:
    print("  • Graph Density: N/A")

# Create visualizations (skipped entirely when there is nothing to plot)
if ssis_nodes.empty and rel_summary_df.empty:
    print("\nNo nodes or relationships to visualize.")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Node types pie chart
    ax1.set_title('SSIS Node Types Distribution')
    if ssis_nodes.empty:
        ax1.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax1.axis('off')
    else:
        ax1.pie(ssis_nodes['count'], labels=ssis_nodes['node_type'], autopct='%1.1f%%', startangle=90)

    # Relationship types bar chart
    ax2.set_title('Relationship Types Count')
    if rel_summary_df.empty:
        ax2.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax2.axis('off')
    else:
        sns.barplot(data=rel_summary_df, x='count', y='relationship_type', ax=ax2)
        ax2.set_xlabel('Number of Relationships')

    plt.tight_layout()
    plt.show()

## Next Steps

Now that you understand the SSIS graph structure, you can:

1. **Explore Analytics-Ready Features** - Open `03_analytics_ready_features.ipynb` to learn about materialized views
2. **Advanced Queries** - Open `04_advanced_queries.ipynb` for complex analysis patterns
3. **Migration Analysis** - Open `05_migration_analysis.ipynb` for practical migration scenarios

## Key Takeaways

- SSIS packages are represented as `pipeline` nodes
- Operations within packages are `operation` nodes connected via `CONTAINS` relationships
- Data flows are modeled as `READS_FROM` and `WRITES_TO` relationships from `operation` nodes to `table` nodes
- Shared resources (tables used by multiple operations) can indicate potential bottlenecks
- The graph structure reflects the actual SSIS package architecture