# Step1: Dataset Preparation
Use the graph dataset and load it in readable format in memory

In [126]:
import os
import networkx as nx
import pandas as pd

# Path to extracted files
graphml_dir = 'input/exported_aws_graphs'

# List all graph files
graph_files = [os.path.join(graphml_dir, f) for f in os.listdir(graphml_dir) if f.endswith('.graphml')]

# Initialize data storage
architectures = []

################################ For debugging and checking Graph Data #################################

# Inspect node attributes dynamically
# for graph_file in graph_files[:1]:  # Check just one file for debugging
#     graph = nx.read_graphml(graph_file)
#     for node in graph.nodes:
#         print(f"Node ID: {node}, Attributes: {graph.nodes[node]}")

# Inspect edge attributes dynamically
# for graph_file in graph_files[:1]:  # Check just one file for debugging
#     graph = nx.read_graphml(graph_file)
#     for edge in graph.edges(data=True):
#         print(f"Edge: {edge}")

# Inspect graph attributes dynamically
# for graph_file in graph_files[:1]:  # Check just one file for debugging
#     graph = nx.read_graphml(graph_file)
#     print("Graph Metadata:", graph.graph)

##########################################################################################################

# Process each graph file
for graph_file in graph_files:
    graph = nx.read_graphml(graph_file)
    architecture_id = os.path.basename(graph_file).split('.')[0]

    # Extract node details
    nodes = {
        node: {
            "service": graph.nodes[node].get('service', 'NULL'),
            "human_name": graph.nodes[node].get('human_name', 'NULL'),
            "info": graph.nodes[node].get('info', '')
        }
        for node in graph.nodes
    }

    
    # Map nodes to services
    services = [data['service'] for data in nodes.values()]
    
    # Extract edges with meaningful details
    edges = [
        {
            "source": nodes[src]["service"],
            "target": nodes[dst]["service"],
            "type": edge_data.get('type', 'NULL')
        }
        for src, dst, edge_data in graph.edges(data=True)  # Unpack the edge attributes
    ]
    
    # Add processed architecture
    architectures.append({
        "id": architecture_id,
        "services": services,
        "edges": edges,
        "name": graph.graph.get('name', 'Unnamed Architecture'),
        "link": graph.graph.get('link', ''),
        "categories": graph.graph.get('categories', ''),
        "notes": graph.graph.get('notes', '')
    })

# Convert to DataFrame
architecture_df = pd.DataFrame(architectures)

# Display the processed data
print(architecture_df.head())

# Save to CSV for further analysis
architecture_df.to_csv('output/architectures.csv', index=False)


            id                                           services  \
0  WS2Qgx0qgCM       [EC2, EC2, CloudTrail, OpenSearch, STS, IAM]   
1  AzM_d7ZvzUE  [Lambda, Lambda, Lambda, Kinesis, Kinesis, Aur...   
2  6LcSv9XocTY  [UserConsumerWeb, Connect, S3, S3, Transcribe,...   
3  3yJZ6rPoZfg    [EKS, EKS, VPC, SQS, EC2, S3, UserConsumerEdge]   
4  JRDGId6N49E  [ThirdParty, EC2, S3, S3, CloudFormation, Serv...   

                                               edges  \
0  [{'source': 'EC2', 'target': 'EC2', 'type': 'd...   
1  [{'source': 'Lambda', 'target': 'Kinesis', 'ty...   
2  [{'source': 'UserConsumerWeb', 'target': 'Conn...   
3  [{'source': 'EKS', 'target': 'S3', 'type': 'da...   
4  [{'source': 'ThirdParty', 'target': 'EC2', 'ty...   

                                                name  \
0  Airbnb: Securing MultiTenant Kubernetes Cluste...   
1  Epsagon: Automatically Tracing and Analyzing B...   
2  Intuit: Serving 7 Million Customers Using Amaz...   
3  Hexagon HxDR: Cloud-B

# Step2: Find all the unique AWS services among all the architectures
    - Create a list of all the unique services as all_services and keep appending all the unique services to it

In [127]:
all_services = []
for i in range(len(architectures)):
    for service in architectures[i]['services']:
        if service not in all_services:
            all_services.append(service)

# Remove entries which contains "User" or "ThirdParty" in all_services
# This is done to remove the user created services from the list
all_services = [service for service in all_services if "User" not in service]
all_services = [service for service in all_services if "ThirdParty" not in service]

all_services = sorted(all_services)

# Save all_services to a csv file
all_services_df = pd.DataFrame(all_services)
all_services_df.to_csv('output/aws_services.csv', index=False)

In [128]:
# Define the list of services (that are not AWS services) to be removed
extra_services = {'CouchBase', 'MongoDBAtlas', 'OnPremDC', 'SAP', 'ServiceNow'}

# Remove the extra services
all_services = [service for service in all_services if service not in extra_services]

# Save the cleaned list of services to a CSV file
all_services_df = pd.DataFrame(all_services)
all_services_df.to_csv('output/aws_services.csv', index=False)

In [129]:
architectures_with_extra_services = []

# Process each graph file
for graph_file in graph_files:
    graph = nx.read_graphml(graph_file)
    architecture_id = os.path.basename(graph_file).split('.')[0]

    # Extract node details
    nodes = {
        node: {
            "service": graph.nodes[node].get('service', 'NULL'),
            "human_name": graph.nodes[node].get('human_name', 'NULL'),
            "info": graph.nodes[node].get('info', '')
        }
        for node in graph.nodes
    }

    # Map nodes to services
    services = [data['service'] for data in nodes.values()]
    
    # Check if any extra service is present in this architecture
    if any(service in extra_services for service in services):
        # Add architecture information to the list
        architectures_with_extra_services.append({
            "id": architecture_id,
            "name": graph.graph.get('name', 'Unnamed Architecture'),
            "extra_services": [service for service in services if service in extra_services]
        })

# Convert the results to a DataFrame
arch_df = pd.DataFrame(architectures_with_extra_services)

# Print the DataFrame with architectures containing extra services
print(arch_df)

            id                                               name  \
0  QnwfcDZkwh8  Mediaset: Achieving an Omni-Channel Broadcasti...   
1  -3lnf5lzsH0           MakeMyTrip: Building Next Generation SOC   
2  gD8pzUnXgsU  Liberty Mutual: Moving Towards RealTime Financ...   
3  CsD5bmM6mpY  Linke - a Syntax Company: Building a Business ...   
4  HcmEFZukA-Y  7-Eleven: Innovation with Serverless for Cash-...   
5  9LhiUsg3knw  Musixmatch: How Content is Generated by Users ...   
6  wtl7CrSQnHA  Fresenius Medical Care: Enabling Patient Care ...   
7  AS2JeM2FUzE  Intellect Design Arena: Insurance Risk Assessm...   
8  XmIhHtPJWog  HDI Group: Automated Security and Compliance I...   
9  iwZLadQ3XpY  Riot Games: A new data ingest pipeline for lea...   

   extra_services  
0      [OnPremDC]  
1      [OnPremDC]  
2      [OnPremDC]  
3           [SAP]  
4  [MongoDBAtlas]  
5     [CouchBase]  
6      [OnPremDC]  
7  [MongoDBAtlas]  
8    [ServiceNow]  
9      [OnPremDC]  


## Usage Count of each unique AWS service across all the cloud architectures

In [130]:
# Initialize a dictionary to store the usage count for each service
service_usage_count = {service: 0 for service in all_services}

# Calculate the usage count of each service across all architectures
for architecture in architectures:
    for service in set(architecture['services']):
        if service in service_usage_count:
            service_usage_count[service] += 1

# Convert the usage count dictionary to a DataFrame
service_usage_df = pd.DataFrame(
    list(service_usage_count.items()), 
    columns=["Service", "Usage Count"]
)

# Calculate the percentage of architectures for each service
service_usage_df["Percentage"] = (
    service_usage_df["Usage Count"] / len(architectures) * 100
).round(2)  # Round to 2 decimal places for clarity

# Sort the DataFrame by usage count in descending order
service_usage_df = service_usage_df.sort_values(by="Usage Count", ascending=False)

# Save the DataFrame to a CSV file
service_usage_df.to_csv('output/aws_service_usage.csv', index=False)

# Display the top entries of the DataFrame
print(service_usage_df)

                  Service  Usage Count  Percentage
106                    S3          253       63.89
75                 Lambda          220       55.56
44                    EC2          151       38.13
41               DynamoDB          123       31.06
9              ApiGateway           96       24.24
..                    ...          ...         ...
60                Grafana            1        0.25
53          ElementalLive            1        0.25
112  SageMakerGroundTruth            1        0.25
88          ModelRegistry            1        0.25
0                     ACM            1        0.25

[134 rows x 3 columns]


In [None]:
target_services = ['WAF', 'AWSConfig', 'Neptune', 'RoboMaker', 'StorageGateway', 'Inspector', 'Timestream', 'SES']

# Calculate the total number of architectures affected by the target services
affected_architectures = set()

for architecture in architectures:
    # Check if any of the target services are in the current architecture
    if any(service in target_services for service in architecture['services']):
        affected_architectures.add(architecture['id'])

affected_architectures_df = pd.DataFrame(affected_architectures)
print(len(affected_architectures))

40


## Architectures with S3 and Lambda

In [137]:
# List to store architectures containing both 'S3' and 'Lambda'
architectures_with_s3_lambda = []

for _, row in architecture_df.iterrows():
    # Check if both 'S3' and 'Lambda' are in the current architecture's services
    if 'S3' in row['services'] and 'Lambda' in row['services'] and 'DynamoDB' in row['services']:
        architectures_with_s3_lambda.append({
            "id": row['id'],
            "name": row['name'],
            "services": row['services']
        })

# Convert the architectures list to a DataFrame
architectures_with_s3_lambda_df = pd.DataFrame(architectures_with_s3_lambda)

# Add a column for the number of services in each architecture
architectures_with_s3_lambda_df['service_count'] = architectures_with_s3_lambda_df['services'].apply(len)

# Sort the DataFrame by the service count in descending order
architectures_with_s3_lambda_df = architectures_with_s3_lambda_df.sort_values(by='service_count', ascending=False)

# Display the sorted architectures with 'S3' and 'Lambda'
print(architectures_with_s3_lambda_df.head())

# Save to CSV for further analysis
architectures_with_s3_lambda_df.to_csv('output/architectures_with_s3_lambda.csv', index=False)

             id                                               name  \
48  dy-drIboyNA  Healthdirect Australia: Using AWS to Connect P...   
60  hFx0EF9KEZU  Buzzvil: Segment Processing and Provision for ...   
51  PoYiSKUy8sE  Alert Logic: Scaling Storage and Delivery of P...   
62  l0hlxVNmJPI  eCloudvalley: Build Up An EnterpriseLevel IoT ...   
7   F4KDOGNpSoI  Pernod Ricard: Shortening URLs on a Global Sca...   

                                             services  service_count  
48  [UserConsumerAPI, UserConsumerAPI, S3, S3, Api...             15  
60  [UserConsumerWebMobile, EKS, EKS, ThirdParty, ...             15  
51  [SQS, StepFunctions, ECS, ECS, Lambda, Lambda,...             14  
62  [UserConsumerMobile, UserConsumerEdge, IoTCore...             14  
7   [CloudFront, LambdaAtEdge, DynamoDB, S3, S3, L...             14  


# Step3: Load the compatibitliy matrix from the JSON file
    - Mapping AWS services to Azure and GCP counterparts
    - This needs to be researched and created manually
    - One good reference is : https://github.com/milanm/Cloud-Product-Mapping

# Example of service_mapping:
```python
service_mapping = {
    "S3": {"Azure": "Blob Storage", "GCP": "Cloud Storage", "Compatibility": "Fully Compatible"},
    "Lambda": {"Azure": "Functions", "GCP": "Cloud Functions", "Compatibility": "Fully Compatible"},
    "Athena": {"Azure": None, "GCP": None, "Compatibility": "Incompatible"},
    "DynamoDB": {"Azure": "Cosmos DB", "GCP": "Firestore", "Compatibility": "Partially Compatible"},
    # Add more mappings as needed
}
```

In [114]:
import json
with open('input/compatibility_matrix_2.json', 'r') as file:
    service_mapping = json.load(file)

# Convert mapping to DataFrame
mapping_df = pd.DataFrame.from_dict(service_mapping, orient='index')
mapping_df.index.name = "AWS"
print(mapping_df)

# Save mapping for later reference
mapping_df.to_csv('output/service_mapping.csv')


                                       Azure                    GCP  \
AWS                                                                   
ACM                 App Service Certificates    Certificate Manager   
ALB                      Application Gateway   Cloud Load Balancing   
AMI             Azure Virtual Machine Images          Custom Images   
AWSConfig                       Azure Policy  Cloud Asset Inventory   
AccessAnalyzer                 Azure Monitor       Cloud Audit Logs   
...                                      ...                    ...   
VPCPeering           Virtual Network Peering            VPC Peering   
VPN                              VPN Gateway              Cloud VPN   
WAF                 Web Application Firewall            Cloud Armor   
WorkSpaces             Azure Virtual Desktop                   None   
XRay                    Application Insights            Cloud Trace   

                       Compatibility  
AWS                                  

# Step4: Compute the lock-in index for all the architectures
It computes how many services in each architectures are incompatible or partially compatible

In [106]:
# Calcualte Lock-In Index
# Define weights
weights = {
    'Fully Compatible': 0,
    'Partially Compatible': 0.5,
    'Incompatible': 1
}

# Merge architectures with service mapping
# def compute_lock_in_index(services):
#     proprietary_count = sum(1 for s in services if service_mapping.get(s, {}).get("Compatibility") != "Fully Compatible")
#     return proprietary_count / len(services) if services else 0


# Function to calculate Lock-In Index
def compute_lock_in_index(services):
    total_weight = 0
    for service in services:
        compatibility = service_mapping.get(service, {}).get('Compatibility', '')
        total_weight += weights.get(compatibility, 0)
    return total_weight / len(services) if services else 0

# # Add Lock-In Index for each architecture
architecture_df['lock_in_index'] = architecture_df['services'].apply(compute_lock_in_index)

# Categorize architectures by lock-in level
def categorize_lock_in(index):
    if index >= 0.07:
        return "High Lock-In"
    elif index >= 0.04:
        return "Moderate Lock-In"
    else:
        return "Low Lock-In"

architecture_df['lock_in_category'] = architecture_df['lock_in_index'].apply(categorize_lock_in)
print(architecture_df[['id', 'lock_in_index', 'lock_in_category']])

# Save updated dataset
architecture_df.to_csv('output/lock_in_analysis.csv', index=False)


              id  lock_in_index  lock_in_category
0    WS2Qgx0qgCM       0.083333      High Lock-In
1    AzM_d7ZvzUE       0.000000       Low Lock-In
2    6LcSv9XocTY       0.045455  Moderate Lock-In
3    3yJZ6rPoZfg       0.000000       Low Lock-In
4    JRDGId6N49E       0.000000       Low Lock-In
..           ...            ...               ...
391  ItpY_KKR94k       0.062500  Moderate Lock-In
392  NfUwtK8ALtw       0.000000       Low Lock-In
393  0wnNlOg42dc       0.000000       Low Lock-In
394  chQ1phTqvnY       0.000000       Low Lock-In
395  aOZ4H98XROc       0.000000       Low Lock-In

[396 rows x 3 columns]


# Step5: Further Analysis:
    - Aanalyze which services contributes most to the lock-in in different architectures.
    - Assess migration feasibility
    - Visualization

In [107]:
# Identify most frequent proprietary AWS services
service_counts = pd.Series([s for services in architecture_df['services'] for s in services]).value_counts()
critical_services = service_counts[service_counts.index.isin([s for s, v in service_mapping.items() if v["Compatibility"] != "Fully Compatible"])]
print("Critical Proprietary Services:")
print(critical_services)

# Analyze impact on architectures
critical_architectures = architecture_df[architecture_df['services'].apply(lambda x: any(s in critical_services.index for s in x))]
print("Architectures Affected by Critical Services:")
print(critical_architectures[['id', 'services']])


Critical Proprietary Services:
OpenSearch           39
LambdaAtEdge         11
AppSync              10
ControlTower          8
Connect               7
AWSConfig             7
Neptune               7
SystemsManager        6
MediaPackage          4
SES                   4
MediaLive             3
Chime                 2
GlobalAccelerator     2
ElementalLive         2
WorkSpaces            2
StorageGateway        2
AlexaForBusiness      2
Timestream            2
MediaConnect          2
RoboMaker             1
MAM                   1
CouchBase             1
AccessAnalyzer        1
DeepLens              1
DataExchange          1
Grafana               1
KinesisVideo          1
AppStream             1
Name: count, dtype: int64
Architectures Affected by Critical Services:
              id                                           services
0    WS2Qgx0qgCM       [EC2, EC2, CloudTrail, OpenSearch, STS, IAM]
2    6LcSv9XocTY  [UserConsumerWeb, Connect, S3, S3, Transcribe,...
9    ww5fiygF6eg  [Use

In [108]:
# Evaluate ease of migration for each architecture
def assess_migration(services):
    incompatible_services = [s for s in services if service_mapping.get(s, {}).get("Compatibility") == "Incompatible"]
    if incompatible_services:
        return f"Re-engineer ({len(incompatible_services)} services)"
    return "Replace or Retain"

architecture_df['migration_strategy'] = architecture_df['services'].apply(assess_migration)
print(architecture_df[['id', 'migration_strategy']])

# Save migration analysis
architecture_df.to_csv('output/migration_analysis.csv', index=False)


              id migration_strategy
0    WS2Qgx0qgCM  Replace or Retain
1    AzM_d7ZvzUE  Replace or Retain
2    6LcSv9XocTY  Replace or Retain
3    3yJZ6rPoZfg  Replace or Retain
4    JRDGId6N49E  Replace or Retain
..           ...                ...
391  ItpY_KKR94k  Replace or Retain
392  NfUwtK8ALtw  Replace or Retain
393  0wnNlOg42dc  Replace or Retain
394  chQ1phTqvnY  Replace or Retain
395  aOZ4H98XROc  Replace or Retain

[396 rows x 2 columns]
