In [None]:
# Document to analyse: either a PDF or a CSV of sentences (e.g. with a "description" column)
DOCUMENT_PATH = "RFP_TEST.csv"

# CSV export of the Cloud Requirements Ontology
ONTOLOGY_PATH = "/content/gdrive/Shareddrives/NextGen Cloud Service Broker/Service Specification/1 - Cloud Ontology Phase/Cloud Requirements Ontology - Ontology.csv"

# Location of the saved model state to load.
# NOTE(review): this is a relative path (unlike ONTOLOGY_PATH), so it resolves against the
# current working directory — confirm it lines up with the Drive mount point.
MODEL_PATH = './gdrive/MyDrive/TESI/COLAB/MODELS/BERT-UNCASED/42_16_3.pt'

# **Setup**

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (forcing a remount); the ontology path above lives there
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Install the third-party packages used further down in this notebook
!pip install transformers spacy pdfminer.six xlsxwriter reportlab

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xlsxwriter
  Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting reportlab
  Downloading reportlab-4.1.0-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xlsxwriter, reportlab, pdfminer.six
Successfully installed pdfminer.six-20231228 reportlab-4.1.0 xlsxwriter-3.1.9


In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertConfig
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pick the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report which GPU was picked up
    print(f"GPU disponibile: {torch.cuda.get_device_name(0)}")
device

device(type='cpu')

In [None]:
# Label names of the multi-label classifier. The prediction loop zips this list with the
# model's output probabilities, so the order matters.
# NOTE(review): presumably this must match the label order used at training time — confirm.
target_list = ['compute', 'data handling', 'network', 'security & compliance', 'management & monitoring', 'cloud service essentials']

In [None]:
# Initialize the tokenizer from the Transformers library
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Alternatively, load a tokenizer previously saved on Drive
#tokenizer = BertTokenizer.from_pretrained('/content/gdrive/Shareddrives/NextGen Cloud Service Broker/Service Specification/3 - Machine Learning Phase/tokenizer_bert_base_uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label classification head.

    The submodule names (`bert_model`, `dropout`, `linear`) are part of the saved
    state-dict layout and must not be renamed.

    Args:
        num_labels (int): Number of output logits (one per label). Defaults to 6.
        dropout_rate (float): Dropout probability applied to the pooled output. Defaults to 0.3.
        pretrained_model_name (str): Name or path passed to `BertModel.from_pretrained`.
            Defaults to 'bert-base-uncased'.
    """

    def __init__(self, num_labels=6, dropout_rate=0.3, pretrained_model_name='bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout_rate)
        # Take the head's input width from the encoder config instead of hard-coding 768,
        # so the head still fits if another pretrained model is selected.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of tokenized inputs."""
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from the pooled sequence representation
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

In [None]:
def load_model_state(model, saved_model_path, device):
    """
    Move `model` to `device` and populate it with a saved state dictionary.

    Args:
    model (torch.nn.Module): Architecture that receives the saved weights.
    saved_model_path (str): Location of the file holding the saved state dictionary.
    device (torch.device): Target device; also used as `map_location` when loading.

    Returns:
    torch.nn.Module: The same model, now on `device` and carrying the loaded weights.
    """
    # Place the model on the target device first
    model = model.to(device)

    # Read the state dictionary, remapping its tensors onto the same device.
    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
    state_dict = torch.load(saved_model_path, map_location=device)
    model.load_state_dict(state_dict)

    return model

# Initialize BERTClass model (its constructor loads the pretrained 'bert-base-uncased' encoder)
loaded_model = BERTClass()

# Load the saved model state at MODEL_PATH into the model, on the selected device
loaded_model = load_model_state(loaded_model, MODEL_PATH, device)

# Set the model to evaluation mode, ready for inference (i.e., making predictions on new data).
# As the last expression of the cell, this also displays the module structure.
loaded_model.eval()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
def test_model(sentence, model, tokenizer, max_len, device):
    # Tokenize the input sentence
    inputs = tokenizer.encode_plus(
        sentence, None, add_special_tokens=True, max_length=max_len,
        padding='max_length', return_token_type_ids=True, truncation=True,
        return_attention_mask=True, return_tensors='pt'
    )

    ids = inputs['input_ids'].to(device, dtype=torch.long)
    mask = inputs['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = inputs['token_type_ids'].to(device, dtype=torch.long)

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        outputs = model(ids, mask, token_type_ids)
        print(outputs)
        predictions = torch.sigmoid(outputs).cpu().detach().numpy()
        print(predictions)
    return predictions.tolist()

In [None]:
import spacy

# Load the spaCy language model; `nlp` is used later in this notebook for sentence
# segmentation of PDF text and for tokenizing sentences before stemming
nlp = spacy.load("en_core_web_sm")

# **1) Load Document File (CSV or PDF)**

## Load an existing CSV (with "description" column)..

In [None]:
# Load the CSV file. Skipped when DOCUMENT_PATH is not a CSV, so that running every cell
# in order works for both supported input types (the PDF branch follows below).
if DOCUMENT_PATH.lower().endswith('.csv'):
    df = pd.read_csv(DOCUMENT_PATH)
    display(df)

Unnamed: 0,source,description
0,https://cispe.cloud/website_cispe/wp-content/u...,CISPE (Cloud Infrastructure Services Providers...
1,https://cispe.cloud/website_cispe/wp-content/u...,We represent cloud infrastructure service prov...
2,https://cispe.cloud/website_cispe/wp-content/u...,Our growing membership includes companies oper...
3,https://cispe.cloud/website_cispe/wp-content/u...,The association is open to companies provided ...
4,https://cispe.cloud/website_cispe/wp-content/u...,Our members deliver and maintain the essential...
...,...,...
168,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to track cost per appl...
169,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to set budgets and rel...
170,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to configure maintenan...
171,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to centralise logging ...


## ..or load a PDF to extract sentences and generate a Dataframe

https://ashutoshtripathi.com/2020/05/04/how-to-perform-sentence-segmentation-or-sentence-tokenization-using-spacy-nlp-series-part-5/

In [None]:
# pdfminer.six is already installed by the Setup cell; repeated here so this section can run on its own
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/5.6 MB[0m [31m4.9 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/5.6 MB[0m [31m29.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.6/5.6 MB[0m [31m55.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228


In [None]:
from pdfminer.high_level import extract_text

def extract_sentences_from_pdf(pdf_path):
    """Extract the text of a PDF and split it into cleaned sentences.

    Args:
        pdf_path: Path of the PDF, passed to pdfminer's `extract_text`.

    Returns:
        list: Sentence strings with newlines replaced by spaces and surrounding
        whitespace removed; sentences of 3 characters or fewer are dropped.
    """
    raw_text = extract_text(pdf_path)

    # Drop lines made only of digits (e.g. identifiers or page/heading numbers)
    cleaned_text = re.sub(r'^\d+\n', '', raw_text, flags=re.MULTILINE)

    # Let spaCy segment the cleaned text into sentences
    sentences = []
    for span in nlp(cleaned_text).sents:
        # Keep only sentences longer than 3 characters once stripped
        if len(span.text.strip()) <= 3:
            continue
        sentences.append(span.text.replace('\n', ' ').strip())

    return sentences

def sentences_to_dataframe(sentences):
    """Wrap a list of sentences in a one-column DataFrame named "description"."""
    frame = pd.DataFrame(data=sentences, columns=["description"])
    return frame


# Only run the PDF extraction when DOCUMENT_PATH actually points to a PDF. Previously this
# ran unconditionally, so with a CSV input it tried to parse the CSV as a PDF and
# overwrote the DataFrame loaded in the CSV section.
if DOCUMENT_PATH.lower().endswith('.pdf'):
    sentences = extract_sentences_from_pdf(DOCUMENT_PATH)
    df = sentences_to_dataframe(sentences)

df

# **2) BERT Multi-Label Sentence Classification**

## **Sentences Preprocessing**

In [None]:
def clean_sentence(sentence):
    """Trim a sentence, drop trailing periods and rename "bidder" to "provider".

    Whitespace is stripped before and after removing the trailing '.' characters.
    "bidder" becomes "provider"; both "Bidder" and "BIDDER" become "Provider".
    """
    cleaned = sentence.strip().rstrip('.').strip()
    replacements = (("bidder", "provider"), ("Bidder", "Provider"), ("BIDDER", "Provider"))
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned

# Clean every sentence of the "description" column (overwriting it) before classification
df['description'] = df['description'].apply(clean_sentence)
df

In [None]:
# Sanity check on one example sentence ("frase" is Italian for "sentence");
# test_model prints the raw logits followed by the sigmoid probabilities
frase = "Demonstrate a multi-region deployment of a web application including a globally replicated database"
a = test_model(frase, loaded_model, tokenizer, 300, device)

tensor([[-1.1232,  2.2536, -3.4835, -3.5699, -2.2899, -0.3228]])
[[0.24542286 0.904959   0.02978455 0.02738676 0.09196652 0.41999277]]


## **Get Predictions**

In [None]:
# Takes the DataFrame with a "description" column and adds the binary predictions of the ML model

# Threshold for binary classification
threshold = 0.20

# Convert probabilities to binary values
def to_binary(probability, threshold):
    """Return 1 when `probability` reaches `threshold`, otherwise 0."""
    return 1 if probability >= threshold else 0

# Dictionary to store results: one list of 0/1 values per label
binary_classification = {label: [] for label in target_list}
goal_column = []  # To store values for the "goal" column

# Apply model over each description
for description in df['description']:
    # test_model takes (sentence, model, tokenizer, max_len, device); the label list is
    # not one of its parameters, and passing it as a sixth argument raised a TypeError.
    predictions = test_model(description, loaded_model, tokenizer, 300, device)

    # The result is nested (one row per input sentence); take the row for this sentence
    predicted_probabilities = predictions[0]

    # Convert probabilities to binary values and store them
    at_least_one_category = False
    for label, prob in zip(target_list, predicted_probabilities):
        binary_value = to_binary(prob, threshold)
        binary_classification[label].append(binary_value)
        if binary_value == 1:
            at_least_one_category = True

    # "goal" is 1 when the sentence belongs to at least one category
    goal_column.append(1 if at_least_one_category else 0)

# New DataFrame for binary classification results, sharing df's index so that the
# column-wise concat below lines rows up even if df's index is not 0..n-1
binary_df = pd.DataFrame(binary_classification, index=df.index)
binary_df['goal'] = goal_column  # Add the "goal" column

# Drop prediction columns left by a previous run of this cell, so that re-running it
# replaces them instead of appending duplicate columns
df = pd.concat([df.drop(columns=list(binary_df.columns), errors='ignore'), binary_df], axis=1)

# Save CSV
#df.to_csv(f'./predicted.csv', index=False)

df

Unnamed: 0,source,description,compute,data handling,network,security & compliance,management & monitoring,cloud service essentials,goal
0,https://cispe.cloud/website_cispe/wp-content/u...,CISPE (Cloud Infrastructure Services Providers...,0,0,0,0,0,0,0
1,https://cispe.cloud/website_cispe/wp-content/u...,We represent cloud infrastructure service prov...,0,0,0,0,0,0,0
2,https://cispe.cloud/website_cispe/wp-content/u...,Our growing membership includes companies oper...,0,0,0,0,0,0,0
3,https://cispe.cloud/website_cispe/wp-content/u...,The association is open to companies provided ...,0,0,0,1,0,0,1
4,https://cispe.cloud/website_cispe/wp-content/u...,Our members deliver and maintain the essential...,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
168,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to track cost per appl...,0,0,0,0,0,0,0
169,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to set budgets and rel...,0,0,0,0,1,0,1
170,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to configure maintenan...,0,0,0,0,0,0,0
171,https://cispe.cloud/website_cispe/wp-content/u...,Demonstrate the ability to centralise logging ...,0,0,0,0,0,0,0


## **Grouping sentences according to their combinations of categories**

In [None]:
# Takes the DataFrame filled with the binary values and builds a DataFrame that groups
# the sentences by their combination of categories

# Only the model's label columns are categories. Deriving them by excluding a few known
# column names also picked up unrelated columns such as the "Unnamed: 0" index column of
# the input CSV, which then counted as a category for any row whose value there is 1.
categories = [label for label in target_list if label in df.columns]
category_combinations = {}

# Each sentence is assigned to the group identified by the tuple of its active categories
for _, row in df.iterrows():
    description = row['description']
    category_combination = tuple(category for category in categories if row[category] == 1)
    category_combinations.setdefault(category_combination, []).append(description)

# Flatten the groups into one record per sentence. `data` is initialised outside the
# loops so it exists even when df has no rows.
data = []
for combination, descriptions in category_combinations.items():
    for description in descriptions:
        row = {'Categories': ', '.join(combination), 'Sentence': description}
        data.append(row)

# Creation of the DataFrame (explicit columns keep the schema when there is no data)
df_requirements = pd.DataFrame(data, columns=['Categories', 'Sentence'])

# Saving in CSV
#csv_file_path = f'./combinations.csv'
#df_requirements.to_csv(csv_file_path, index=False)

df_requirements

Unnamed: 0,Categories,Sentence
0,,CISPE (Cloud Infrastructure Services Providers...
1,,We represent cloud infrastructure service prov...
2,,Our growing membership includes companies oper...
3,,Our members deliver and maintain the essential...
4,,The purpose of this Buying Cloud Services Hand...
...,...,...
168,"compute, data handling",Demonstrate the ability to host containerbased...
169,"compute, data handling, management & monitoring","Demonstrate the ability for the Object Store, ..."
170,"compute, data handling, cloud service essentials",Demonstrate the hosting of a simple 2-tier app...
171,"compute, data handling, cloud service essentials",Demonstrate a multi-region deployment of a web...


## **Separation between requirements and non-requirements**

In [None]:
# A sentence with an empty 'Categories' string was assigned to no category
is_non_requirement = df_requirements['Categories'] == ''

# Non-requirements go into their own DataFrame...
df_non_requirements = df_requirements[is_non_requirement].reset_index(drop=True)

# ...and df_requirements keeps only the rows with at least one category
df_requirements = df_requirements[~is_non_requirement].reset_index(drop=True)


print(f"Non-Requirements: {len(df_non_requirements)}")
print(f"Requirements: {len(df_requirements)}")

Non-Requirements: 60
Requirements: 113


# **3) Extraction of Cloud Features**

## **Defining the function to normalize each sentence at this stage**

In [None]:
import string
from nltk.stem.porter import PorterStemmer


# Porter stemmer from NLTK, shared by every call to normalize_sentence
stemmer = PorterStemmer()

def normalize_sentence(sentence):
    """Lowercase, strip punctuation from, and stem a sentence.

    '-' and '/' are turned into spaces, all other punctuation characters are removed,
    runs of whitespace are collapsed, and each spaCy token is replaced by its Porter stem.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    # One translation table: drop every punctuation mark, except '-' and '/',
    # which act as word separators and become spaces
    char_map = {ord(ch): None for ch in string.punctuation}
    char_map[ord('-')] = ' '
    char_map[ord('/')] = ' '
    lowered = sentence.lower().translate(char_map)

    # Collapse repeated whitespace and trim both ends
    collapsed = ' '.join(lowered.split())

    # Tokenize with spaCy, stem each token with NLTK
    return ' '.join(stemmer.stem(token.text) for token in nlp(collapsed))

In [None]:
# Quick check of the normalization on a two-word keyword
a = normalize_sentence("load balancer")
a

'load balanc'

## **Load and save cloud ontology data**

In [None]:
# Load the CSV file
df_ontology = pd.read_csv(ONTOLOGY_PATH)

# Fill missing Category / Subcategory cells with the previous row's value.
# (Assigning the result avoids the deprecated `fillna(method=..., inplace=True)` on a column.)
df_ontology[['Category', 'Subcategory']] = df_ontology[['Category', 'Subcategory']].ffill()


# The ontology is stored in a nested dictionary:
#   category -> subcategory -> requirement feature -> list of stemmed keywords.
# PROVIDERS maps each requirement feature to its (AWS, Google, Azure) products.
ontology_dict = {}
PROVIDERS = {}

for index, row in df_ontology.iterrows():
    # Check for missing values BEFORE calling string methods: a NaN cell is a float, so
    # `.strip()` / `.lower()` would raise. Rows that cannot be placed are skipped.
    if pd.isna(row['Category']) or pd.isna(row['Subcategory']) or pd.isna(row['Requirement features']):
        continue

    category = row['Category'].lower().strip()
    subcategory = row['Subcategory'].strip()
    requirement = row['Requirement features'].strip()
    keywords = row['Keywords']

    PROVIDERS[requirement] = (row["AWS"], row["Google"], row["Azure"])

    if pd.notna(keywords):
        # Split on ',' (with or without a following space); normalize_sentence trims and
        # stems each keyword. Stemming can produce duplicates, hence the set.
        stemmed_keywords = {normalize_sentence(kw) for kw in str(keywords).lower().split(',')}
        # An empty keyword (e.g. from a trailing comma) would later build a bare
        # word-boundary regex that matches any sentence containing a word
        stemmed_keywords.discard('')
    else:
        stemmed_keywords = set()

    # Sorted so that the keyword order is the same on every run
    ontology_dict.setdefault(category, {}).setdefault(subcategory, {})[requirement] = sorted(stemmed_keywords)

#ontology_dict['compute']
print(PROVIDERS)

{'Virtual Machine': ('EC2', 'Compute Engine', 'Virtual Machines'), 'Containerization': ('ECS, EKS', 'Kubernetes Engine', 'Kubernetes Service (AKS)'), 'Serverless': ('Lambda', 'Cloud Functions', 'Functions'), 'Dedicated Server': ('EC2 Dedicated Instances', 'Compute Engine sole-tenant node', 'Dedicated Host'), 'Scaling': ('EC2 Auto Scaling, EC2', 'Cloud AutoScaling, Compute Engine', 'Virtual Machine Scale Sets, Virtual Machines'), 'Load Balancing': ('ELB, EC2', 'Cloud Load Balancing, Compute Engine', 'Load Balancer, Virtual Machines'), 'Auto-Provisioning': ('CloudFormation', 'Deployment Manager', 'Resource Manager'), 'Windows': ('EC2 (Windows)', 'Compute Engine (Windows)', 'Virtual Machines (Windows)'), 'Linux': ('EC2 (Linux)', 'Compute Engine (Linux)', 'Virtual Machines (Linux)'), 'Processor': ('EC2 (Instance Types)', 'Compute Engine (Machine Types)', 'Virtual Machines (VM sizes)'), 'Memory': ('EC2 (Instance Types)', 'Compute Engine (Machine Types)', 'Virtual Machines (VM sizes)'), 'GPU

## **Applying keyword searching to each requirement to extract cloud features**

Only keywords contained in the macro categories predicted by the ML model are checked

In [None]:
def find_matching_features_and_keywords(categories, sentence, ontology):
    """Find the ontology features whose keywords occur in a sentence.

    Only the ontology branches of the given categories are searched. Keywords are
    matched as whole words against the normalized (stemmed) sentence.

    Args:
        categories (str): Comma-separated category names (e.g. "compute, network").
            A missing value (NaN / None) yields no matches.
        sentence (str): Raw sentence; it is normalized with `normalize_sentence`.
        ontology (dict): Nested mapping category -> subcategory -> feature -> keyword list.

    Returns:
        tuple: `(features, keywords)`, two sorted lists without duplicates.
    """
    if pd.isna(categories):
        return [], []

    matching_features = set()
    found_keywords = set()

    sentence = normalize_sentence(sentence)

    for category in categories.split(','):
        category = category.strip()
        for requirements in ontology.get(category, {}).values():
            for requirement, kw_list in requirements.items():
                for keyword in kw_list:
                    # An empty keyword would produce the pattern r'\b\b', which matches
                    # at any word boundary and so flags every non-empty sentence
                    if not keyword:
                        continue
                    if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', sentence):
                        matching_features.add(requirement)
                        found_keywords.add(keyword)

    # Sorted so that results are reproducible across runs (set order is not)
    return sorted(matching_features), sorted(found_keywords)

def find_keywords(row, ontology):
    """Return only the matched keywords for one row holding 'Categories' and 'Sentence'."""
    matches = find_matching_features_and_keywords(row['Categories'], row['Sentence'], ontology)
    return matches[1]

# Run the keyword search once per sentence and keep both results. Previously the search
# (including the spaCy normalization) ran twice per row, once for each new column.
matches = [
    find_matching_features_and_keywords(categories_value, sentence_value, ontology_dict)
    for categories_value, sentence_value in zip(df_requirements['Categories'], df_requirements['Sentence'])
]
df_requirements['Cloud Features'] = [features for features, _ in matches]
df_requirements['Keywords'] = [keywords for _, keywords in matches]


df_requirements

#df_requirements.to_csv(f'./keyword_extracted.csv', index=False)

Unnamed: 0,Categories,Sentence,Cloud Features,Keywords
0,security & compliance,The association is open to companies provided ...,"[Data Security, Compliance & Standards]","[data protect, code of conduct]"
1,security & compliance,The shared responsibility model is mostly used...,[Compliance & Standards],[complianc]
2,security & compliance,CISP should provide logically and geographical...,[],[]
3,security & compliance,"CISP should provide sufficient mechanisms, whi...",[Encryption],[encrypt]
4,security & compliance,The CISP must be certified ISO 27001,[Compliance & Standards],"[certifi, iso]"
...,...,...,...,...
108,"compute, data handling",Demonstrate the ability to host containerbased...,[Containerization],[containerbas]
109,"compute, data handling, management & monitoring","Demonstrate the ability for the Object Store, ...","[Scaling, APIs, NoSQL Database, Serverless]","[nosql, api, scale, faa, nosql databas]"
110,"compute, data handling, cloud service essentials",Demonstrate the hosting of a simple 2-tier app...,"[Scaling, Relational Database]","[autosc, relat databas]"
111,"compute, data handling, cloud service essentials",Demonstrate a multi-region deployment of a web...,"[Multi-Region, Redundancy & Replication]","[multi region, replic]"


# **4) Mapping to Cloud Products and Report Generation**
Two types of output are generated here:
- An Excel file with 4 sheets, containing the technical details of the categorized sentences and of the extracted features
- A PDF file representing a final report, with plots, specification of the extracted requirements and with mapping to the service types of the main service providers

## **Count of single cloud features**
The DataFrame generated here has the columns: ['Main Category', 'Subcategory', 'Feature', 'Count']

In [None]:
# Count, per main category and subcategory, how many requirement sentences contain each cloud feature
main_category_feature_counts = {}

# Walk the ontology tree: main category -> subcategory -> feature
for main_category, subcategories in ontology_dict.items():
    for subcategory, features in subcategories.items():
        for feature in features:
            # Number of rows whose 'Cloud Features' list contains this feature
            feature_count = df_requirements['Cloud Features'].apply(lambda x: feature in x).sum()
            if feature_count <= 0:
                continue  # features never found are left out
            main_category_feature_counts.setdefault(main_category, {}).setdefault(subcategory, {})[feature] = feature_count


# Flatten the nested counts into (main category, subcategory, feature, count) records
flattened_data = [
    (main_category, subcategory, feature, count)
    for main_category, subcats in main_category_feature_counts.items()
    for subcategory, features in subcats.items()
    for feature, count in features.items()
]

# One row per feature, for easier plotting
single_features_df = pd.DataFrame(flattened_data, columns=['Main Category', 'Subcategory', 'Feature', 'Count'])


"""
# Creating a bar plot for visualization
plt.figure(figsize=(15, 10))
sns.barplot(x='Count', y='Feature', hue='Main Category', data=single_features_df)
plt.xlabel('Counts')
plt.ylabel('Cloud Features')
plt.title(f'Counts of Cloud Features by Main Category')
plt.legend(title='Main Category')
plt.show()
"""

single_features_df

#single_features_df.to_csv('./single_features_df.csv', index=False)

Unnamed: 0,Main Category,Subcategory,Feature,Count
0,compute,Resource Types,Virtual Machine,7
1,compute,Resource Types,Containerization,3
2,compute,Resource Types,Serverless,2
3,compute,Elasticity,Scaling,8
4,compute,Elasticity,Load Balancing,4
5,compute,Host Provisioning,Auto-Provisioning,1
6,data handling,Data Storage,Data Storage,5
7,data handling,Database Systems,Relational Database,4
8,data handling,Database Systems,NoSQL Database,2
9,network,Infrastructure,Local Network,6


## **Count of grouped cloud features**

In [None]:
# Count how many sentences share each unique combination of "Cloud Features".
# Each list is first turned into a sorted, comma-separated string: lists are unhashable
# (so they cannot be counted reliably), and sorting makes the same set of features
# count as one combination regardless of element order.
combination_labels = df_requirements['Cloud Features'].apply(lambda features: ', '.join(sorted(features)))
combination_counts = combination_labels.value_counts()

# Build a DataFrame for display / export
combinations_features_df = combination_counts.reset_index()
combinations_features_df.columns = ['Cloud Features Combination', 'Count']

# Optional visualization (disabled):
# plt.figure(figsize=(15, len(combinations_features_df) / 2))
# sns.barplot(y='Cloud Features Combination', x='Count', data=combinations_features_df, orient='h')
# plt.xlabel('Counts')
# plt.ylabel('Cloud Features Combinations')
# plt.title('Counts of Cloud Features Combinations')
# plt.show()


#combinations_features_df.to_csv('./combinations_features_df.csv', index=False)

combinations_features_df

"\n# Visualization\nplt.figure(figsize=(15, len(combinations_features_df) / 2))\nsns.barplot(y='Cloud Features Combination', x='Count', data=combinations_features_df, orient='h')\nplt.xlabel('Counts')\nplt.ylabel('Cloud Features Combinations')\nplt.title('Counts of Cloud Features Combinations')\nplt.show()\n"

## **Excel File Generation**
The file contains the following 4 sheets:

1. "Requirements": For each sentence, it shows which main categories are associated, and which features were extracted from it (through keyword search, which are shown)

2. "Single Features": The various features extracted are shown, grouped by Main Category and Subcategory, with each showing the total count of occurrences found

3. "Combinations Features": Each distinct combination of extracted cloud features is shown, together with the number of sentences that produced it

4. "Non-Requirements": Phrases that the ML model found to be non-requirements, i.e., not relevant to any category, are shown

In [None]:
# xlsxwriter is already installed by the Setup cell; repeated here so this section can run on its own
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.1.9


In [None]:
# Write each DataFrame to its own sheet. The context manager saves and closes the workbook
# on exit; the explicit `writer.save()` used before is deprecated (it emitted a warning
# here) and is no longer available in pandas 2.x.
with pd.ExcelWriter('AI_generated_final.xlsx', engine='xlsxwriter') as writer:
    df_requirements.to_excel(writer, sheet_name='Requirements', index=False)
    single_features_df.to_excel(writer, sheet_name='Single Features', index=False)
    combinations_features_df.to_excel(writer, sheet_name='Combinations Features', index=False)
    df_non_requirements.to_excel(writer, sheet_name='Non-Requirements', index=False)


  writer.save()


## **Final PDF Report Generation**

In [None]:
# reportlab is already installed by the Setup cell; repeated here so this section can run on its own
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.0.9-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: reportlab
Successfully installed reportlab-4.0.9


In [None]:
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
import io # store plots for pdf

### **Preparation of Charts for Insertion**

In [None]:
def _pie_chart_to_buffer(values, colors, legend_labels):
    """Render a pie chart and return it as an in-memory PNG buffer rewound to the start."""
    buffer = io.BytesIO()

    plt.figure(figsize=(10, 7))
    patches, _texts, _autotexts = plt.pie(values, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')  # drawn as a circle
    plt.legend(patches, legend_labels, loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()

    # Save the plot to the buffer and release the figure
    plt.savefig(buffer, format='png')
    plt.close()  # free up memory

    # Seek back to the start so the image can be read for insertion into the PDF
    buffer.seek(0)
    return buffer


# FIRST PIECHART: share of sentences that are Requirements vs Non-Requirements
sizes = [len(df_non_requirements), len(df_requirements)]
labels = ['Non-Requirements', 'Requirements']
colors = ['firebrick', 'mediumseagreen']

buf_1 = _pie_chart_to_buffer(sizes, colors, labels)

###################################################
# SECOND PIECHART: share of extracted features per main category

# Summarizing total counts for each main category (filtering ones with 0)
category_counts = single_features_df.groupby('Main Category')['Count'].sum().to_dict()
category_counts = {category: count for category, count in category_counts.items() if count > 0}

# Creating a DataFrame for easier handling
category_counts_df = pd.DataFrame(list(category_counts.items()), columns=['Main Category', 'Count'])

# One fixed colour per known main category
color_map = {
    'compute': 'skyblue',
    'data handling': 'lightcoral',
    'network': 'dodgerblue',
    'security & compliance': 'darkolivegreen',
    'management & monitoring': 'slateblue',
    'cloud service essentials': 'chocolate'
}
# Category names come from the ontology CSV; fall back to a neutral colour for any name
# missing from color_map instead of raising a KeyError
category_colors = [color_map.get(category, 'lightgrey') for category in category_counts_df['Main Category']]

buf_2 = _pie_chart_to_buffer(category_counts_df['Count'], category_colors, category_counts_df['Main Category'])

0

### **First Page**
It contains pie charts summarizing the pipeline

In [None]:
# Draws the first page of the report: the title and the two pie charts.
# Y coordinates used below: adding to y moves up the page, subtracting moves down.

pdf_path = "cloud_requirements_specification_report.pdf"
c = canvas.Canvas(pdf_path, pagesize=A4)
width, height = A4  # Use A4 page size

# Set a bold title for the PDF
c.setFont("Helvetica-Bold", 18)  # Setting the font to Helvetica-Bold with size 18
c.drawCentredString(width / 2, height - 80, "Cloud Requirements Specification Report")

# Smaller bold font for the plot titles below
c.setFont("Helvetica-Bold", 12)

# Image width after scaling
image_width = 400
# Center the image by calculating the X position
image_x = (width - image_width) / 2


# First plot title and image
first_plot_title_y = height - 150  # Adjusted to ensure visibility
c.drawString(50, first_plot_title_y, "A) Requirements VS Non-Requirements")
buf_1.seek(0)  # rewind the PNG buffer before it is read
image_1 = ImageReader(buf_1)
first_image_y = first_plot_title_y - 20
# The y passed to drawImage is 300 (the image height) below first_image_y,
# i.e. the image's top edge sits 20 below the title
c.drawImage(image_1, image_x, first_image_y - 300, width=image_width, height=300, preserveAspectRatio=True, mask='auto')

# Second plot title and image
second_plot_title_y = height / 2 - 80  # Positioned for the second half of the page
c.drawString(50, second_plot_title_y, "B) Cloud Service Requirements")
buf_2.seek(0)  # rewind the PNG buffer before it is read
image_2 = ImageReader(buf_2)
second_image_y = second_plot_title_y - 20
# Same layout as the first image, in the lower half of the A4 page
c.drawImage(image_2, image_x, second_image_y - 300, width=image_width, height=300, preserveAspectRatio=True, mask='auto')


# Both images have been drawn at this point, so the buffers are released.
# NOTE(review): the buffers are closed here, so re-running this cell needs the chart cell re-run first.
buf_1.close()
buf_2.close()

# Displays the output path only; the canvas is not saved in this cell
pdf_path


'cloud_requirements_specification_report.pdf'

### **Second Page**
This page provides a refinement of the requirements that is free of inconsistencies, omissions, and redundancies, presented as a numbered list of requirements that is as unambiguous as possible

In [None]:
# New page for detailed specifications: a numbered outline of
# main category -> subcategory -> feature (with its count)
c.showPage()

# Title
c.setFont("Helvetica-Bold", 12)
c.drawString(50, height - 80, "C) Refinement of Cloud Service Requirements")

# Start content from a specific height
current_y = height - 100
line_height = 14

# Check for page overflow and adjust accordingly
def check_page(y):
    """Start a new page when `y` is below the bottom margin and return the y to draw at.

    Uses the notebook-level canvas `c` and page `height`. Returns `y` unchanged while
    it is at or above the margin.
    """
    if y < 50:  # Assuming 50 as bottom margin
        c.showPage()
        c.setFont("Helvetica", 10)  # Reset font for new page
        return height - 100  # Reset Y position for new content at the top of the new page
    return y

main_category_counter = 1  # counter for main categories

# Iterate through each main category and its subcategories
for main_category, subcats in main_category_feature_counts.items():
    current_y -= line_height * 2  # Space before a new main category
    current_y = check_page(current_y)
    # Main category in regular font (also switches away from the bold title font)
    c.setFont("Helvetica", 10)
    c.drawString(50, current_y, f'{main_category_counter}) The following requirements were extracted for main category "{main_category.capitalize()}":')

    subcat_counter = 1  # numbering restarts within each main category
    for subcategory, features in subcats.items():
        current_y -= line_height
        current_y = check_page(current_y)
        # Subcategory in regular font, drawn further right than the main category line
        c.drawString(70, current_y, f'    {main_category_counter}.{subcat_counter}) Cloud features for subcategory "{subcategory.capitalize()}" are listed below:')

        feature_counter = 1  # numbering restarts within each subcategory
        for feature, count in features.items():
            current_y -= line_height
            current_y = check_page(current_y)
            # Feature name followed by its occurrence count in parentheses
            feature_text = f"        {main_category_counter}.{subcat_counter}.{feature_counter}) {feature} ({count})"
            c.drawString(90, current_y, feature_text)
            feature_counter += 1

        subcat_counter += 1

    main_category_counter += 1  # Increment for the next main category

# Displays the output path only; the canvas is not saved in this cell
pdf_path

'cloud_requirements_specification_report.pdf'

### **Third Page**
Mapping of extracted features to corresponding service types of major cloud providers

In [None]:
# Write the provider-mapping section: for every extracted feature, list the
# matching AWS / Google / Azure service taken from PROVIDERS, then save the PDF.

# New page for mapping features to cloud providers
c.showPage()

# Title
c.setFont("Helvetica-Bold", 14)
c.drawString(50, height - 80, "D) Cloud Providers Feature Mapping")

# Reset the starting Y position for the content below the title
current_y = height - 100
line_height = 18
# Provider detail rows are packed slightly tighter than heading rows
provider_line_height = line_height - 5


def check_page(y):
    """Return a usable Y coordinate, starting a new page when ``y`` is too low.

    If ``y`` is below the 60-point bottom margin, a new page is started and
    the Y coordinate of the first row on that page (``height - 70``) is
    returned. Otherwise ``y`` is returned unchanged.

    This helper does not select a font. ``showPage()`` resets the canvas
    graphics state (including the current font), so callers must call
    ``c.setFont`` after ``check_page`` and before drawing.
    """
    if y < 60:
        c.showPage()
        return height - 70
    return y


# Iterate through each main category to list features and their mappings
for main_category, subcats in main_category_feature_counts.items():
    current_y -= line_height  # Space before a new main category
    current_y = check_page(current_y)
    c.setFont("Helvetica-Bold", 12)  # Main category in bold
    c.drawString(50, current_y, f"Category: {main_category.capitalize()}")

    for features in subcats.values():
        for feature in features.keys():
            current_y -= line_height
            current_y = check_page(current_y)
            c.setFont("Helvetica-Bold", 10)  # Feature names in bold
            c.drawString(60, current_y, f"Feature: {feature}")

            # Features without a PROVIDERS entry get no detail rows
            if feature not in PROVIDERS:
                continue

            # Unpacking keeps the requirement that each entry has exactly three values
            aws, google, azure = PROVIDERS[feature]
            for provider, service in (("AWS", aws), ("Google", google), ("Azure", azure)):
                current_y -= provider_line_height
                current_y = check_page(current_y)
                # Select the detail font after check_page on every row: a page
                # break in the middle of the three rows would otherwise leave
                # the remaining rows in the canvas default font.
                c.setFont("Helvetica", 9)
                c.drawString(70, current_y, f"{provider}: {service}")

# Finalize and save the PDF
c.save()

pdf_path


'cloud_requirements_specification_report.pdf'