# Install needed Libraries

## Install Libraries from pip

In [215]:
!pip install langchain langchain-community pandas numpy



## Import needed Libraries

In [218]:
import pandas as pd
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Convert Excel Spreadsheet to pandas Data Frame

In [221]:
# Read the Excel workbook containing the list of URLs with their architectural pattern and metadata.
url_df = pd.read_excel("./URLs.xlsx", sheet_name="Sheet1")
# Print the shape of the DataFrame as (rows, columns).
print("Shape: ",url_df.shape)
# Display the first rows to show the format of the DataFrame.
url_df.head()

Shape:  (213, 6)


Unnamed: 0,URL,1st Level,2nd Level,3rd Level,4th Level,Lens
0,https://docs.aws.amazon.com/wellarchitected/la...,Abstract and Introducción,,,,Serverless Applications
1,https://docs.aws.amazon.com/wellarchitected/la...,Definitions,,,,Serverless Applications
2,https://docs.aws.amazon.com/wellarchitected/la...,Definitions,Compute Layers,,,Serverless Applications
3,https://docs.aws.amazon.com/wellarchitected/la...,Definitions,Data Layer,,,Serverless Applications
4,https://docs.aws.amazon.com/wellarchitected/la...,Definitions,Messaging and streaming layer,,,Serverless Applications


The data frame lists the links where the source content is stored, one link per row. The number of links is the first value of the data frame's shape: we have a total of 213 links.

To obtain additional metadata for each link, we created a structure that includes the name of the AWS Well-Architected Lens in the "Lens" column, and its subsections in the "1st Level," "2nd Level," "3rd Level," and "4th Level" columns. If a link points to information at an upper level, the lower-level columns will display NaN.

# Read each link and store the Data in correct Format

## Create Function to add Level to metadata

In [226]:
def createMetadataLevel(level, url_line, metadata):
    """Copy one level entry from a data frame row into the metadata dict, if present.

    Args:
        level: Key of the level to look up in ``url_line`` (e.g. "2nd Level").
        url_line: Row of the data frame (or any mapping) indexed by ``level``.
        metadata: Dict that receives the level; it is modified in place.

    Returns:
        The same ``metadata`` object, with ``metadata[level]`` set when the
        row's value is not missing (``pd.isna`` is false), unchanged otherwise.
    """
    level_value = url_line[level]
    # A missing level (NaN / None / NA) is skipped so it never shows up as a key.
    if pd.isna(level_value):
        return metadata
    metadata[level] = level_value
    return metadata

## Create function to load the URL with the extra metadata.

In [229]:
def loadURLWithMetaData(url_line):
    """Load the page behind one row's URL and tag each loaded document with the row's metadata.

    Args:
        url_line: One row of the URL data frame (a pandas Series or any mapping)
            providing the "URL", "Lens" and "1st Level" entries, plus the
            "2nd Level", "3rd Level" and "4th Level" entries, which may be NaN.

    Returns:
        The list of documents returned by ``WebBaseLoader.load()``; the row's
        metadata is merged into the ``metadata`` dict of every document.
    """
    # The loader reads the content of the row's URL through the langchain library.
    loader = WebBaseLoader(url_line["URL"])
    docs = loader.load()
    # Metadata shared by every document read from this page.
    metadata = {
        "Lens": url_line["Lens"],
        "1st Level": url_line["1st Level"]
    }
    # Deeper levels are optional: each is added only when the row has a value for it.
    for level in ("2nd Level", "3rd Level", "4th Level"):
        metadata = createMetadataLevel(level, url_line, metadata)
    # Print the metadata
    print(metadata)
    # Attach the metadata to the loaded documents; previously it was built and
    # printed but never stored, and the documents themselves were discarded.
    for doc in docs:
        doc.metadata.update(metadata)
    return docs
        
    

## Cycle through all URLs in the list and load them

In [232]:
# Cycle through every row of the URL data frame and pass it to loadURLWithMetaData.
for index, row in url_df.iterrows():
    loadURLWithMetaData(row)

<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Abstract and Introducción '}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2nd Level': 'Compute Layers'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2nd Level': 'Data Layer'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2nd Level': 'Messaging and streaming layer'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2nd Level': 'User management and identity layer'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2nd Level': 'Edge layer'}
<class 'pandas.core.series.Series'>
{'Lens': 'Serverless Applications', '1st Level': 'Definitions', '2n