# Section 1

## Setup

Before getting started with the Vertex AI services, we need to set up the following:

* Install Python SDK
* Environment variables
* Enable APIs

## Step 1: Install Necessary Packages and Python SDK

In [1]:
pip install --upgrade --user google-cloud-aiplatform google-cloud-storage google-cloud-bigquery[pandas] 


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
!pip install streamlit pandas google-cloud-aiplatform google-cloud-bigquery numpy tqdm




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Step 2: Set Up Environment Variables and Initialize Clients

In [3]:
from datetime import datetime
import os
from google.cloud import bigquery, aiplatform
import pandas as pd
import numpy as np
import time
import tqdm
from vertexai.preview.language_models import TextEmbeddingModel

# Constants
LOCATION = "us-central1"
UID = datetime.now().strftime("%m%d%H%M")
PROJECT_ID = 'tokyo-country-189103'  
os.environ['PATH'] += os.pathsep + 'C:/Users/DELL/AppData/Local/Google/Cloud SDK/google-cloud-sdk/bin'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"C:\Users\DELL\Python projects\DocSync\V2\Key\tokyo-country-189103-4ce23189dd39.json"

# Enable GCP services
!gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com bigquery.googleapis.com --project {PROJECT_ID}

# Initialize BigQuery and Vertex AI clients
bq_client = bigquery.Client(project=PROJECT_ID)
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Load the text embeddings model
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


Operation "operations/acat.p2-303276163211-303b7d30-d728-4366-ac56-975dee9fe5e6" finished successfully.



## Step 3: Define Helper Functions

In [15]:
## Step 3: Define Helper Functions
BATCH_SIZE = 5  # number of texts sent to the embedding model per request


def get_embeddings_wrapper(texts):
    """Embed ``texts`` by calling ``model.get_embeddings`` on slices of ``BATCH_SIZE``.

    Args:
        texts: A sliceable sequence of texts (the callers in this notebook
            pass a plain ``list``).

    Returns:
        A list holding the ``.values`` of every result returned by the model,
        in the order the results were returned.
    """
    embs = []
    for start in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        batch = texts[start : start + BATCH_SIZE]
        # extend() grows the list in place; the previous `embs = embs + [...]`
        # re-copied every accumulated embedding on each batch.
        embs.extend(e.values for e in model.get_embeddings(batch))
    return embs

def compare_requirements(new_df, old_df, embed_fn=None):
    """For each requirement in ``new_df`` find the best-scoring requirement in ``old_df``.

    Both frames must provide the columns ``'Requirement ID'`` and
    ``'Requirement'``. The ``'Requirement'`` texts are embedded and every new
    row is paired with the old row whose embedding has the largest dot product
    with it.

    Args:
        new_df: DataFrame with the new requirements.
        old_df: DataFrame with the old requirements to match against.
        embed_fn: Optional callable taking a list of texts and returning one
            embedding vector per text. Defaults to ``get_embeddings_wrapper``.

    Returns:
        DataFrame with one row per row of ``new_df`` and the columns
        ``new_requirement_id``, ``new_requirement_text``,
        ``most_similar_requirement_text``, ``most_similar_requirement_id`` and
        ``matching_percentage``. ``matching_percentage`` is the raw dot product
        of the two embeddings (NOTE(review): this equals cosine similarity only
        if the model returns unit-length vectors -- confirm). An empty
        ``new_df`` yields an empty frame with these columns.

    Raises:
        KeyError: If either frame lacks a required column.
        ValueError: If ``old_df`` has no rows while ``new_df`` does.
    """
    # Validate before embedding: the embedding pass is slow (one throttled
    # request per batch), so a wrong sheet should fail immediately.
    required_columns = ('Requirement ID', 'Requirement')
    for label, frame in (('new', new_df), ('old', old_df)):
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise KeyError(f"The {label} requirements data is missing column(s): {missing}")

    result_columns = [
        'new_requirement_id',
        'new_requirement_text',
        'most_similar_requirement_text',
        'most_similar_requirement_id',
        'matching_percentage',
    ]
    if len(new_df) == 0:
        # Nothing to match; np.dot would otherwise fail on the empty array.
        return pd.DataFrame(columns=result_columns)
    if len(old_df) == 0:
        # np.argmax over an empty axis raises an opaque ValueError; say why.
        raise ValueError("The old requirements data has no rows to compare against.")

    if embed_fn is None:
        embed_fn = get_embeddings_wrapper

    # One row per requirement, one column per embedding dimension.
    new_embeddings = np.array(embed_fn(list(new_df['Requirement'])))
    old_embeddings = np.array(embed_fn(list(old_df['Requirement'])))

    # Entry [i, j] is the dot product of new requirement i with old requirement j.
    similarity_matrix = np.dot(new_embeddings, old_embeddings.T)

    # Best-scoring old requirement (position and score) for each new requirement.
    most_similar_indices = np.argmax(similarity_matrix, axis=1)
    most_similar_scores = np.max(similarity_matrix, axis=1)

    # The two Series keep new_df's index; the arrays are attached positionally.
    comparison_df = pd.DataFrame({
        'new_requirement_id': new_df['Requirement ID'],
        'new_requirement_text': new_df['Requirement'],
        'most_similar_requirement_text': old_df['Requirement'].iloc[most_similar_indices].values,
        'most_similar_requirement_id': old_df['Requirement ID'].iloc[most_similar_indices].values,
        'matching_percentage': most_similar_scores
    })
    return comparison_df








## Step 4: Write the Streamlit Script to a File

In [13]:
import os
from datetime import datetime
from google.cloud import bigquery, aiplatform
import pandas as pd
import numpy as np
import time
import tqdm
from vertexai.preview.language_models import TextEmbeddingModel
import streamlit as st

# Constants
LOCATION = "us-central1"
UID = datetime.now().strftime("%m%d%H%M")  # month/day/hour/minute stamp taken when the script runs
PROJECT_ID = 'tokyo-country-189103'

# NOTE(review): both paths below are specific to one workstation; anyone else
# running this app has to edit them.
GCLOUD_SDK_BIN = 'C:/Users/DELL/AppData/Local/Google/Cloud SDK/google-cloud-sdk/bin'
SERVICE_ACCOUNT_KEY = r"C:\Users\DELL\Python projects\DocSync\V2\Key\tokyo-country-189103-4ce23189dd39.json"

# Streamlit re-executes this script on every widget interaction, so an
# unconditional `PATH += ...` would append a duplicate entry on each rerun.
if GCLOUD_SDK_BIN not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + GCLOUD_SDK_BIN
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_KEY

# Initialize BigQuery and Vertex AI clients
bq_client = bigquery.Client(project=PROJECT_ID)
aiplatform.init(project=PROJECT_ID, location=LOCATION)


@st.cache_resource
def _load_embedding_model():
    """Load the text embedding model once and reuse it across script reruns."""
    return TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


# Load the text embeddings model
model = _load_embedding_model()

# Helper functions
BATCH_SIZE = 5  # number of texts sent to the embedding model per request


def get_embeddings_wrapper(texts):
    """Embed ``texts`` by calling ``model.get_embeddings`` on slices of ``BATCH_SIZE``.

    Args:
        texts: A sliceable sequence of texts (the callers in this script pass
            a plain ``list``).

    Returns:
        A list holding the ``.values`` of every result returned by the model,
        in the order the results were returned.
    """
    embs = []
    for start in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        batch = texts[start : start + BATCH_SIZE]
        # extend() grows the list in place; the previous `embs = embs + [...]`
        # re-copied every accumulated embedding on each batch.
        embs.extend(e.values for e in model.get_embeddings(batch))
    return embs

def compare_requirements(new_df, old_df, embed_fn=None):
    """For each requirement in ``new_df`` find the best-scoring requirement in ``old_df``.

    Both frames must provide the columns ``'Requirement ID'`` and
    ``'Requirement'``. The ``'Requirement'`` texts are embedded and every new
    row is paired with the old row whose embedding has the largest dot product
    with it.

    Args:
        new_df: DataFrame with the new requirements.
        old_df: DataFrame with the old requirements to match against.
        embed_fn: Optional callable taking a list of texts and returning one
            embedding vector per text. Defaults to ``get_embeddings_wrapper``.

    Returns:
        DataFrame with one row per row of ``new_df`` and the columns
        ``new_requirement_id``, ``new_requirement_text``,
        ``most_similar_requirement_text``, ``most_similar_requirement_id`` and
        ``matching_percentage``. ``matching_percentage`` is the raw dot product
        of the two embeddings (NOTE(review): this equals cosine similarity only
        if the model returns unit-length vectors -- confirm). An empty
        ``new_df`` yields an empty frame with these columns.

    Raises:
        KeyError: If either frame lacks a required column.
        ValueError: If ``old_df`` has no rows while ``new_df`` does.
    """
    # Validate before embedding: the embedding pass is slow (one throttled
    # request per batch), so a wrong sheet should fail immediately.
    required_columns = ('Requirement ID', 'Requirement')
    for label, frame in (('new', new_df), ('old', old_df)):
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise KeyError(f"The {label} requirements data is missing column(s): {missing}")

    result_columns = [
        'new_requirement_id',
        'new_requirement_text',
        'most_similar_requirement_text',
        'most_similar_requirement_id',
        'matching_percentage',
    ]
    if len(new_df) == 0:
        # Nothing to match; np.dot would otherwise fail on the empty array.
        return pd.DataFrame(columns=result_columns)
    if len(old_df) == 0:
        # np.argmax over an empty axis raises an opaque ValueError; say why.
        raise ValueError("The old requirements data has no rows to compare against.")

    if embed_fn is None:
        embed_fn = get_embeddings_wrapper

    # One row per requirement, one column per embedding dimension.
    new_embeddings = np.array(embed_fn(list(new_df['Requirement'])))
    old_embeddings = np.array(embed_fn(list(old_df['Requirement'])))

    # Entry [i, j] is the dot product of new requirement i with old requirement j.
    similarity_matrix = np.dot(new_embeddings, old_embeddings.T)

    # Best-scoring old requirement (position and score) for each new requirement.
    most_similar_indices = np.argmax(similarity_matrix, axis=1)
    most_similar_scores = np.max(similarity_matrix, axis=1)

    # The two Series keep new_df's index; the arrays are attached positionally.
    comparison_df = pd.DataFrame({
        'new_requirement_id': new_df['Requirement ID'],
        'new_requirement_text': new_df['Requirement'],
        'most_similar_requirement_text': old_df['Requirement'].iloc[most_similar_indices].values,
        'most_similar_requirement_id': old_df['Requirement ID'].iloc[most_similar_indices].values,
        'matching_percentage': most_similar_scores
    })
    return comparison_df

# Streamlit UI
st.title("DocSync")

st.subheader("New Excel File")
new_excel = st.file_uploader("Choose the new Excel file", type="xlsx")

st.subheader("Old Excel File")
old_excel = st.file_uploader("Choose the old Excel file", type="xlsx")

# Streamlit re-executes the script on every interaction and st.button() is
# True only on the run triggered by its own click. The comparison result is
# therefore kept in session state, so the download button is still rendered
# on the rerun that follows clicking it.
RESULT_KEY = "comparison_csv"

if st.button("Run Comparison"):
    # Drop any earlier result so a failed or invalid run never leaves a stale
    # file available for download.
    if RESULT_KEY in st.session_state:
        del st.session_state[RESULT_KEY]

    if new_excel is not None and old_excel is not None:
        st.write("Processing...")

        new_df = pd.read_excel(new_excel)
        old_df = pd.read_excel(old_excel)

        progress_bar = st.progress(0)

        # The bar only marks "started" and "finished"; the embedding calls in
        # compare_requirements do not report intermediate progress to it.
        progress_bar.progress(10)
        comparison_df = compare_requirements(new_df, old_df)
        progress_bar.progress(100)

        st.session_state[RESULT_KEY] = comparison_df.to_csv(index=False)
        st.success("Comparison finished!")
    else:
        st.error("Please upload both Excel files.")

if RESULT_KEY in st.session_state:
    st.write("Download the comparison file:")
    st.download_button(
        label="Download comparison_df",
        data=st.session_state[RESULT_KEY],
        file_name='comparison_df.csv',
        mime='text/csv'
    )


Overwriting app.py


In [16]:
# To launch the app, run this in a terminal: python -m streamlit run app.py