# Preparing the Environment with Packages and Ollama

In [1]:
!pip install python-docx langchain-community ollama colab-xterm sentence_transformers gradio

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting ollama
  Downloading ollama-0.3.3-py3-none-any.whl.metadata (3.8 kB)
Collecting colab-xterm
  Downloading colab_xterm-0.2.0-py3-none-any.whl.metadata (1.2 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.4.0,>=0.3.3 (from langchain-community)
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain-community)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#For Setting Up Ollama in Windows: Refer to the below Youtube Video
https://youtu.be/h5jqmjpB5nk?feature=shared

# Following Steps are for usage of Ollama in Google Colab

In [3]:
%load_ext colabxterm

In [5]:
%xterm
#Execute in the below terminal:
#curl -fsSL https://ollama.com/install.sh | sh
#ollama serve - till this is shown "ver=12.2 name="Tesla T4" total="14.7 GiB" available="14.6 GiB"
#Then run this cell again and enter the command below till 'Success is displayed. Then move on to the next cell.
#ollama pull llama3.1

Launching Xterm...

<IPython.core.display.Javascript object>

# Necessary Imports

In [6]:
import os
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"]="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"

In [11]:
from docx import Document
from langchain_community.embeddings import OllamaEmbeddings
from sentence_transformers import util, SentenceTransformer
import gradio as gr

# Parsing the Document with Tables

In [8]:
embeddings = OllamaEmbeddings()
embedding_model = OllamaEmbeddings(model="llama3.1")

def extract_tables_from_docx(docx_path):
    doc = Document(docx_path)
    tables = []
    for table in doc.tables:
        table_content = []
        for row in table.rows:
            row_content = [cell.text.strip() for cell in row.cells]
            table_content.append(row_content)
        tables.append(table_content)
    return tables
docx_path = "/content/drive/MyDrive/Tables Document.docx"
tables = extract_tables_from_docx(docx_path)
tables

  embeddings = OllamaEmbeddings()


[[['Product ID', 'Product Name', 'Category', 'Price (INR)'],
  ['1', 'Apple', 'Fruits', '100'],
  ['2', 'Carrot', 'Vegetables', '50'],
  ['3', 'Milk', 'Dairy', '60'],
  ['4', 'Bread', 'Bakery', '30']],
 [['Student ID', 'Name', 'Subject', 'Grade'],
  ['101', 'Priya', 'Math', 'A'],
  ['102', 'Rahul', 'English', 'B'],
  ['103', 'Sneha', 'Science', 'A'],
  ['104', 'Abhi', 'History', 'C']],
 [['Employee ID', 'Name', 'Department', 'Salary (INR)'],
  ['E001', 'Anil Kumar', 'IT', '80,000'],
  ['E002', 'Riya Sen', 'HR', '60,000'],
  ['E003', 'Sandeep R', 'Finance', '70,000']],
 [['Movie ID', 'Title', 'Genre', 'Show Time'],
  ['1', 'Inception', 'Sci-Fi', '10:00 AM'],
  ['2', 'Titanic', 'Romance', '2:00 PM'],
  ['3', 'Avengers', 'Action', '6:00 PM']],
 [['City', 'Date', 'Temperature (°C)', 'Condition'],
  ['Chennai', '12-Oct-24', '32', 'Sunny'],
  ['Mumbai', '12-Oct-24', '29', 'Rainy'],
  ['Bangalore', '12-Oct-24', '25', 'Cloudy']]]

#Generation of Embeddings of the Table Content based on Llama 3.1 paired with Ollama

In [9]:
table_string_holder = {}
for index_value,table in enumerate(tables):
  table_string_holder['table_'+str(index_value+1)] = "\n".join([" ".join(row) for row in table])
locals().update(table_string_holder)
table_strings=[]
for table_string in table_string_holder.values():
  table_strings.append(table_string)
table_embeddings = []
for table_str in table_strings:
    embedding = embedding_model.embed_documents([table_str])
    table_embeddings.append(embedding[0])
print(table_embeddings)

[[-2.0628063678741455, -4.0592041015625, -1.0040291547775269, 1.4461572170257568, 0.25787514448165894, -0.5351004004478455, -0.1980595737695694, 1.0846407413482666, -1.9579005241394043, 0.11094343662261963, 0.10556744784116745, -2.4125380516052246, 1.1061331033706665, 1.0046257972717285, 1.2690426111221313, 0.9685865640640259, -1.2431185245513916, -0.6118373870849609, -0.025279471650719643, 2.397071361541748, -3.361311435699463, 0.40335920453071594, -0.9871957302093506, 1.345530390739441, -0.8549553155899048, -0.6862837076187134, 0.33496028184890747, 0.4847126305103302, 3.0248379707336426, 0.4623250365257263, -0.3999383747577667, 2.478717088699341, 1.702760934829712, 1.2849369049072266, 6.66855525970459, -2.023935317993164, -0.6525655388832092, -1.8916646242141724, 3.0408647060394287, 2.8824546337127686, -0.15710368752479553, 2.108288526535034, -0.025506338104605675, -2.5229671001434326, -0.38676872849464417, 0.4435170292854309, -3.1982955932617188, 3.4110023975372314, 1.04507136344909

# Using Cosine Similarity on the generated Embeddings to measure Semantic Similarity between Tables.

In [14]:
similarity_matrix = util.cos_sim(table_embeddings, table_embeddings)

def get_sorted_similarities(similarity_matrix):
    pairs = []
    num_tables = len(similarity_matrix)

    for i in range(num_tables):
        for j in range(i + 1, num_tables):
            pairs.append(((i + 1, j + 1), similarity_matrix[i][j]))

    return sorted(pairs, key=lambda x: x[1], reverse=True)

sorted_pairs = get_sorted_similarities(similarity_matrix)

def convert_table_to_html(table):
    html = "<table border='1' style='border-collapse: collapse; margin: 10px;'>"
    for row in table:
        html += "<tr>"
        for cell in row:
            html += f"<td style='padding: 5px; text-align: left;'>{cell}</td>"
        html += "</tr>"
    html += "</table>"
    return html

def display_similarity():
    content = ""
    for (i, j), score in sorted_pairs:
        content += f"<h3>Table {i} <--> Table {j} | Similarity: {score:.4f}</h3>"
        content += f"<div style='display: flex; gap: 20px;'>"
        content += convert_table_to_html(tables[i - 1])
        content += convert_table_to_html(tables[j - 1])
        content += "</div><hr>"
    return content

interface = gr.Interface(
    fn=display_similarity,
    inputs=[],
    outputs="html",
    title="Semantic Similarity Between Tables using Llama 3.1, a general purpose Chat Model.",
    description="Displays table pairs along with their similarity scores."
)

interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d12941b6bb0d3fff20.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Implementing the same as above using a Different Model

In [15]:
embedding_model=SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
sentence_1="Profit"
sentence_2="Gain"
embedding_1 = embedding_model.encode([sentence_1])
embedding_2 = embedding_model.encode([sentence_2])
util.cos_sim(embedding_1,embedding_2)

tensor([[0.5219]])

In [18]:
table_string_holder = {}
for index_value,table in enumerate(tables):
  table_string_holder['table_'+str(index_value+1)] = "\n".join([" ".join(row) for row in table])
locals().update(table_string_holder)
table_strings=[]
for table_string in table_string_holder.values():
  table_strings.append(table_string)
table_embeddings = []
for table_str in table_strings:
  embedding = embedding_model.encode(table_str)
  table_embeddings.append(embedding)

In [19]:
similarity_matrix = util.cos_sim(table_embeddings, table_embeddings)

def get_sorted_similarities(similarity_matrix):
    pairs = []
    num_tables = len(similarity_matrix)

    for i in range(num_tables):
        for j in range(i + 1, num_tables):
            pairs.append(((i + 1, j + 1), similarity_matrix[i][j]))

    return sorted(pairs, key=lambda x: x[1], reverse=True)

sorted_pairs = get_sorted_similarities(similarity_matrix)

def convert_table_to_html(table):
    html = "<table border='1' style='border-collapse: collapse; margin: 10px;'>"
    for row in table:
        html += "<tr>"
        for cell in row:
            html += f"<td style='padding: 5px; text-align: left;'>{cell}</td>"
        html += "</tr>"
    html += "</table>"
    return html

def display_similarity():
    content = ""
    for (i, j), score in sorted_pairs:
        content += f"<h3>Table {i} <--> Table {j} | Similarity: {score:.4f}</h3>"
        content += f"<div style='display: flex; gap: 20px;'>"
        content += convert_table_to_html(tables[i - 1])
        content += convert_table_to_html(tables[j - 1])
        content += "</div><hr>"
    return content

interface = gr.Interface(
    fn=display_similarity,
    inputs=[],
    outputs="html",
    title="Semantic Similarity Between Tables with a Sentence Transformers Model dedicated for Semantic Similarities.",
    description="Displays table pairs along with their similarity scores."
)

interface.launch()

  a = torch.tensor(a)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ebcc84ad3d364b55b8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


