In [26]:
import torch
from sentence_transformers import util
from typing import List, Tuple, Union, Any

In [34]:
 def retrieve_top_k_matches(queries:list, corpus, eco_prod:list,k:int)-> tuple[Any, Any]:
        """
        Takes in list of queries/item_name, the list of ecoinvent product_name, topK value
        This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings
        returns topk matchs and scores
        """
        top_k = k
        results = []
        scores=[]
        for query in queries:
            query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
            # We use cosine-similarity and torch.topk
            cos_scores = util.cos_sim(query_embedding, corpus)[0]##https://github.com/UKPLab/sentence-transformers/blob/42ab80a122ad521a4f9055f5530a954d29d232ce/sentence_transformers/util.py#L23
            top_results = torch.topk(cos_scores, k=top_k)#https://pytorch.org/docs/stable/generated/torch.topk.html

            for score, idx in zip(top_results[0], top_results[1]):
                #(eco_prod[idx], "(Score: {:.4f})".format(score))
                results.append(eco_prod[idx])
                scores.append(float("(Score: {:.4f})".format(score)[8:14]))
        if top_k > 1:
            results=[results[i:i+k] for i in range(0, len(results), k)]
            scores=[scores[i:i+k] for i in range(0, len(scores), k)]
        return (results, scores)


In [35]:
test_1323 = retrieve_top_k_matches('zinc monosulfate', corpus_embeddings, corpus, k=5)

In [36]:
test_1323[0]

[['zirconium oxide', 'pyrazole', 'zinc monosulfate', 'zinc oxide', 'pyridine'],
 ['aniline',
  'adipic acid',
  'benzimidazole-compound',
  'indium',
  'phenyl isocyanate'],
 ['calcium nitrate',
  'sodium nitrate',
  'sodium nitrite',
  'nitrogen, liquid',
  'potassium nitrate'],
 ['steel, unalloyed',
  'hexane',
  'vinyl carbonate',
  'ascorbic acid',
  'petroleum coke'],
 ['propyl amine', 'allyl chloride', 'propane', '1-propanol', 'acrolein'],
 ['lithium manganese oxide',
  'N-methyl-2-pyrrolidone',
  'melamine',
  'N,N-dimethylformamide',
  'chloromethyl methyl ether'],
 ['o-dichlorobenzene',
  'o-chlorotoluene',
  'diphenylether-compound',
  'dioxane',
  'benzyl chloride'],
 ['calcium nitrate',
  'sodium nitrate',
  'sodium nitrite',
  'nitrogen, liquid',
  'potassium nitrate'],
 ['o-dichlorobenzene',
  'o-chlorotoluene',
  'diphenylether-compound',
  'dioxane',
  'benzyl chloride'],
 ['sodium sulfate, anhydrite',
  'dimethyl sulfate',
  'sulfuryl chloride',
  'potassium sulfate',


In [11]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import pickle
import pandas as pd


bi_encoder = SentenceTransformer("ronanki/all-mpnet-base-v2-2022-11-07")

with open('/Users/avinashronanki/PycharmProjects/machine_learning/Emitter_Search/recommendations/model_6_prod.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    corpus = stored_data['sentences']
    corpus_embeddings = stored_data['embeddings']

def search(query, top_k=5):
    results = []
    # scores =[]
    ##### Sematic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    # print(hits)
    hits = hits[0]  # Get the hits for the first query
    # score = hits[1]
    for idx, hit in enumerate(hits[0:5]):
        results.append(corpus[hit['corpus_id']])
        # scores.append(score)
    return results

In [23]:
 def negative_creation(queries:list,eco_prod:list,k:int)-> Tuple[List[list]]:
        """
        it is same as the match function but appends only the nth value
        suggested k values are 6 10 12 'hard negatives'
        Takes in list of queries/item_name, the list of ecoinvent product_name, Nth value
        This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings
        returns nth matchs and scores
        """
        top_k = k
        results = []
        scores=[]
        for query in queries:
            query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
            # We use cosine-similarity and torch.topk
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0] #https://github.com/UKPLab/sentence-transformers/blob/42ab80a122ad521a4f9055f5530a954d29d232ce/sentence_transformers/util.py#L23
            top_results = torch.topk(cos_scores, k=top_k)#https://pytorch.org/docs/stable/generated/torch.topk.html

            for score, idx in zip(top_results[0], top_results[1]):
                (eco_prod[idx], "(Score: {:.4f})".format(score))
            results.append(eco_prod[idx])
            scores.append(float("(Score: {:.4f})".format(score)[8:14]))
        return (results,scores)

In [24]:
negative_creation('zinc monosulfate',corpus, k=5 )

(['pyridine',
  'phenyl isocyanate',
  'potassium nitrate',
  'petroleum coke',
  'acrolein',
  'chloromethyl methyl ether',
  'benzyl chloride',
  'potassium nitrate',
  'benzyl chloride',
  'calcium borates',
  '2,4-dichlorotoluene',
  'aluminium chloride',
  'fluorine, liquid',
  'anthranilic acid',
  'terbium oxide',
  'vinyl chloride'],
 [0.3463,
  0.2226,
  0.3693,
  0.2346,
  0.268,
  0.3054,
  0.2456,
  0.3693,
  0.2456,
  0.3103,
  0.2745,
  0.3704,
  0.2264,
  0.2606,
  0.2584,
  0.2833])

In [None]:
import pandas as pd
from typing import Any

def vlookup(df1: pd.DataFrame, col1: str, df2: pd.DataFrame, col2: str, output_col: str) -> pd.DataFrame:
    """
    Performs a vlookup-like operation on two dataframes and returns a new dataframe with the specified columns.

    Args:
        df1: The first dataframe.
        col1: The name of the column in df1 to merge on.
        df2: The second dataframe.
        col2: The name of the column in df2 to merge on.
        output_col: The name of the column from df2 to include in the output.

    Returns:
        A new dataframe with the specified columns.
    """
    # Perform a merge operation on the two dataframes
    merged_df = pd.merge(df1, df2[[col2, output_col]], left_on=col1, right_on=col2, how='left')

    # Drop the duplicate column and rename the output column
    merged_df = merged_df.drop(col2, axis=1).rename(columns={output_col: f'{col1}_{output_col}'})

    return merged_df



In [None]:
output_df = vlookup(df1, 'ID', df2, 'Key', 'Value')

In [None]:
from sklearn.model_selection import train_test_split

def split_data(data, label_column, test_size=0.2, eval_size=0.1, random_state=42):
    """
    Split data into train, test, and eval sets using stratified sampling.

    Args:
        data (pd.DataFrame): The data to split.
        label_column (str): The name of the column containing the labels.
        test_size (float): The proportion of the data to use for testing.
        eval_size (float): The proportion of the data to use for evaluation.
        random_state (int): The random seed to use for the train-test split.

    Returns:
        pd.DataFrame: The data with an additional column "split" containing "train", "test", or "eval" for each row.
    """

    # Split the data into train and test sets, stratifying on the label column
    train_data, test_data, train_labels, test_labels = train_test_split(
        data.drop(columns=[label_column]), data[label_column], test_size=test_size, stratify=data[label_column], random_state=random_state
    )

    # Split the remaining data into eval and test sets, stratifying on the label column
    eval_data, test_data, eval_labels, test_labels = train_test_split(
        test_data.drop(columns=[label_column]), test_labels, test_size=eval_size / (1 - test_size), stratify=test_labels, random_state=random_state
    )

    # Add a "split" column to each set
    train_data["split"] = "train"
    test_data["split"] = "test"
    eval_data["split"] = "eval"

    # Combine the sets back into a single dataframe
    split_data = pd.concat([train_data, test_data, eval_data], axis=0)

    return split_data


In [48]:
!pip install nltk
!pip install gensim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting gensim
  Using cached gensim-4.3.0-cp38-cp38-macosx_10_9_x86_64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Collecting scipy>=1.7.0
  Using cached scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl (35.0 MB)
Collecting FuzzyTM>=0.4.0
  Using cached FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume
  Using cached pyFUME-0.2.25-py3-none-any.whl (67 kB)
Co

In [1]:



import nltk
import gensim
from gensim.models import Word2Vec

# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the pre-trained word2vec model
w2v_model = Word2Vec.load("path/to/your/model")
def hybrid_search(query, documents, num_results=10, threshold=0.5):
    # Tokenize the query and remove stopwords
    query_tokens = [token.lower() for token in nltk.word_tokenize(query) if token.lower() not in nltk.corpus.stopwords.words('english')]

    # Calculate the semantic similarity between the query and each document using the pre-trained word2vec model
    semantic_similarities = []
    for doc in documents:
        doc_tokens = [token.lower() for token in nltk.word_tokenize(doc) if token.lower() not in nltk.corpus.stopwords.words('english')]
        similarity = w2v_model.wv.n_similarity(query_tokens, doc_tokens)
        semantic_similarities.append(similarity)

    # Filter the documents based on keyword and semantic similarity
    relevant_docs = []
    for i, doc in enumerate(documents):
        keyword_similarity = sum([1 for token in query_tokens if token in doc.lower()]) / len(query_tokens)
        if keyword_similarity > threshold or semantic_similarities[i] > threshold:
            relevant_docs.append((doc, keyword_similarity, semantic_similarities[i]))

    # Sort the relevant documents by their combined keyword and semantic similarity scores
    relevant_docs = sorted(relevant_docs, key=lambda x: x[1] + x[2], reverse=True)[:num_results]

    # Return the top N relevant documents
    return [doc[0] for doc in relevant_docs]
query = "How to train a machine learning model"
documents = [
    "A beginner's guide to machine learning",
    "The importance of data preprocessing in machine learning",
    "Training a machine learning model with scikit-learn",
    "Deep learning for natural language processing"
]

results = hybrid_search(query, documents)
print(results)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/avinashronanki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avinashronanki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/model'

In [39]:
# !pip install sentence-transformers
# !pip install rank-bm25
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [44]:

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk


# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the pre-trained sentence transformer model
sent_model = SentenceTransformer('bert-base-nli-mean-tokens')


def hybrid_search(query, documents, num_results=10, keyword_weight=0.5):
    # Tokenize the query and remove stopwords
    query_tokens = [token.lower() for token in nltk.word_tokenize(query) if token.lower() not in nltk.corpus.stopwords.words('english')]

    # Generate sentence embeddings for the query and documents
    query_embedding = sent_model.encode(query, convert_to_tensor=True)
    doc_embeddings = sent_model.encode(documents, convert_to_tensor=True)

    # Calculate the cosine similarity between the query and each document using sentence embeddings
    semantic_similarities = (query_embedding @ doc_embeddings.T).flatten().tolist()

    # Calculate the BM25 scores between the query and each document using keyword matching
    bm25_scores = BM25Okapi(documents).get_scores(query_tokens)

    # Combine the keyword and semantic similarity scores
    combined_scores = [(i, keyword_weight * bm25_scores[i] + (1 - keyword_weight) * semantic_similarities[i]) for i in range(len(documents))]

    # Sort the relevant documents by their combined scores
    relevant_docs = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:num_results]

    # Return the top N relevant documents
    return [documents[i] for i, _ in relevant_docs]


query = "How to train a machine learning model"
documents = [
    "A beginner's guide to machine learning",
    "The importance of data preprocessing in machine learning",
    "Training a machine learning model with scikit-learn",
    "Deep learning for natural language processing"
]

results = hybrid_search(query, documents)
print(results)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/avinashronanki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avinashronanki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'fastbm25' object has no attribute 'get_scores'

In [3]:
from fastbm25 import fastbm25

query = "How to train a machine learning model"
documents = [
    "A beginner's guide to machine learning",
    "The importance of data preprocessing in machine learning",
    "Training a machine learning model with scikit-learn",
    "Deep learning for natural language processing"
]
tokenized_corpus = [doc.lower().split(" ") for doc in documents]
model = fastbm25(tokenized_corpus)
query = query.lower().split()
result = model.top_k_sentence(query,k=3)
print(result)

[(['a', "beginner's", 'guide', 'to', 'machine', 'learning'], 0, 1.19), (['training', 'a', 'machine', 'learning', 'model', 'with', 'scikit-learn'], 2, 1.1099999999999999), (['the', 'importance', 'of', 'data', 'preprocessing', 'in', 'machine', 'learning'], 1, 0.26)]


In [7]:
from sentence_transformers import SentenceTransformer
# from fastbm25 import FastBM25
import nltk
from bm25 import fastbm25 as BM25
# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the pre-trained sentence transformer model
sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

def hybrid_search(query, documents, num_results=10, keyword_weight=0.5):
    # Tokenize the query and remove stopwords
    query_tokens = [token.lower() for token in nltk.word_tokenize(query) if token.lower() not in nltk.corpus.stopwords.words('english')]

    # Generate sentence embeddings for the query and documents
    query_embedding = sent_model.encode(query, convert_to_tensor=True)
    doc_embeddings = sent_model.encode(documents, convert_to_tensor=True)

    # Calculate the cosine similarity between the query and each document using sentence embeddings
    semantic_similarities = (query_embedding @ doc_embeddings.T).flatten().tolist()

    # Calculate the BM25 scores between the query and each document using keyword matching
    bm25 = FastBM25(documents)
    bm25_scores = BM25(documents).get_scores(query_tokens)

    # Combine the keyword and semantic similarity scores
    combined_scores = [(i, keyword_weight * bm25_scores[i] + (1 - keyword_weight) * semantic_similarities[i]) for i in range(len(documents))]

    # Sort the relevant documents by their combined scores
    relevant_docs = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:num_results]

    # Return the top N relevant documents
    return [documents[i] for i, _ in relevant_docs]

query = "How to train a machine learning model"
documents = [
    "A beginner's guide to machine learning",
    "The importance of data preprocessing in machine learning",
    "Training a machine learning model with scikit-learn",
    "Deep learning for natural language processing"
]

results = hybrid_search(query, documents)
print(results)

ModuleNotFoundError: No module named 'bm25'

In [8]:
!pip install --upgrade rank-bm25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
from rank_bm25 import BM25Okapi

ModuleNotFoundError: No module named 'rank_bm25'