In [2]:
# Setup
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


### Import schemes data and preprocess clean ###

1. Load schemes data using enriched schemes-updated-with-text.csv
2. Define preprocessing function to clean and lemmatize a text
3. Create desc_booster column combining Description, search_booster(WL), Who's it for, What it gives, Scheme Type
4. Apply preprocessing to desc_booster

In [3]:
!ls ..

Icon?
SchemesSG-main.zip
[34mSchemesSG_Yevgeniys-MacBook-Pro.local_Aug-21-224715-2024_CaseConflict[m[m
[34mSchemesSG_v3[m[m
[34mcinema-test[m[m
[34mfirebase schemes v3[m[m
[34mfirebase_schemesv2[m[m
[34mgpt4bot[m[m
[34mkruncher docs[m[m
[34mkruncher-ai[m[m
[34mmultimodal-rag-code-execution[m[m
[34mschemessg[m[m
[34mworkflowy2obsidian[m[m
[34myevkim[m[m


In [4]:
import pandas as pd
df = pd.read_csv("schemes 11 nov.csv")
df.head()

Unnamed: 0,Scheme,Description,Agency,Image,Link,Who's it for,What it gives,Scheme Type,search_booster(WL)
0,Various services,Provides casework & counselling; therapeutic g...,Montfort Care Family Service,https://chidnast.sirv.com/SchemesSG/27fsc.jpg,https://montfortcare.org.sg/services/families/,Families,"Counselling,Casework","Low Income,Family","social service, individuals, families, casewor..."
1,Financial Assistance,"Provides care and support, counselling, nutrit...",365 Cancer Prevention Society (365CPS),https://chidnast.sirv.com/SchemesSG/365cps.jpg,https://365cps.org.sg/portfolio/financial-supp...,Cancer patients,Financial assistance for cancer treatment,"Low Income,Healthcare","See doctor, cancer, oncology"
2,Food Assistance,A self setup group which distributes meal box ...,A Packet of Rice,https://chidnast.sirv.com/SchemesSG/apacketofr...,https://www.aspirantsg.com/a-packet-of-rice-br...,"Low income,Need food support",Food,"Low Income,Food","Needs help to get food, meal, hungry, have not..."
3,Rehabilitation Centre and Respite Centre,"Provides Occupational Therapy, Physiotherapy a...",Abilities Beyond Limitations and Expectations ...,https://chidnast.sirv.com/SchemesSG/able.jpg,https://www.able-sg.org/,"PWD,Elderly,Caregivers","Occupational therapy,Physiotherapy,Educational...","PWD,Family,Transport,Healthcare,Mental Health","rehabilitation, occupational therapy, physioth..."
4,Various Services,We prepare educational programs (IT Training) ...,Acronis Cyber Foundation Program,https://chidnast.sirv.com/SchemesSG/acronis.jpg,https://acronis.org/rehabilitation-of-inmates/,Ex-offenders,"Vocational training,Employment assistance","Ex-offender,Employment","ex-convict need job, ex-convict skills trainin..."


In [6]:
import spacy
import re

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def preprocessing(sentence):
    # Text cleaning steps from spacy_tokenizer
    sentence = re.sub('\'', '', sentence)  # Remove distracting single quotes
    sentence = re.sub(' +', ' ', sentence)  # Replace extra spaces
    sentence = re.sub(r'\n: \'\'.*', '', sentence)  # Remove specific unwanted lines
    sentence = re.sub(r'\n!.*', '', sentence)
    sentence = re.sub(r'^:\'\'.*', '', sentence)
    sentence = re.sub(r'\n', ' ', sentence)  # Replace non-breaking new lines with space

    # Tokenization and further processing with spaCy
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        # Check if the token is a stopword or punctuation
        if token.is_stop or token.is_punct:
            continue
        # Check for numeric tokens or tokens longer than 2 characters
        if token.like_num or len(token.text) > 2:
            # Lemmatize (handling pronouns) and apply lowercase
            lemma = token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
            tokens.append(lemma)

    # Further clean up to remove any introduced extra spaces
    processed_text = ' '.join(tokens)
    processed_text = re.sub(' +', ' ', processed_text)

    return processed_text


In [17]:
# Build desc_booster by concatenating non-null values
def build_desc_booster(row):
    components = []

    # Check each column and add non-null values
    if pd.notna(row['Description']):
        components.append(str(row['Description']))
    if pd.notna(row['search_booster(WL)']):
        components.append(str(row['search_booster(WL)']))
    if pd.notna(row["Who's it for"]):
        components.append(str(row["Who's it for"]))
    if pd.notna(row['What it gives']):
        components.append(str(row['What it gives']))
    if pd.notna(row['Scheme Type']):
        components.append(str(row['Scheme Type']))

    # Join all non-null components with spaces
    return ' '.join(components)

# Apply build_desc_booster to each row and store in desc_booster column
df['desc_booster'] = df.apply(build_desc_booster, axis=1)

# Print a sample row to verify
print("Sample desc_booster value:")
print(df['desc_booster'].iloc[0])

Sample desc_booster value:
Provides casework & counselling; therapeutic groupwork social service, individuals, families, casework, counselling Families Counselling,Casework Low Income,Family


In [18]:
# Get rows where desc_booster is NA
na_rows = df[df['desc_booster'].isna()]
print("Full rows where desc_booster is NA:")
na_rows

Full rows where desc_booster is NA:


Unnamed: 0,Scheme,Description,Agency,Image,Link,Who's it for,What it gives,Scheme Type,search_booster(WL),desc_booster


In [19]:
df['clean_text'] = df['desc_booster'].apply(lambda x: preprocessing(x))
df.head()

Unnamed: 0,Scheme,Description,Agency,Image,Link,Who's it for,What it gives,Scheme Type,search_booster(WL),desc_booster,clean_text
0,Various services,Provides casework & counselling; therapeutic g...,Montfort Care Family Service,https://chidnast.sirv.com/SchemesSG/27fsc.jpg,https://montfortcare.org.sg/services/families/,Families,"Counselling,Casework","Low Income,Family","social service, individuals, families, casewor...",Provides casework & counselling; therapeutic g...,provide casework counselling therapeutic group...
1,Financial Assistance,"Provides care and support, counselling, nutrit...",365 Cancer Prevention Society (365CPS),https://chidnast.sirv.com/SchemesSG/365cps.jpg,https://365cps.org.sg/portfolio/financial-supp...,Cancer patients,Financial assistance for cancer treatment,"Low Income,Healthcare","See doctor, cancer, oncology","Provides care and support, counselling, nutrit...",provide care support counselling nutritional s...
2,Food Assistance,A self setup group which distributes meal box ...,A Packet of Rice,https://chidnast.sirv.com/SchemesSG/apacketofr...,https://www.aspirantsg.com/a-packet-of-rice-br...,"Low income,Need food support",Food,"Low Income,Food","Needs help to get food, meal, hungry, have not...",A self setup group which distributes meal box ...,self setup group distribute meal box low incom...
3,Rehabilitation Centre and Respite Centre,"Provides Occupational Therapy, Physiotherapy a...",Abilities Beyond Limitations and Expectations ...,https://chidnast.sirv.com/SchemesSG/able.jpg,https://www.able-sg.org/,"PWD,Elderly,Caregivers","Occupational therapy,Physiotherapy,Educational...","PWD,Family,Transport,Healthcare,Mental Health","rehabilitation, occupational therapy, physioth...","Provides Occupational Therapy, Physiotherapy a...",provide occupational therapy physiotherapy pil...
4,Various Services,We prepare educational programs (IT Training) ...,Acronis Cyber Foundation Program,https://chidnast.sirv.com/SchemesSG/acronis.jpg,https://acronis.org/rehabilitation-of-inmates/,Ex-offenders,"Vocational training,Employment assistance","Ex-offender,Employment","ex-convict need job, ex-convict skills trainin...",We prepare educational programs (IT Training) ...,prepare educational program training offender ...


### Generate enhanced Document embeddings ###

1. Load pretrained all-mpnet-base-v2 model and tokenizer
2. Compute embedding for clean_text column , perform pooling and normalize embeddings
3. Create FAISS index for embeddings
4. Save model, tokenizer, embeddings, faiss index as files

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')


  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Tokenize sentences
encoded_input = tokenizer(df['clean_text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[ 0.0013,  0.0403, -0.0129,  ..., -0.0119,  0.0385,  0.0354],
        [ 0.0284,  0.0846, -0.0271,  ...,  0.0063,  0.0467, -0.0317],
        [ 0.0066,  0.0036,  0.0009,  ...,  0.0060, -0.0200,  0.0050],
        ...,
        [ 0.0151, -0.0083,  0.0042,  ...,  0.0102,  0.0376,  0.0307],
        [ 0.0004,  0.0772,  0.0100,  ..., -0.0212,  0.0140,  0.0238],
        [-0.0095,  0.0274, -0.0101,  ..., -0.0232,  0.0371,  0.0183]])


In [22]:
import faiss
import numpy as np

embeddings = sentence_embeddings
# Convert embeddings to np.float32 as required by FAISS
embeddings = np.array(embeddings).astype('float32')

# Create a FAISS index
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using the L2 distance for similarity
index.add(embeddings)  # Adding the embeddings to the index


In [29]:
# Assuming `model` is your PyTorch model and `tokenizer` is the Hugging Face tokenizer
model_save_path = './models/schemesv2-torch-allmpp-model'
tokenizer_save_path = './models/schemesv2-torch-allmpp-tokenizer'
embeddings_save_name = './models/schemesv2-your_embeddings.npy'
index_save_name = './models/schemesv2-your_index.faiss'


# Save the embeddings and index to disk
np.save(embeddings_save_name, embeddings)
faiss.write_index(index, index_save_name)

# Save model
model.save_pretrained(model_save_path)

# Save tokenizer
tokenizer.save_pretrained(tokenizer_save_path)


('./models/schemesv2-torch-allmpp-tokenizer/tokenizer_config.json',
 './models/schemesv2-torch-allmpp-tokenizer/special_tokens_map.json',
 './models/schemesv2-torch-allmpp-tokenizer/vocab.txt',
 './models/schemesv2-torch-allmpp-tokenizer/added_tokens.json',
 './models/schemesv2-torch-allmpp-tokenizer/tokenizer.json')

### Sample run

1. Load models, tokenizer, embeddings, faiss index
2. Define split_query_into_needs function to split a query into distinct needs
3. 

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import faiss


model_save_path = './models/schemesv2-torch-allmpp-model'
tokenizer_save_path = './models/schemesv2-torch-allmpp-tokenizer'
embeddings_save_name = './models/schemesv2-your_embeddings.npy'
index_save_name = './models/schemesv2-your_index.faiss'

# Load model and tokenizer at startup
model = AutoModel.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Load the embeddings and index
embeddings = np.load(embeddings_save_name)
index = faiss.read_index(index_save_name)

In [25]:
import faiss
import numpy as np

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Now, you can use `index` for similarity searches with new user queries
def search_similar_items(query_text, full_query, top_k=10):

    # preproc = preprocessing(query_text)
    preproc = query_text
    # Compute embedding for the query text
    # query_embedding = model.encode([preproc])

    # Tokenize text
    encoded_input = tokenizer([preproc], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    query_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    query_embedding = F.normalize(query_embedding, p=2, dim=1)

    query_embedding = np.array(query_embedding).astype('float32')

    # Perform the search
    distances, indices = index.search(query_embedding, top_k)
    similarity_scores =  np.exp(-distances)
    # similar_items = pd.DataFrame([df.iloc[indices[0]], distances[0], similarity_scores[0]])
    # Retrieve the most similar items
    similar_items = df.iloc[indices[0]][['Scheme', 'Agency', 'Description']]

    results = pd.concat([similar_items.reset_index(drop=True), pd.DataFrame(similarity_scores[0]).reset_index(drop=True)], axis=1)
    results = results.set_axis(['Scheme', 'Agency', 'Description', 'Similarity'], axis=1)
    results['query'] = full_query
    results = results.sort_values(['Similarity'], ascending=False)

    return results


In [26]:

def combine_and_aggregate_results(needs, user_query):
    # DataFrame to store combined results
    combined_results = pd.DataFrame(columns=['Scheme', 'Agency', 'Description', 'Similarity', 'query'])

    # Process each need
    for need in needs:
        # Get the results for the current need
        current_results = search_similar_items(need, user_query)
        # Combine with the overall results
        combined_results = pd.concat([combined_results, current_results], ignore_index=True)

    # Handle duplicates: Aggregate similarity for duplicates and drop duplicates
    aggregated_results = combined_results.groupby(['Scheme', 'Agency', 'Description', 'query'], as_index=False).agg({
        'Similarity': 'mean'  # Adjust this function as needed to aggregate similarity scores appropriately
    })

    # Sort by similarity in descending order
    sorted_results = aggregated_results.sort_values(by='Similarity', ascending=False).reset_index(drop=True)

    return sorted_results


In [27]:

def extract_needs_based_on_conjunctions(sentence):
    """Extract distinct needs based on coordinating conjunctions."""
    doc = nlp(sentence)
    needs = []
    current_need_tokens = []

    for token in doc:
        # If the token is a coordinating conjunction (e.g., 'and') and not at the start of the sentence,
        # consider the preceding tokens as one distinct need.
        if token.text.lower() in ['and', 'or'] and token.i != 0:
            if current_need_tokens:  # Ensure there's content before the conjunction
                needs.append(" ".join([t.text for t in current_need_tokens]))
                current_need_tokens = []  # Reset for the next need
        else:
            current_need_tokens.append(token)

    # Add the last accumulated tokens as a need, if any.
    if current_need_tokens:
        needs.append(" ".join([t.text for t in current_need_tokens]))

    return needs

def split_query_into_needs(query):
    """Split the query into sentences and then extract needs focusing on conjunctions."""
    sentences = split_into_sentences(query)
    all_needs = []
    for sentence in sentences:
        needs_in_sentence = extract_needs_based_on_conjunctions(sentence)
        all_needs.extend(needs_in_sentence)
    return all_needs

# Helper function to split the query into sentences
def split_into_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# Example usage
query = "my client needs a blood pressure monitor"
distinct_needs = split_query_into_needs(query)
print(f"Distinct needs: {distinct_needs}")

print(f"Distinct needs preproc : {[preprocessing(x) for x in distinct_needs]}")


Distinct needs: ['my client needs a blood pressure monitor']
Distinct needs preproc : ['client need blood pressure monitor']


In [28]:
user_query = "My client needs assistance in getting diapers and milk powder as she is from a low-income family which has only one sole breadwinner"
# user_query = "My client needs assistance as a dialysis patient. She is also in need of a job and financial support after COVID 19 has caused her to be retrenched"

split_query = split_query_into_needs(user_query)
# split_query = split_query_into_needs("I am a 31 year old married with one kid in need of more money")
print(split_query)

proc_split_query = [preprocessing(x) for x in split_query]
print(proc_split_query)
# Example usage
final_results = combine_and_aggregate_results(proc_split_query, user_query)
final_results

['My client needs assistance in getting diapers', 'milk powder as she is from a low - income family which has only one sole breadwinner']
['client need assistance get diaper', 'milk powder low income family sole breadwinner']


  combined_results = pd.concat([combined_results, current_results], ignore_index=True)


Unnamed: 0,Scheme,Agency,Description,query,Similarity
0,Financial Assistance,Muslimin Trust Fund Association,MTFA provides cash assistance given on a bi-mo...,My client needs assistance in getting diapers ...,0.345592
1,North East Growth Fund,North East CDC,The North East Growth Fund was set up to provi...,My client needs assistance in getting diapers ...,0.331537
2,ONE Fresh Food Programme,ONE Singapore,The ONE Fresh Food Programme provides healthy ...,My client needs assistance in getting diapers ...,0.325691
3,Mutual Help and Care,Bo Tien Welfare Services Society,The food ration programme is tasked as additio...,My client needs assistance in getting diapers ...,0.317856
4,Family Development Programme,Methodist Welfare Services (MWS),Monthly financial assistance to help families ...,My client needs assistance in getting diapers ...,0.316161
5,Food Care Programme,Cornerstone Community Services,Food Care Programme provides food and daily ne...,My client needs assistance in getting diapers ...,0.313127
6,Financial Assistance,BCARE,Provide short-term financial assistance for mi...,My client needs assistance in getting diapers ...,0.311132
7,Matthew 25 Soup Kitchen,Church of the Nativity,Free meals - Operates 6 days a week (excluding...,My client needs assistance in getting diapers ...,0.305568
8,Financial Assistance,The Breadline Group,"Every month, our volunteers visit each “adopte...",My client needs assistance in getting diapers ...,0.302336
9,ComCare Interim Assistance,Ministry of Social and Family Development (MSF),Provides immediate financial assistance for lo...,My client needs assistance in getting diapers ...,0.296682


### Testing the existing queries data

In [19]:
dfq = pd.read_csv("../raw_data/schemes-queries.csv")
dfq.describe()

FileNotFoundError: [Errno 2] No such file or directory: '../raw_data/schemes-queries.csv'

In [16]:

import pandas as pd

dfq_unique = dfq.drop_duplicates(subset=["Query"])
dfq_unique = dfq_unique.dropna(subset=["Query"])

# Assume you have a list of queries to test
queries = dfq_unique['Query']

# Initialize an empty DataFrame for concatenating all results
all_results_df = pd.DataFrame()

for query in queries:
    # Perform the search
    split_query = split_query_into_needs(query)
    proc_split_query = [preprocessing(x) for x in split_query]
    results = combine_and_aggregate_results(proc_split_query, query)
    top_10 = results.head(10)

    # Concatenate the current results with the overall results DataFrame
    all_results_df = pd.concat([all_results_df, top_10], ignore_index=True)


# Save the overall results DataFrame to a CSV file
all_results_df.to_csv('../raw_data/overall_search_results_transformers_laiss.csv', index=False)

print("All search results saved to 'overall_search_results_transformers_laiss.csv'")

  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_results, current_results], ignore_index=True)
  combined_results = pd.concat([combined_re

All search results saved to 'overall_search_results_transformers_laiss.csv'
