In [6]:
%%capture
# Use the explicit %pip line magic rather than relying on automagic to resolve a bare `pip`;
# %pip targets the environment of the running kernel. %%capture hides the installer output.
%pip install -r ../../requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# Import the required packages
import sys 
import requests #needed to perform HTTP GET and POST requests
import numpy as np
import pandas as pd

# NLP
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Disable the column-width limit; otherwise long text columns are truncated when displayed.
pd.options.display.max_colwidth = None

In [8]:
# Put the shared scripts directory on the import path so the project helpers can be imported.
# Guarded so that re-running this cell does not append the same entry again.
if "../../scripts/" not in sys.path:
    sys.path.append("../../scripts/")
from process_text_data import text_embeddings, compute_similarity_scores

In [9]:
# Load the posts table from the project's data directory.
text_data = pd.read_csv(filepath_or_buffer='../../data/posts.csv')

In [10]:
def create_embedding_dataframes(post_dataset, model_list, col_to_encode='text', device="cpu"):
    """Embed one text column with each model and return one DataFrame per model.

    Parameters
    ----------
    post_dataset : pandas.DataFrame
        Source rows. It is not modified; a reset-index copy is used for the output.
    model_list : str or iterable of str
        Model name(s) passed to ``SentenceTransformer``. A single string is
        treated as a one-element list instead of being iterated per character.
    col_to_encode : str, default 'text'
        Column whose values are embedded. Missing values are replaced by the
        empty string and every value is cast to ``str`` before encoding.
    device : str, default "cpu"
        Device argument forwarded to ``SentenceTransformer``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per model, in ``model_list`` order. Each frame holds the
        columns of ``post_dataset`` after ``reset_index()`` (so the previous
        index is kept as a column) followed by the embedding values in
        integer-named columns.

    Raises
    ------
    KeyError
        If ``col_to_encode`` is not a column of ``post_dataset``.
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(model_list, str):
        model_list = [model_list]

    # reset_index() already returns a new frame, so the caller's data is untouched.
    # The fresh 0..n-1 index is what lines the rows up with the embedding frame below.
    dataset = post_dataset.reset_index()

    # Clean the text once, up front: it is identical for every model.
    text_to_encode = post_dataset[col_to_encode].fillna("").astype(str).tolist()

    dfs = []

    for model in model_list:

        print('Started with model ' + model)

        # Load the model
        load_model = SentenceTransformer(model, device=device)

        # Embed the posts
        post_embedding = load_model.encode(text_to_encode)

        # Attach the embedding columns to the rows they were computed from
        data_with_embedding = pd.concat([dataset, pd.DataFrame(post_embedding)], axis=1)

        dfs.append(data_with_embedding)

        print('Done with model: ' + model)

    return dfs

 
# Candidate models, chosen from the Hugging Face MTEB leaderboard
# (https://huggingface.co/spaces/mteb/leaderboard) and our impact analysis.
model_names = [
    "multi-qa-mpnet-base-dot-v1",
    "BAAI/bge-m3",
]
data = text_data.copy()

# Embed a fixed 10-row sample (seeded, so the same rows are picked on every run).
embedded_dfs = create_embedding_dataframes(
    post_dataset=text_data.sample(n=10, random_state=0),
    model_list=model_names,
)

Started with model multi-qa-mpnet-base-dot-v1
Done with model: multi-qa-mpnet-base-dot-v1
Started with model BAAI/bge-m3
Done with model: BAAI/bge-m3


In [11]:
# Show one row of the first model's output; seeded so the displayed row is the same on every run.
embedded_dfs[0].sample(random_state=0)

Unnamed: 0,index,submission_id,subredit_topic,search_query,title,text,score,num_comments,username,created_at,...,758,759,760,761,762,763,764,765,766,767
8,4487,1dspvjg,personalfinance,Wells Fargo,Wells Fargo Super checks question,"Am I understanding this correctly, I can just write a check to myself (even up to the enitre credit limit) and use it to pay whatever. And it counts as a balance transfer? So 3% loan? This seems too good to be true, I must be missing something?",3,5,ParticularClean9568,2024-07-01 10:40:56,...,0.081928,0.249572,-0.068738,-0.133499,-0.320599,0.506896,-0.020875,-0.015559,-0.086569,0.012423


In [10]:
# Save each model's embeddings to Google Drive.
# WARNING: this overwrites the existing files in Drive, so the code below is left commented out.
# Uncomment it only when you are sure you want to re-upload.

# # Grab the Google Drive object
# drive = authenticate_google_drive('../0_data_collection/credentials/google_drive_client_secret.json')
# model_file_names = ["multi-qa-mpnet-base-dot-v1", "BAAI/bge-m3"]

# for i, k in zip(model_file_names, embedded_dfs):
    
#     dataset_name = 'data_encoded_'+i

#     # # Save the data in the Google Drive location
#     save_google_drive_data(drive=drive, 
#                            credential_file="../0_data_collection/credentials/google_drive_folder_id.json",  
#                            dataframe = k, 
#                            filename=dataset_name)

#     path = '../../data/'+dataset_name+'.csv'


File 'data_encoded_multi-qa-mpnet-base-dot-v1' uploaded successfully to folder 1kJ6TrI9MVT5mfnnYvS-OpRMJFVbIQ6Tl!
File 'data_encoded_BAAI/bge-m3' uploaded successfully to folder 1kJ6TrI9MVT5mfnnYvS-OpRMJFVbIQ6Tl!
