# Save the best embeddings

Save the best embeddings based on the results from 4_text_embeddings.ipynb notebook

In [2]:
%%capture
pip install -r ../requirements.txt

In [3]:
# Install the required packages
import numpy as np
import pandas as pd

# NLP
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Need this set to None otherwise text columns will truncate!
pd.set_option('display.max_colwidth', 100) 

In [4]:
import sys 
sys.path.append("../scripts/")

from access_data import authenticate_google_drive, grab_google_drive_folder_data

drive = authenticate_google_drive()
# text_data = grab_google_drive_folder_data(drive=drive,filename="reddit_filtered_data.csv")
combined_labeled_data.csv


Successfully loaded 'reddit_filtered_data.csv' into a DataFrame!


In [5]:
text_data.isna().sum()

submission_id             0
subreddit_topic           0
search_query              0
title                     0
text                    169
url                       0
score                     0
num_comments              0
username                  0
created_at                0
data_pull_date            0
days_since_post_date      0
combine_text              0
flag_picture_posts        0
dtype: int64

In [6]:
print(f"Number of rows: {text_data.shape[0]}") 

Number of rows: 4161


In [7]:
def create_embedding_dataframes(post_dataset, model_list, col_to_encode='combine_text'):

    dataset = post_dataset.copy().reset_index(drop=True)
    text_to_encode = post_dataset[col_to_encode]
    
    dfs = []

    for model in model_list:
        print('Started with model ' + model)
        load_model = SentenceTransformer(model, device="cpu") 
        post_embedding = load_model.encode(text_to_encode.values)
        data_with_embdedding = pd.concat([dataset, pd.DataFrame(post_embedding)], axis = 1)
        dfs.append(data_with_embdedding)   

        print('Done with model: ' + model)

    return dfs


# Models to try. Based off hugging face leaderboard (https://huggingface.co/spaces/mteb/leaderboard) and our impact analysis
# model_names = ["bert-base-nli-mean-tokens", "multi-qa-mpnet-base-dot-v1", "BAAI/bge-m3"]
model_names = ["bert-base-nli-mean-tokens", 'BAAI/bge-small-en-v1.5']
data = text_data.copy()


embedded_dfs = create_embedding_dataframes(text_data, model_names)

Started with model bert-base-nli-mean-tokens


KeyboardInterrupt: 

In [19]:
# Save each model's embeddings to drive here.
# This will overwrite data in drive - be sure before running. 
from access_data import save_google_drive_data

# Grab the Google Drive object
drive = authenticate_google_drive()
model_file_names = []
for model in model_names: 
    model = model.replace('/', '-').replace('-', '_')
    model = model+'.csv'
    model_file_names.append(model)

for i, k in zip(model_file_names, embedded_dfs):
    
    dataset_name = 'data_encoded_'+i

    # Save the data in the Google Drive location
    save_google_drive_data(drive=drive, 
                           dataframe = k, 
                           filename=dataset_name)


File 'data_encoded_bert_base_nli_mean_tokens.csv' uploaded successfully to folder 1Ktcv4eaR7kH0teyGuLph4LSYWxI1qkIS!
File 'data_encoded_multi_qa_mpnet_base_dot_v1.csv' uploaded successfully to folder 1Ktcv4eaR7kH0teyGuLph4LSYWxI1qkIS!
File 'data_encoded_BAAI_bge_m3.csv' uploaded successfully to folder 1Ktcv4eaR7kH0teyGuLph4LSYWxI1qkIS!
