# DATA SELECTION - SEMANTIC RETRIEVAL WITH SENTENCE TRANSFORMERS

In [None]:
%pip install transformers sentence-transformers vaderSentiment

In [2]:
import pandas as pd
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


After filtering, there ~49k posts and comments in the dataset. Next, we will select the most relevant records that express sentiments about OpenAI, and filter out low quality data. This step will enable us to produce a high quality dataset for company reputation analysis.

Prior to using embedding-based semantic search, we experimented with TF-IDF-based retrieval, to find the most relevant records, i.e, the records with the highest cosine similarity to a given query (when using TF-IDF vectorization). However, upon manually labelling ~450 of the most relevant records selected using TF-IDF, we found that ~41% of the records were irrelevant, i.e, they express no positive/negative/neutral sentiment about OpenAI.

This is primarily because term-based vectorization methods like TF-IDF do not represent the semantic meaning of the data. Therefore, we decided to experiment with using embedding models with the Sentence Transformers library, which are specialized for conducting semantic retrieval of the most relevant data points, using cosine similarity.

We are utilizing the msmarco-distilbert-cos-v5 model as the embedding model for the following reasons:
1. As visualized during exploratory data analysis, our "passages" (comments and posts) are generally longer than the length of the queries we will be using for retrieval (see below). Therefore, we require a model for asymmetric semantic search (where the query is generally shorter in length than the passages to be retrieved). The [Sentence Transformer documentation](https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search) recommends models trained on the MS-MARCO information retrieval dataset, for asymmetric semantic search. 

2. DistilBERT is a smaller, lighter version of BERT that maintains most of the original performance. It is used as the backbone of this embedding model. Therefore, it will be efficient and quick to retrieve relevant examples from our dataset. 

3. The model performs relatively well compared to other Sentence Transformers on various [information retrieval benchmarks](https://www.sbert.net/docs/pretrained-models/msmarco-v5.html#performance).

In [3]:
# Load the embedding model
embedding_model = SentenceTransformer("msmarco-distilbert-cos-v5")

In [27]:
# Read the filtered data
filtered_data = pd.read_csv("../Data/filtered_data.csv")

In [28]:
# Define multiple search queries, corresponding to each sentiment label, to help
# retrieve a balanced dataset
queries = ["What do users think about OpenAI’s ChatGPT, DALL·E, and other AI tools?",
           "How well do OpenAI’s models perform according to user reviews?",
           "Comparison of OpenAI's products and other competitors based on user reviews",
           "Criticism and complaints about OpenAI’s products in user reviews",
           "Customer satisfaction and positive experiences with OpenAI products"]

In [29]:
# Extract the text column of filtered_data as a list 
reviews = filtered_data["text"].values.tolist()
reviews

["ChatGPT Caused 'Code Red' at Google, Report Says ",
 'how could someone use ChatGPT or other "AI" services to make some side money? I\'m just looking to make some extra cash on the side without having to get ANOTHER job and wondering what sorts of side hustles I could potentially work at.',
 'Chat gpt won’t verify my phone number Hello guys when i try to log in to chat gpt it tells me to add a phone number to verify my account but when i do and i click on send code it says error   Tried it with multiple phone numbers and now i am stuck on you have sent many codes try again later  Anyone had this issue ?',
 'Is it possible to use Whisper function for zoom meetings Hello, I am a recent Mathematics undergraduate who has been playing recently with ChatGPT3. I came up with the idea of implementing the whisper function to zoom meetings to create an app that allows the user to store transcripts of their zoom meetings (which can be further transformed to summary of it generated by chatgpt3).

In [30]:
# Generate embeddings for the queries
query_embeddings = embedding_model.encode(queries, convert_to_tensor=True)

In [None]:
# Generate embeddings for the reviews
review_embeddings = embedding_model.encode(reviews, convert_to_tensor=True)

In [31]:
# Perform cosine similarity search between the queries and reviews embeddings, and retrieve the top 5000 most similar reviews, for each query
retrieved_reviews = util.semantic_search(query_embeddings, review_embeddings, top_k = 1)

In [32]:
retrieved_reviews

[[{'corpus_id': 1, 'score': 0.42896950244903564}],
 [{'corpus_id': 1, 'score': 0.13613244891166687}],
 [{'corpus_id': 1, 'score': 0.19953207671642303}],
 [{'corpus_id': 0, 'score': 0.21180322766304016}],
 [{'corpus_id': 1, 'score': 0.14177760481834412}]]

In [34]:
# Create a dictionary to store the highest score for each unique id
# from the results of all the queries
unique_reviews = {}

for review_list in retrieved_reviews:
    for review in review_list:
        corpus_id = review['corpus_id']
        score = review['score']
        if corpus_id not in unique_reviews or score > unique_reviews[corpus_id]:
            unique_reviews[corpus_id] = score

unique_reviews

{1: 0.42896950244903564, 0: 0.21180322766304016}

In [None]:
# Modify the filtered_data DataFrame to include a new column for the cosine similarity score
# for each unique id
filtered_data['cosine_similarity'] = filtered_data.index.map(unique_reviews.get)

Unnamed: 0,post_id,subreddit,post_title,post_body,number_of_comments,readable_datetime,post_author,number_of_upvotes,query,text,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,cosine_similarity
0,1002dom,technology,"ChatGPT Caused 'Code Red' at Google, Report Says",,370,01/01/2023 0:03,slakmehl,792,ChatGPT,"ChatGPT Caused 'Code Red' at Google, Report Says",...,,,,,,,,,,0.211803
1,101o6zx,singularity,"how could someone use ChatGPT or other ""AI"" se...",I'm just looking to make some extra cash on th...,6,03/01/2023 1:25,theferalturtle,4,gpt OR GPT or Gpt,"how could someone use ChatGPT or other ""AI"" se...",...,,,,,,,,,,0.42897
2,101p00n,OpenAI,Chat gpt won’t verify my phone number,Hello guys when i try to log in to chat gpt it...,73,03/01/2023 1:56,T-boner970,3,coding with ChatGPT,Chat gpt won’t verify my phone number Hello gu...,...,,,,,,,,,,
3,102ci8x,OpenAI,Is it possible to use Whisper function for zoo...,"Hello, I am a recent Mathematics undergraduate...",11,03/01/2023 20:34,LoanOne2968,1,Whisper,Is it possible to use Whisper function for zoo...,...,,,,,,,,,,
4,102jcse,OpenAI,How could ChatGPT replace Google Search for ma...,,9,04/01/2023 1:03,keyhell,1,ChatGPT search,How could ChatGPT replace Google Search for ma...,...,,,,,,,,,,


In [None]:
# Sort the data based on the cosine similarity, and drop rows with NaN values (which were not retrieved by the semantic search)
filtered_data = filtered_data.dropna(subset=['cosine_similarity'])
filtered_data = filtered_data.sort_values('cosine_similarity', ascending=False)

In [38]:
# Display the first few rows of the selected data
filtered_data.head()

Unnamed: 0,post_id,subreddit,post_title,post_body,number_of_comments,readable_datetime,post_author,number_of_upvotes,query,text,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,cosine_similarity
1,101o6zx,singularity,"how could someone use ChatGPT or other ""AI"" se...",I'm just looking to make some extra cash on th...,6,03/01/2023 1:25,theferalturtle,4,gpt OR GPT or Gpt,"how could someone use ChatGPT or other ""AI"" se...",...,,,,,,,,,,0.42897
0,1002dom,technology,"ChatGPT Caused 'Code Red' at Google, Report Says",,370,01/01/2023 0:03,slakmehl,792,ChatGPT,"ChatGPT Caused 'Code Red' at Google, Report Says",...,,,,,,,,,,0.211803


In [39]:
filtered_data.describe()

Unnamed: 0,number_of_comments,number_of_upvotes,comment_id,comment_body,comment_author,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,cosine_similarity
count,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
mean,188.0,398.0,,,,,,,,,,,,,,,0.320386
std,257.386868,557.200144,,,,,,,,,,,,,,,0.15356
min,6.0,4.0,,,,,,,,,,,,,,,0.211803
25%,97.0,201.0,,,,,,,,,,,,,,,0.266095
50%,188.0,398.0,,,,,,,,,,,,,,,0.320386
75%,279.0,595.0,,,,,,,,,,,,,,,0.374678
max,370.0,792.0,,,,,,,,,,,,,,,0.42897


In [None]:
# Save the retrieved data to a new CSV file
filtered_data.to_csv('../Data/selected_data.csv', index=False)