#### Overall flow
```
Generate embeddings for the hotel posts using a Sentence-BERT model,
Calculate the centroids of the truthful and deceptive embeddings from the training set
Assign labels to the test set data based on their similarity to the centroids
```

In [12]:
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from sentence_transformers import SentenceTransformer
# Sentence-BERT checkpoint used for every embedding in this notebook.
# It was picked because its maximum sequence length is 512 (printed below).
model_name = 'all-distilroberta-v1'
# model_name = 'all-MiniLM-L6-v2'  # alternative checkpoint; check its max_seq_length with the print below
model = SentenceTransformer(model_name)
print(f"Max Sequence Length for model, {model_name}: {model.max_seq_length}")

# Chroma embedding function built from the same checkpoint name; it is both
# attached to the collection and called directly by the helper functions.
sentence_transformer_ef = SentenceTransformerEmbeddingFunction(model_name=model_name)
print(f'Default embedding function: {sentence_transformer_ef}')

# NOTE(review): the run recorded in this notebook logged "Using embedded DuckDB
# without persistence: data will be transient", so persist_directory did not
# appear to take effect with these settings -- confirm before relying on ./spam-db.
chroma_settings = Settings(persist_directory="./spam-db")
client = chromadb.Client(chroma_settings)

# Create the collection with cosine distance for its HNSW index.
collection = client.create_collection(
    name="spam-dataset",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)
print(f'collection count: {collection.count()}')

Max Sequence Length for model, all-distilroberta-v1: 512


Using embedded DuckDB without persistence: data will be transient


Default embedding function: <chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction object at 0x7fe5307050a0>
collection count: 0


In [13]:
import os
import sys
import numpy as np

# Make the parent of the current working directory importable so that the
# project-local `utils` package can be found.
project_dir = os.path.abspath(os.pardir)
sys.path.append(project_dir)

from utils.read_gold_dataset import collect_all_files, read_a_file, get_gt_from_file_name, filter_files

# Gather the review files of both classes: truthful first, then deceptive.
root_dir = '../spam-dataset/op_spam_v1.4'
file_path_list = []
for label in ('truthful', 'deceptive'):
    file_path_list.extend(collect_all_files(root_dir, label))

total_samples = len(file_path_list)
print(f'Total samples: {total_samples}')



Total samples: 1600


In [14]:
def get_embedding(file_path):
    """Embed the text of a single review file.

    Args:
        file_path: Path passed to ``read_a_file``.

    Returns:
        A ``(embeddings, veracity)`` tuple. ``embeddings`` is whatever
        ``sentence_transformer_ef`` returns for a one-element batch holding
        the file's text, so the vector for this file sits at index 0.
        ``veracity`` is the second value returned by ``read_a_file``.
    """
    _sentiment, veracity, text = read_a_file(file_path)
    batch_embeddings = sentence_transformer_ef([text])
    return batch_embeddings, veracity

def get_mean_embedding(file_path_list):
    """Return the centroid (element-wise mean) of the embeddings of the given files.

    Each file is read with ``read_a_file`` and its text is embedded on its own
    with ``sentence_transformer_ef``; the resulting vectors are averaged along
    axis 0.

    Args:
        file_path_list: Iterable of file paths to embed.

    Returns:
        The mean of the per-file embedding vectors, as produced by ``np.mean``.

    Raises:
        ValueError: If ``file_path_list`` yields no files. Without this check
            ``np.mean`` would return NaN (with only a RuntimeWarning), which
            would then silently corrupt any distance or similarity computed
            against the centroid.
    """
    embeddings = []
    for file_path in file_path_list:
        _, _, text = read_a_file(file_path)
        # The embedding function takes a batch; take the single vector back out.
        embeddings.append(sentence_transformer_ef([text])[0])

    if not embeddings:
        raise ValueError("get_mean_embedding() needs at least one file path; got an empty list")

    return np.mean(embeddings, axis=0)

In [15]:
# test/train split
from sklearn.model_selection import train_test_split
import pprint

# Hold out 20% of the files for testing; the fixed seed keeps the shuffled
# split reproducible between runs.
train_files, test_files = train_test_split(
    file_path_list,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

# Separate the training files by class and report the size of each subset.
train_files_truthful = filter_files(train_files, 'truthful')
print(f'Count of truthful training files: {len(train_files_truthful)}')

train_files_deceptive = filter_files(train_files, 'deceptive')
print(f'Count of deceptive training files: {len(train_files_deceptive)}')

# One centroid per class, computed from training files only.
truthful_mean_embedding = get_mean_embedding(train_files_truthful)
deceptive_mean_embedding = get_mean_embedding(train_files_deceptive)

# get the top 5 nearest neighbours to each of the means
# top_5 = collection.query([truthful_mean_embedding], n_results=5)
# print(top_5)
# for i in range(5):
#     print(f"ids: {top_5['ids'][0][i]}, centroid embedding: truthful")
#     pprint.pprint(top_5['documents'][0][i])
#     print(f'{"-"*20}')

# top_5 = collection.query([deceptive_mean_embedding], n_results=5)
# for i in range(5):
#     print(f"ids: {top_5['ids'][0][i]}, centroid embedding: deceptive")
#     pprint.pprint(top_5['documents'][0][i])
#     print(f'{"-"*20}')


# # Printing the results
# print(f'Train file count: {len(train_files)}, Training Data: {train_files}')
# print(f'Test file count: {len(test_files)} Testing Data: {test_files}')

Count of truthful training files: 629
Count of deceptive training files: 651


In [16]:
# Add every training review to the Chroma collection.
# Run this only once per freshly created collection.
base = 100
for doc_number, file_path in enumerate(train_files, start=base):
    gt_sentiment, gt_veracity, text = read_a_file(file_path)
    # The id is "id" + running number (starting at `base`) + veracity label;
    # the source path is stored as metadata so the label of a retrieved
    # neighbour can be recovered from its file name later.
    collection.add(
        documents=[text],
        metadatas=[{"source": file_path}],
        ids=['id' + str(doc_number) + gt_veracity],
    )
print(f'count: {collection.count()}')

count: 1280


In [17]:
# Fetch only the stored embedding vectors from the collection and average
# them along axis 0 to get one mean vector for the whole indexed set.
retrieve = collection.get(include=["embeddings"])
mean_embedding_of_set = np.asarray(retrieve["embeddings"]).mean(axis=0)
print(f'Dimension of embedding: {mean_embedding_of_set.shape}')

Dimension of embedding: (768,)


##### Assign class to be the same as nearest neighbor

In [18]:
import pprint

def get_nearest_neighbor(test_file_name):
    """Look up the single closest indexed review for one file and compare labels.

    The file's text is used as the query against ``collection``; the label of
    the best match is recovered from the ``source`` path stored in its
    metadata via ``get_gt_from_file_name``. Both labels are printed.

    Args:
        test_file_name: Path of the review file to classify.

    Returns:
        A ``(gt_veracity, gt_veracity_of_nearest_neighbor)`` tuple: the label
        read from the query file and the label of its nearest neighbour.
    """
    _, gt_veracity, text = read_a_file(test_file_name)

    # Ask for exactly one result: the closest stored document.
    results = collection.query(query_texts=text, n_results=1)

    # First (only) query, first (only) hit.
    best_match_metadata = results['metadatas'][0][0]
    file_nearest_neighbor = best_match_metadata['source']

    # The neighbour's ground truth comes from its file name.
    _, gt_veracity_of_nearest_neighbor = get_gt_from_file_name(file_nearest_neighbor)

    print(f'GT of post:{gt_veracity}, GT of nearest neighbor: {gt_veracity_of_nearest_neighbor}')
    return gt_veracity, gt_veracity_of_nearest_neighbor

# Smoke test: run the nearest-neighbour lookup on one hand-picked review and
# discard the returned labels (the function prints them itself).
# NOTE(review): this path is chosen independently of the train/test split, so
# the file may already be in the indexed training set, in which case its
# nearest neighbour would presumably be itself -- confirm before reading
# anything into the match.
test_file_name = '../spam-dataset/op_spam_v1.4/negative_polarity/truthful_from_Web/fold2/t_affinia_11.txt'
_,_ = get_nearest_neighbor(test_file_name)

GT of post:truthful, GT of nearest neighbor: truthful


In [19]:
# Run the nearest-neighbour lookup over the held-out files, then split the
# (ground truth, neighbour label) pairs into two parallel lists for scoring.
nearest_neighbor_results = [get_nearest_neighbor(test_file) for test_file in test_files]
gt_veracity_list = [gt for gt, _ in nearest_neighbor_results]
gt_veracity_of_nearest_neighbor_list = [neighbor_gt for _, neighbor_gt in nearest_neighbor_results]


GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:tru

In [20]:
# get a F1-score
from sklearn.metrics import f1_score

# Score the nearest-neighbour predictions against the ground-truth labels.
# average='weighted' computes the F1 score of each class and averages them
# weighted by the number of true samples in that class.
# Only this one metric is computed, so it is reported once under its real
# name; the former extra "F1 Score:" line printed the same weighted value
# under a label that suggested a different metric.
f1_weighted = f1_score(gt_veracity_list, gt_veracity_of_nearest_neighbor_list, average='weighted')
print("Weighted F1 Score:", f1_weighted)


Weighted F1 Score: 0.6584461952325571
F1 Score: 0.6584461952325571


#### Assign label based on the L2 distance to the centroid of the two classes

In [21]:
# classify based on centroids
from sklearn.metrics.pairwise import euclidean_distances

# Nearest-centroid classification with Euclidean (L2) distance: a test review
# is labelled 'truthful' only when it is strictly closer to the truthful
# centroid; otherwise (including ties) it is labelled 'deceptive'.
gt_veracity_list, pred_veracity_list = [], []
for test_file in test_files:
    embedding, gt_veracity = get_embedding(test_file)
    gt_veracity_list.append(gt_veracity)

    # One embedded review against one centroid per call; the single distance
    # value is read out at [0][0].
    distance_2_truth = euclidean_distances(embedding, [truthful_mean_embedding])
    distance_2_deception = euclidean_distances(embedding, [deceptive_mean_embedding])
    l2_truth = distance_2_truth[0][0]
    l2_deception = distance_2_deception[0][0]

    pred_veracity = 'truthful' if l2_truth < l2_deception else 'deceptive'
    pred_veracity_list.append(pred_veracity)

    # The padding inside the literals reproduces the spacing of the log line.
    print(
        f"GT: {gt_veracity}: Predicted: {pred_veracity}"
        f"          \tL2 to truth: {l2_truth:0.3f}, L2 to deception: {l2_deception:0.3f}"
        f"           {test_file}"
    )

# Weighted F1 over the held-out files.
f1_weighted = f1_score(gt_veracity_list, pred_veracity_list, average='weighted')
print(f'Weighted F1 Score: {f1_weighted:0.3f}')


GT: truthful: Predicted: truthful          	L2 to truth: 0.590, L2 to deception: 0.617           ../spam-dataset/op_spam_v1.4/negative_polarity/truthful_from_Web/fold3/t_hyatt_16.txt
GT: truthful: Predicted: deceptive          	L2 to truth: 0.776, L2 to deception: 0.762           ../spam-dataset/op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor/fold4/t_swissotel_18.txt
GT: truthful: Predicted: truthful          	L2 to truth: 0.733, L2 to deception: 0.750           ../spam-dataset/op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor/fold5/t_amalfi_15.txt
GT: truthful: Predicted: truthful          	L2 to truth: 0.745, L2 to deception: 0.759           ../spam-dataset/op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor/fold3/t_omni_1.txt
GT: deceptive: Predicted: deceptive          	L2 to truth: 0.538, L2 to deception: 0.521           ../spam-dataset/op_spam_v1.4/positive_polarity/deceptive_from_MTurk/fold3/d_fairmont_16.txt
GT: deceptive: Predicted: truthful          	L2 t

#### Assign label based on the cosine similarity to the centroid of the two classes

In [22]:
# classify based on centroids
from sklearn.metrics.pairwise import cosine_similarity

# Nearest-centroid classification with cosine similarity (higher means more
# similar): a test review is labelled 'truthful' only when it is strictly
# more similar to the truthful centroid; otherwise (including ties) it is
# labelled 'deceptive'.
gt_veracity_list, pred_veracity_list = [], []
for test_file in test_files:
    embedding, gt_veracity = get_embedding(test_file)
    gt_veracity_list.append(gt_veracity)

    # One embedded review against one centroid per call; the single
    # similarity value is read out at [0][0].
    similarity_2_truth = cosine_similarity(embedding, [truthful_mean_embedding])
    similarity_2_deception = cosine_similarity(embedding, [deceptive_mean_embedding])
    cos_truth = similarity_2_truth[0][0]
    cos_deception = similarity_2_deception[0][0]

    pred_veracity = 'truthful' if cos_truth > cos_deception else 'deceptive'
    pred_veracity_list.append(pred_veracity)

    # The padding inside the literals reproduces the spacing of the log line.
    print(
        f"GT: {gt_veracity}: Predicted: {pred_veracity} "
        f"          \tThe similarity to truth is: {cos_truth:0.3f} and to deception is: {cos_deception:0.3f}"
    )

# Weighted F1 over the held-out files.
f1_weighted = f1_score(gt_veracity_list, pred_veracity_list, average='weighted')
print(f"Weighted F1 Score: {f1_weighted:0.3f}")


GT: truthful: Predicted: truthful           	The similarity to truth is: 0.813 and to deception is: 0.789
GT: truthful: Predicted: deceptive           	The similarity to truth is: 0.636 and to deception is: 0.652
GT: truthful: Predicted: truthful           	The similarity to truth is: 0.681 and to deception is: 0.665
GT: truthful: Predicted: truthful           	The similarity to truth is: 0.669 and to deception is: 0.656
GT: deceptive: Predicted: deceptive           	The similarity to truth is: 0.854 and to deception is: 0.863
GT: deceptive: Predicted: truthful           	The similarity to truth is: 0.554 and to deception is: 0.530
GT: deceptive: Predicted: deceptive           	The similarity to truth is: 0.801 and to deception is: 0.824
GT: truthful: Predicted: truthful           	The similarity to truth is: 0.751 and to deception is: 0.715
GT: truthful: Predicted: truthful           	The similarity to truth is: 0.865 and to deception is: 0.851
GT: truthful: Predicted: truthful       