In [51]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-distilroberta-v1")

print(f'Default embedding function: {sentence_transformer_ef}')
client = chromadb.Client(Settings(persist_directory="./spam-db")) # Does persistence work??


# collection = client.create_collection(name="spam-dataset")
collection = client.create_collection(
        name="spam-dataset",
        embedding_function=sentence_transformer_ef,
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )

Using embedded DuckDB without persistence: data will be transient


Default embedding function: <chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction object at 0x7f37f2073700>


In [52]:
import os
import sys
import numpy as np

# Add the parent directory of the project to the sys.path
project_dir = os.path.abspath(os.path.join(os.path.dirname('.'), ".."))

sys.path.append(project_dir)


from utils.read_gold_dataset import collect_all_files, read_a_file, get_gt_from_file_name, filter_files

root_dir = '../spam-dataset/op_spam_v1.4'
file_path_list = collect_all_files(root_dir, 'truthful')
file_path_list += collect_all_files(root_dir, 'deceptive')
total_samples = len(file_path_list)
print(f'Total samples: {total_samples}')



Total samples: 1600


In [53]:
def get_embedding(file_path):
    _, veracity, text = read_a_file(file_path)
    # print(f'read file in get_embedding: {text}')
    # sentence_transformer_ef([text])
    return sentence_transformer_ef([text]), veracity

def get_mean_embedding(file_path_list):
    print(f'length of  file_path_list: {len(file_path_list)}')
    embeddings = []
    for i, file_path in enumerate(file_path_list):
        _, _, text = read_a_file(file_path)
        # print(f'index {i}: file name: {file_path}: text\n {text}')
        embeddings.append(sentence_transformer_ef([text])[0])
    return np.mean(embeddings, axis=0)

In [54]:
# test/train split
# Import necessary libraries
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and testing sets
train_files, test_files = train_test_split(file_path_list, test_size=0.20, random_state=42, shuffle=True)
train_files_truthful = filter_files(train_files, 'truthful')
train_files_deceptive = filter_files(train_files, 'deceptive')

truthful_mean_embedding = get_mean_embedding(train_files_truthful)
deceptive_mean_embedding = get_mean_embedding(train_files_deceptive)

# # Printing the results
# print(f'Train file count: {len(train_files)}, Training Data: {train_files}')
# print(f'Test file count: {len(test_files)} Testing Data: {test_files}')

length of  file_path_list: 629
length of  file_path_list: 651


In [55]:
# do this only once
base = 100
for i, file_path in enumerate(train_files):
    gt_sentiment, gt_veracity, text = read_a_file(file_path)
    ids = ['id' + str(i + base) ]
    metadatas = [{"source": file_path}]
    # print(ids)
    # print(metadatas)
    collection.add(
        documents = [text],
        metadatas = metadatas,
        ids = ids
    )
print(f' count: {collection.count()}')

 count: 1280


In [56]:
# collection.get()
# Only get documents and ids
retrieve = collection.get(include=[ "embeddings" ])
print(np.mean(retrieve["embeddings"], axis=0))  # mean of the entire set

[-3.20832816e-03 -2.45070084e-04  1.31912966e-04 -3.78276987e-02
  9.61867975e-03  3.82074828e-02  1.79261490e-02 -5.79305427e-02
  1.82731184e-02 -1.64145699e-02 -1.66191044e-02 -1.48169902e-02
 -2.65283101e-03 -1.21896510e-02 -4.36431332e-02 -2.09264752e-02
 -4.83074484e-02  1.56539022e-02 -1.76374580e-02  4.61146354e-03
 -5.57032770e-04  2.80260105e-02 -1.25254576e-02  1.68680328e-02
 -6.94187994e-03 -1.95102422e-02  2.98285566e-02  1.50280020e-02
  3.40084782e-03 -7.64126260e-03  3.81063243e-02  1.88356510e-02
  2.75709266e-02  9.41512472e-03 -3.12188016e-02  6.32779991e-03
 -3.87499304e-02 -1.99834276e-02  2.06393819e-02  1.00861219e-02
  2.08237042e-02 -4.16354464e-03  1.76348793e-02  1.12485659e-02
  9.90359731e-03 -4.70307255e-02  1.08582842e-02  1.05462384e-02
  9.82743147e-05  3.06319097e-02 -3.20306138e-03 -1.45975059e-02
  3.19844505e-02 -3.70746171e-02 -9.16430119e-03 -8.06339124e-03
  2.72845396e-02  1.07085384e-03 -1.19046170e-02  6.38449258e-04
  4.77756271e-03  1.58275

In [57]:
import pprint

def get_nearest_neighbor(test_file_name):
    _, gt_veracity, text = read_a_file(test_file_name)
    # pprint.pprint(text)
    # print(f'veracity: {gt_veracity}')
    results = collection.query(
        query_texts= text,
        n_results=1
    )
    # pprint.pprint(results)
    file_nearest_neighbor = results['metadatas'][0][0]['source'] 
    # print(file_nearest_neighbor)
    # extract ground_truth from spam file name
    _, gt_veracity_of_nearest_neighbor = get_gt_from_file_name(file_nearest_neighbor)
    print(f'GT of post:{gt_veracity}, GT of nearest neighbor: {gt_veracity_of_nearest_neighbor}')
    return gt_veracity, gt_veracity_of_nearest_neighbor

test_file_name = '../spam-dataset/op_spam_v1.4/negative_polarity/truthful_from_Web/fold2/t_affinia_11.txt'
get_nearest_neighbor(test_file_name)

GT of post:truthful, GT of nearest neighbor: truthful


('truthful', 'truthful')

In [58]:
gt_veracity_list = []
gt_veracity_of_nearest_neighbor_list = []
for test_file in test_files:
    gt_veracity, gt_veracity_of_nearest_neighbor = get_nearest_neighbor(test_file)
    gt_veracity_list.append(gt_veracity)
    gt_veracity_of_nearest_neighbor_list.append(gt_veracity_of_nearest_neighbor)


GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:deceptive, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: truthful
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:truthful, GT of nearest neighbor: deceptive
GT of post:deceptive, GT of nearest neighbor: deceptive
GT of post:tru

In [59]:
# get a F1-score
from sklearn.metrics import f1_score

# Calculate F1 score
# f1 = f1_score(gt_veracity_list, gt_veracity_of_nearest_neighbot_list)
f1_weighted = f1_score(gt_veracity_list, gt_veracity_of_nearest_neighbor_list, average='weighted')
print("Weighted F1 Score:", f1_weighted)

print("F1 Score:", f1_weighted)


Weighted F1 Score: 0.6584461952325571
F1 Score: 0.6584461952325571


In [66]:
# classify based on centroids
from sklearn.metrics.pairwise import euclidean_distances

gt_veracity_list = []
pred_veracity_list = []
for test_file in test_files:
    embedding, gt_veracity = get_embedding(test_file)
    gt_veracity_list.append(gt_veracity)
    distance_2_truth = euclidean_distances(embedding, [truthful_mean_embedding])
    distance_2_deception = euclidean_distances(embedding, [deceptive_mean_embedding])
    if distance_2_truth < distance_2_deception:
        pred_veracity = 'truthful'
    else:
        pred_veracity = 'deceptive'
    pred_veracity_list.append(pred_veracity)
    print(f"gr: {gt_veracity}: pred: {pred_veracity} The L2 to truth is: {distance_2_truth[0][0]} and to deception is: {distance_2_deception[0][0]}")

# Calculate F1 score
f1_weighted = f1_score(gt_veracity_list, pred_veracity_list, average='weighted')
print("Weighted F1 Score:", f1_weighted)

print("F1 Score:", f1_weighted)

gr: truthful: pred: truthful The L2 to truth is: 0.5897091516153223 and to deception is: 0.6170163170493819
gr: truthful: pred: deceptive The L2 to truth is: 0.7764709284866239 and to deception is: 0.762446985881593
gr: truthful: pred: truthful The L2 to truth is: 0.7328517686909526 and to deception is: 0.749723778449425
gr: truthful: pred: truthful The L2 to truth is: 0.7446753487740614 and to deception is: 0.7587364898005041
gr: deceptive: pred: deceptive The L2 to truth is: 0.5382422330284976 and to deception is: 0.5212927213095552
gr: deceptive: pred: truthful The L2 to truth is: 0.8485730047435488 and to deception is: 0.8723455215175923
gr: deceptive: pred: deceptive The L2 to truth is: 0.6041951910248893 and to deception is: 0.5732903043623461
gr: truthful: pred: truthful The L2 to truth is: 0.6611660011288522 and to deception is: 0.6989618373916872
gr: truthful: pred: truthful The L2 to truth is: 0.5234090549017032 and to deception is: 0.538181739333183
gr: truthful: pred: truth