In [None]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks


In [None]:
def create_sentiment_csv(file_path, file_name='sentiment.csv'):
    """Convert a ``sentence@sentiment`` text file into a two-column CSV.

    Every non-blank line of ``file_path`` is split on its LAST ``@`` so that
    sentences which themselves contain ``@`` (e-mail addresses, handles) stay
    intact. The sentiment word is mapped to an integer label
    (negative=0, positive=1, neutral=2) and the rows are written to
    ``file_name`` with the columns ``text`` and ``label``.

    Args:
        file_path: Path of the input text file; it is read as latin1.
        file_name: Path of the CSV file to write.

    Raises:
        ValueError: If a non-blank line contains no ``@`` separator.
        KeyError: If the sentiment is not negative, positive or neutral.
    """
    sentiment_map = {"negative": 0, "positive": 1, "neutral": 2}
    data = []

    with open(file_path, 'r', encoding="latin1") as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not line.strip():
                continue
            # Only the final "@" separates the label; earlier ones belong to the sentence.
            sentence, sentiment = line.rsplit("@", 1)
            # strip() removes the newline / trailing whitespace around the label.
            data.append([sentence, sentiment_map[sentiment.strip()]])

    df = pd.DataFrame(data, columns=["text", "label"])
    df.to_csv(file_name, index=False, sep=',')

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Return a copy of ``dataset`` with ``text`` replaced by ``embeddings``.

    Args:
        dataset: DataFrame that has a ``text`` column.
        embedding_object: Object exposing ``encode(text)``; it is called once
            per row, in row order.

    Returns:
        A new DataFrame holding the remaining columns plus an ``embeddings``
        column with the ``encode`` results. ``dataset`` itself is not modified.
    """
    encoded_rows = [embedding_object.encode(text) for text in dataset["text"]]

    result = dataset.copy()
    result["embeddings"] = encoded_rows
    return result.drop(columns=["text"])

def get_decoding(dataset, embedding_object):
    """Return a copy of ``dataset`` with ``embeddings`` replaced by ``text``.

    Args:
        dataset: DataFrame that has an ``embeddings`` column.
        embedding_object: Object exposing ``decode(value)``; it is called once
            per row, in row order.

    Returns:
        A new DataFrame holding the remaining columns plus a ``text`` column
        with the ``decode`` results. ``dataset`` itself is not modified.
    """
    decoded_rows = [embedding_object.decode(encoded) for encoded in dataset["embeddings"]]

    result = dataset.copy()
    result["text"] = decoded_rows
    return result.drop(columns=["embeddings"])

In [None]:
# Load the three source CSVs from the local "base-data" directory.
financial_phrase_bank_df = load_data(
    os.path.join("base-data", "FinancialPhraseBank", "all-data-75-above.csv")
)

# Train / test splits of the twitter-financial-news-sentiment data.
zeroshot_train_df = load_data(
    os.path.join("base-data", "twitter-financial-news-sentiment", "sent_train.csv")
)
zeroshot_test_df = load_data(
    os.path.join("base-data", "twitter-financial-news-sentiment", "sent_test.csv")
)

In [None]:
# Load the pretrained tokenizer published as 'Xenova/text-embedding-ada-002'.
# NOTE(review): this is a tokenizer, so the "embeddings" built from it downstream are
# presumably token ids rather than dense vectors — confirm that is what is intended.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

In [None]:
# Encode every phrase-bank sentence with the tokenizer; the returned frame keeps the
# other columns and carries an "embeddings" column in place of "text".
embedded_financial_phrase_bank_df = get_embedding(financial_phrase_bank_df, tokenizer)

In [None]:
# Preview only the first rows instead of rendering the whole encoded frame.
embedded_financial_phrase_bank_df.head()

In [None]:
# Log in to Hopsworks and fetch the feature store handle of the resulting project.
# NOTE(review): login() is called without arguments, so credentials presumably come
# from the environment or an interactive prompt — confirm before running unattended.
hopsworks_project = hopsworks.login() 
fs = hopsworks_project.get_feature_store()

In [None]:
# Fetch (or create) version 1 of the online-enabled feature group "test" and insert
# the encoded phrase-bank frame into it.
# NOTE(review): the primary key includes "embeddings", which get_embedding fills with the
# encoder output for each row — presumably a list; confirm Hopsworks accepts such a column as a key.
embedding_fg = fs.get_or_create_feature_group(name="test", version=1, description="test", primary_key=["label", "embeddings"], online_enabled=True)
embedding_fg.insert(embedded_financial_phrase_bank_df)

In [None]:
# Open a feature-store connection and look up version 1 of the "test" feature group.
# `fs` is rebound here to the store named "id2223labs_featurestore", replacing the
# handle obtained from the project login.
# NOTE(review): presumably this is the same store the login already returned — confirm;
# if so, this reconnect may be redundant.
connection = hopsworks.hsfs.connection()
fs = connection.get_feature_store(name="id2223labs_featurestore")
fg = fs.get_feature_group('test', version=1)

In [None]:
# Select the "embeddings" and "label" features and keep the result of show(5).
# NOTE(review): presumably show(5) returns the first five rows as a DataFrame — confirm.
temp = fg.select(["embeddings", "label"]).show(5)

In [None]:
# Decode the rows read back from the feature group with the same tokenizer and print
# the "text" entry at key 0 (presumably as a round-trip sanity check).
decoded = get_decoding(temp, tokenizer)
print(decoded["text"][0])