In [None]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks
from sklearn.model_selection import train_test_split

In [None]:
# Hugging Face repo id of the tokenizer files to load.
TOKENIZER_REPO_ID = 'Xenova/text-embedding-ada-002'

# NOTE(review): this object is a tokenizer, not an embedding model; anything
# produced by its encode() is presumably a sequence of token ids rather than a
# dense vector -- confirm that is what downstream cells expect.
tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_REPO_ID)

In [None]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv`` with default parsing options.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Replace the ``text`` column of a frame with an ``embeddings`` column.

    Every value in ``dataset["text"]`` is passed to
    ``embedding_object.encode`` and the results are stored, row for row, in a
    new ``embeddings`` column. The ``text`` column is then removed.

    Args:
        dataset: DataFrame containing a ``text`` column. It is not modified;
            the work is done on a copy.
        embedding_object: Any object exposing ``encode(value)``.

    Returns:
        A new DataFrame with the original columns minus ``text`` plus
        ``embeddings``.
    """
    encoded_rows = [embedding_object.encode(entry) for entry in dataset["text"]]
    # assign() works on a copy, so the caller's frame keeps its text column.
    return dataset.assign(embeddings=encoded_rows).drop(columns=["text"])

In [None]:
# Raw CSVs live under base-data/, one sub-folder per source corpus.
phrase_bank_csv = os.path.join("base-data", "FinancialPhraseBank", "all-data-75-above.csv")
twitter_dir = os.path.join("base-data", "twitter-financial-news-sentiment")

financial_phrase_bank_df = load_data(phrase_bank_csv)
# The Twitter corpus ships as separate train and test files; both are loaded.
zeroshot_train_df = load_data(os.path.join(twitter_dir, "sent_train.csv"))
zeroshot_test_df = load_data(os.path.join(twitter_dir, "sent_test.csv"))

In [None]:

# Fixed seed so the stratified split is identical on every Restart & Run All.
# Without it each run produces a different train/test partition.
RANDOM_SEED = 42

# Stack the three source frames into one. ignore_index=True gives every row a
# unique label; without it the default 0..n-1 index of each CSV repeats in the
# combined frame.
df = pd.concat(
    [financial_phrase_bank_df, zeroshot_train_df, zeroshot_test_df],
    ignore_index=True,
)

# Get the count of each label
label_counts = df['label'].value_counts()

# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=RANDOM_SEED,
)

# X_train / y_train hold the training texts and their labels;
# X_test / y_test hold the held-out texts and their labels.

In [None]:
# One-column frames of the labels for each partition.
y_train_df = y_train.to_frame()
y_test_df = y_test.to_frame()

# Absolute label counts per partition.
train_value_counts = y_train_df.value_counts()
test_value_counts = y_test_df.value_counts()

# Print the share of each label, training partition first, to check that the
# stratified split kept the class balance.
for split_counts in (train_value_counts, test_value_counts):
    print(split_counts.div(split_counts.sum()))


In [None]:
# Re-join each partition's texts with its labels, side by side.
train_dataset_df, test_dataset_df = [
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
]

# NOTE(review): `tokenizer` is a GPT2TokenizerFast, so the "embeddings" column
# built here presumably contains token ids, not dense embedding vectors --
# confirm this is the intended feature.
train_dataset_df_embedded, test_dataset_df_embedded = [
    get_embedding(split_df, tokenizer)
    for split_df in (train_dataset_df, test_dataset_df)
]

train_dataset_df_embedded

In [None]:
# Open a Hopsworks session. login() is called without arguments, so no
# credentials are hard-coded in this notebook.
# NOTE(review): presumably the API key is taken from the environment or an
# interactive prompt -- confirm for unattended runs.
hopsworks_project = hopsworks.login() 
# Handle to the project's feature store, used by the cells below to create
# feature groups and insert data.
fs = hopsworks_project.get_feature_store()

In [None]:
# Fetch version 1 of the training feature group, creating it on first use.
# NOTE(review): "embeddings" holds the per-row output of encode() (see
# get_embedding); confirm the feature store accepts such a column as part of
# the primary key, especially with online_enabled=True.
fg_train = fs.get_or_create_feature_group(
    name="news_sentiment_traindata",
    version=1,
    description="Training data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)

# Upload the embedded training partition.
fg_train.insert(train_dataset_df_embedded)

In [None]:
# Fetch version 1 of the test feature group, creating it on first use.
# NOTE(review): "embeddings" holds the per-row output of encode() (see
# get_embedding); confirm the feature store accepts such a column as part of
# the primary key, especially with online_enabled=True.
fg_test = fs.get_or_create_feature_group(
    name="news_sentiment_testdata",
    version=1,
    description="Test data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)

# Upload the embedded test partition.
fg_test.insert(test_dataset_df_embedded)