In [19]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Location of the raw news dump, relative to the notebook's working directory.
file_path = 'data/raw_news_data/bitcoin_news.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can fail or mangle non-ASCII text in the articles.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [7]:
# Tunable settings for this experiment, named so they are not buried as
# magic values in the statements below.
N_ARTICLES = 30  # number of leading entries of `data` to keep; None keeps all
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Work only on the first N_ARTICLES entries of the loaded data.
short_data = data[:N_ARTICLES]

# Sentence-embedding model used for the articles and the query further down.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
# Pull the three text fields out of every article as parallel lists:
# position i in each list belongs to short_data[i].
titles, summaries, full_articles = (
    [entry[field] for entry in short_data]
    for field in ('title', 'summary', 'full_article')
)

In [8]:
# Embed each text field separately with the same model, in the order
# titles -> summaries -> full articles.
titles_embeddings, summaries_embeddings, full_articles_embeddings = map(
    model.encode,
    (titles, summaries, full_articles),
)

In [21]:
query = "Give information about latest bitcoin news"
query_embedding = model.encode(query)


def _query_similarity_column(query_vec, doc_embeddings):
    """Score every document embedding against the query in one batched call.

    Args:
        query_vec: Embedding of the query, as returned by ``model.encode``.
        doc_embeddings: Document embeddings, one row per document.

    Returns:
        A NumPy array of shape ``(n_docs, 1)``, i.e. the single-feature
        column layout passed to ``MinMaxScaler.fit_transform`` below.
    """
    # One call scores the query against all rows at once, instead of a Python
    # loop that builds a separate 1x1 tensor for each document.
    scores = model.similarity(query_vec, doc_embeddings)
    return scores.cpu().numpy().reshape(-1, 1)


# Min-max normalise each field's similarities independently, so the weights
# below are applied to scores on a common 0..1 scale. fit_transform refits
# the scaler on every call, so reusing one instance does not mix the fields.
scaler = MinMaxScaler()
full_articles_similarities = scaler.fit_transform(
    _query_similarity_column(query_embedding, full_articles_embeddings)
).flatten()
titles_similarities = scaler.fit_transform(
    _query_similarity_column(query_embedding, titles_embeddings)
).flatten()
summaries_similarities = scaler.fit_transform(
    _query_similarity_column(query_embedding, summaries_embeddings)
).flatten()

# NOTE(review): the saved output of this cell shows an empty summary for the
# top hit, so some summaries are blank strings whose embeddings still feed
# into the score. Consider skipping or down-weighting empty fields.

# Combine the normalized scores using a weighted sum
weights = {'full_articles': 0.5, 'titles': 0.3, 'summaries': 0.2}
combined_scores = (weights['full_articles'] * full_articles_similarities +
                   weights['titles'] * titles_similarities +
                   weights['summaries'] * summaries_similarities)

# Retrieve the top-k elements based on the combined scores.
# argsort is ascending: take the last k indices and reverse them so the
# best match comes first.
k = 3
top_k_indices = np.argsort(combined_scores)[-k:][::-1]
top_k_similarities = [(titles[i], summaries[i], full_articles[i], combined_scores[i]) for i in top_k_indices]

# Print the top-k results
for title, summary, article, similarity in top_k_similarities:
    print(f"Title: {title}")
    print(f"Summary: {summary}")
    print(f"Content: {article}")
    print(f"Similarity: {similarity}\n")


Title: Royal Capital Pro Sees Record Purchases of Cryptocurrencies in the Arabian Peninsula
Summary: 
Content: TALLINN, Estonia, January 8, 2018 /PRNewswire/ -- [Royal Capital Pro]( the Arabian Peninsula's leading online trading platform for forex and more than 70 other underlying assets, including indices, commodities and stocks, has announced that its sales of cryptocurrencies have now reached millions of dollars and continue to grow at a whopping rate of 200 percent a month. "The unprecedented demand is due to the simple fact that bitcoins are simply running out," states Nasser Hashem, Chief Broker at Royal Capital Pro. "The natural response from investors is to purchase as many as they can right now before the last one is sold." Bitcoin, the world's first decentralized digital currency, was structured from the start to expand to a finite number of 21 million, but only four million remain to be created. That simple statistic means that their value will continue to increase in revers