# RAG: Multilingual News Retrieval

In [4]:
import os

import pandas as pd

# Define the CSV file path.
# The location can be overridden with the RAG_DATASET_CSV environment variable so the
# notebook is not tied to a single machine; the original absolute path stays the default.
csv_file_path = os.environ.get(
    "RAG_DATASET_CSV",
    "/Users/anirudhhegde/Desktop/Northeastern University/Natural Language Processing/Project/RAG-Based-Multilingual-News-Retrieval/dataset.csv",
)

# Load the CSV as a DataFrame
try:
    df = pd.read_csv(csv_file_path)
except Exception as e:
    # Report the problem, then re-raise: continuing would leave `df` undefined (or
    # stale from an earlier kernel run) and every later cell would fail or mislead.
    print(f"Error loading the CSV file: {e}")
    raise
else:
    print("DataFrame loaded successfully!")
    print(df.head())  # Display the first few rows of the DataFrame

DataFrame loaded successfully!
                                             summary     topic  \
0  Transport in a cattle carriage, smell of meat ...  Politics   
1  Marble zebra stripes, pompous buildings: Sinde...  Politics   
2  Oskar Lafontaine resigns as party chairman of ...  Politics   
3  The roots of poverty lie in the past. Haiti is...  Politics   
4  Black-yellow is not yet the dream coalition th...  Politics   

                                               title        date  \
0        Auschwitz: Memories of a Holocaust Survivor  00/01/2010   
1  Municipalities in Need (3): Sindelfingen - Bey...  00/01/2010   
2  Staff debate on the left - who is coming to La...  00/01/2010   
3             History of Haiti - Napoleon's disgrace  00/01/2010   
4  Black-and-yellow cabinet - Merkel's team in th...  00/01/2010   

                                     translated_text  
0  Transport in a cattle carriage, smell of meat ...  
1  Marble zebra stripes, pompous buildings: Sinde... 

In [5]:
# Preview the first rows of the loaded frame using the notebook's table display.
df.head()

Unnamed: 0,summary,topic,title,date,translated_text
0,"Transport in a cattle carriage, smell of meat ...",Politics,Auschwitz: Memories of a Holocaust Survivor,00/01/2010,"Transport in a cattle carriage, smell of meat ..."
1,"Marble zebra stripes, pompous buildings: Sinde...",Politics,Municipalities in Need (3): Sindelfingen - Bey...,00/01/2010,"Marble zebra stripes, pompous buildings: Sinde..."
2,Oskar Lafontaine resigns as party chairman of ...,Politics,Staff debate on the left - who is coming to La...,00/01/2010,"This Monday, when the country’s left-wing lead..."
3,The roots of poverty lie in the past. Haiti is...,Politics,History of Haiti - Napoleon's disgrace,00/01/2010,The portrait of 1791 shows Haiti’s national he...
4,Black-yellow is not yet the dream coalition th...,Politics,Black-and-yellow cabinet - Merkel's team in th...,00/01/2010,New heads and old acquaintances: Angela Merkel...


In [6]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   summary          3000 non-null   object
 1   topic            3000 non-null   object
 2   title            3000 non-null   object
 3   date             3000 non-null   object
 4   translated_text  3000 non-null   object
dtypes: object(5)
memory usage: 117.3+ KB


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the SentenceTransformer model used for both article and query embeddings.
# NOTE(review): the "multi" in 'multi-qa-mpnet-base-dot-v1' appears to refer to the
# multiple question-answer datasets it was trained on, not to multiple languages —
# confirm against the model card. The articles are embedded from the English
# 'translated_text' column, so a multilingual model is not required here.
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Generate embeddings for the 'translated_text'.
# A single batched encode() call replaces the per-row .apply(): the model processes
# the texts in batches instead of one forward pass per article.
print("Generating embeddings for the articles...")
embeddings = embedding_model.encode(
    df['translated_text'].tolist(),
    normalize_embeddings=True,
)

# FAISS expects a C-contiguous float32 matrix with one vector per row.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Keep one vector per article on the frame, as before, for any later cell that reads it.
df['embedding'] = list(embeddings)

# Initialize FAISS index for vector storage and search
dimension = embeddings.shape[1]  # Embedding dimension
# Exact (brute-force) L2 index. The vectors are unit-normalised above, so ranking by
# L2 distance gives the same order as ranking by cosine similarity.
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)  # Add embeddings to the FAISS index

# Index positions are used later as row positions into `df`, so the sizes must match.
assert faiss_index.ntotal == len(df), "FAISS index size does not match the number of articles"
print(f"FAISS index contains {faiss_index.ntotal} items.")

Generating embeddings for the articles...
FAISS index contains 3000 items.


In [19]:
def encode_query(query):
    """Embed a user query with the shared sentence-embedding model.

    The query is encoded with ``normalize_embeddings=True``, the same setting
    used when the article embeddings were generated.
    """
    query_vector = embedding_model.encode(query, normalize_embeddings=True)
    return query_vector

# Example user query
# Alternative query kept for experimentation:
# query = "America donald trump election"
query = 'climate change summit renewable energy policy global emissions 2024'
# Embed the query once; later cells reuse `query_embedding` for search and scoring.
query_embedding = encode_query(query)

In [20]:
def search_faiss(query_embedding, top_k=5):
    """Retrieve top-k most similar articles from FAISS index.

    Args:
        query_embedding: Query vector, either 1-D of length ``d`` or already
            shaped ``(1, d)``. It is converted to float32 before searching.
        top_k: Maximum number of neighbours to return.

    Returns:
        ``(indices, distances)``: two 1-D arrays of equal length, in the order
        returned by the index. Fewer than ``top_k`` entries are returned when
        the index cannot supply that many neighbours.
    """
    # FAISS searches a float32 matrix with one query per row.
    query_matrix = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index.search(query_matrix, top_k)
    indices, distances = indices[0], distances[0]

    # FAISS pads missing results with label -1. Passed to df.iloc, -1 would
    # silently select the last row, so the padding is dropped here.
    found = indices >= 0
    return indices[found], distances[found]

# Run the retrieval for the example query
top_k = 5
indices, distances = search_faiss(query_embedding, top_k=top_k)

# Look up the matching rows by position in the original frame
retrieved_articles = df.iloc[indices]
preview_columns = ['title', 'topic', 'translated_text', 'summary']
print("Retrieved Articles:")
print(retrieved_articles[preview_columns])

Retrieved Articles:
                                                  title  \
2331  The 'glaciargate' brings another shock to heat...   
2009  The EU is unable to overcome the failure of Co...   
1954  Spain promotes a 30% increase in European emis...   
1504                 A 'community' to live from the air   
2821     Climate: the new industrial carbon tax targets   

                                                  topic  \
2331  The Committee recommends that the State party ...   
2009  The Committee recommends that the State party ...   
1954  The Committee recommends that the State party ...   
1504                                   diario catalunya   
2821                                           a-la-une   

                                        translated_text  \
2331  The Intergovernmental Panel on Climate Change ...   
2009  The European Environment Ministers failed to o...   
1954  Teresa Ribera is a special case: the Secretary...   
1504  The López family consumes ab

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from bert_score import score
import pandas as pd

# Calculate cosine similarity and BERTScore for retrieved articles
def analyze_similarity(query, query_embedding, retrieved_articles, embeddings, indices):
    """Score retrieved articles against the query and against their own summaries.

    Three scores are attached to every retrieved article:

    * ``cosine_similarity``: cosine similarity between ``query_embedding``
      and the article's row in ``embeddings``.
    * ``query_summary_similarity``: BERTScore F1 with the article summary
      passed as candidate and the raw query text as reference.
    * ``news_summary_similarity``: BERTScore F1 with the article summary
      passed as candidate and ``translated_text`` as reference.

    Args:
        query: Raw query text.
        query_embedding: 1-D embedding of ``query``.
        retrieved_articles: DataFrame of retrieved rows, in the same order as
            ``indices``. Must contain 'title', 'summary' and 'translated_text'.
        embeddings: Array of article embeddings, indexed by ``indices``.
        indices: Row positions into ``embeddings``, one per retrieved article.

    Returns:
        DataFrame with columns 'title', 'cosine_similarity',
        'query_summary_similarity' and 'news_summary_similarity', sorted by
        'cosine_similarity' in descending order. The frame is also printed.
        An empty input yields an empty frame with those columns.

    Raises:
        ValueError: If a required column is missing, or if ``indices`` and
            ``retrieved_articles`` differ in length.
    """
    result_columns = [
        "title", "cosine_similarity", "query_summary_similarity", "news_summary_similarity"
    ]

    # Fail fast: validate the inputs before the expensive BERTScore passes, so a
    # missing column is not discovered only after both models have run.
    required_columns = ("title", "summary", "translated_text")
    missing = [col for col in required_columns if col not in retrieved_articles.columns]
    if missing:
        raise ValueError(
            "DataFrame must contain 'title', 'summary' and 'translated_text' columns "
            f"for similarity comparison; missing: {missing}"
        )
    if len(indices) != len(retrieved_articles):
        raise ValueError(
            f"indices has {len(indices)} entries but retrieved_articles has "
            f"{len(retrieved_articles)} rows; they must describe the same articles."
        )

    # Nothing retrieved: return an empty, correctly shaped result instead of
    # letting the similarity routines fail on empty input.
    if len(retrieved_articles) == 0:
        result_df = pd.DataFrame(columns=result_columns)
        print("Titles and All Similarity Scores:")
        print(result_df)
        return result_df

    # Step 1: Cosine Similarity between query and retrieved articles
    retrieved_embeddings = embeddings[indices]
    cosine_similarities = cosine_similarity([query_embedding], retrieved_embeddings).flatten()

    # Add similarity scores to a copy so the caller's frame is not modified
    retrieved_articles = retrieved_articles.copy()
    retrieved_articles['cosine_similarity'] = cosine_similarities

    # Sort articles by cosine similarity score
    retrieved_articles = retrieved_articles.sort_values(by='cosine_similarity', ascending=False)

    # Step 2: BERTScore Similarity for query-summaries and news-summaries
    summaries = retrieved_articles["summary"].tolist()

    # Task 1: Similarity Between Query and Summaries
    query_list = [query] * len(retrieved_articles)  # Repeat the query for all summaries
    _, _, F1_query_summary = score(
        summaries,
        query_list,
        lang="en",
        model_type="bert-base-uncased"
    )
    retrieved_articles["query_summary_similarity"] = F1_query_summary.tolist()

    # Task 2: Similarity Between News Articles (translated_text) and Summaries
    _, _, F1_news_summary = score(
        summaries,
        retrieved_articles["translated_text"].tolist(),
        lang="en",
        model_type="bert-base-uncased"
    )
    retrieved_articles["news_summary_similarity"] = F1_news_summary.tolist()

    # Step 3: Return only the title and similarity scores
    result_df = retrieved_articles[result_columns]

    print("Titles and All Similarity Scores:")
    print(result_df)

    return result_df

# Usage example: relies on `query`, `query_embedding`, `retrieved_articles`, `embeddings`
# and `indices` defined by the earlier cells of this notebook.
final_df = analyze_similarity(query, query_embedding, retrieved_articles, embeddings, indices)

Titles and All Similarity Scores:
                                                  title  cosine_similarity  \
2331  The 'glaciargate' brings another shock to heat...           0.539866   
2009  The EU is unable to overcome the failure of Co...           0.539767   
1954  Spain promotes a 30% increase in European emis...           0.527164   
1504                 A 'community' to live from the air           0.499522   
2821     Climate: the new industrial carbon tax targets           0.484886   

      query_summary_similarity  news_summary_similarity  
2331                  0.364418                 0.504805  
2009                  0.413350                 0.572294  
1954                  0.353279                 0.443289  
1504                  0.348424                 0.460392  
2821                  0.347860                 0.431925  
