In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import torch
import random
import torch

random.seed(42)

In [None]:
data = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv")

## Exploratory Data Analysis

In [None]:
data.groupby(by = "discourse_type").count()

# Obtaining the topics

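This section assigns every essay to a topic by clustering the full essay texts with BERTopic. The resulting topic label serves as an approximate ground truth for which essay belongs to which topic, which simplifies the semantic-search retrieval problem.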

In [None]:
!pip install bertopic[use]

In [None]:
# Rebuild one record per essay: for each essay_id, the values of every other column
# are joined with single spaces, so `discourse_text` becomes the essay's full text
# (the document handed to the topic model further down).
essay_collection = data.groupby("essay_id").agg(" ".join)
print(essay_collection)

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sklearn.cluster import KMeans

# Cluster the essays into a fixed number of topics. KMeans is passed through
# BERTopic's `hdbscan_model` slot in place of the default HDBSCAN clusterer.
# Both stochastic steps (UMAP and KMeans) are seeded so that topic ids are
# reproducible across kernel restarts; previously only UMAP was.
cluster_model = KMeans(n_clusters=15, random_state=42)

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# English stop words are removed from the topic representations only.
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model, umap_model = umap_model, hdbscan_model=cluster_model)
# One document per essay; `topics[i]` is the topic id of the i-th essay in `essay_collection`.
topics, probs = topic_model.fit_transform(essay_collection['discourse_text'])

In [None]:
topic_model.get_topics()

In [None]:
# Record each essay's topic id, then propagate it to every discourse row of that essay.
essay_collection['topics'] = topics
essay_topic_map = {
    essay_id: essay_collection.at[essay_id, 'topics']
    for essay_id in essay_collection.index
}
data['topics'] = data['essay_id'].map(essay_topic_map)

In [None]:
data.head()

In [None]:
data.to_csv('/kaggle/working/topic_mapping.csv')

In [None]:
# claim_evidence_data = data.loc[data['discourse_type'].isin(['Claim', 'Evidence'])].reset_index()

In [None]:
# claim_evidence_data.head()

In [None]:
sentence_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Select the "Effective" discourse elements once and derive everything from that selection:
# their sentence embeddings, their row labels in `data`, and their topic ids.
effective_rows = data.loc[data['discourse_effectiveness'] == "Effective"]
embeddings = sentence_embedding_model.encode(list(effective_rows.discourse_text))
e_indices = effective_rows.index.values
print(e_indices[0])
e_topics = data.loc[e_indices, 'topics'].values


In [None]:
e_topics

In [None]:
# Bucket the row labels of the effective examples by topic id.
# Every topic id 0-14 gets a bucket, even if it ends up empty.
topic_index_mapping = {topic_id: [] for topic_id in range(15)}
for topic_id, row_label in zip(e_topics, e_indices):
    topic_index_mapping[topic_id].append(row_label)

In [None]:
def find_best_effective_example(query_indices, effective_indices, similarity_matrix, topic_index_mapping, context = False):
    """Return, for every query row, the text of its most similar effective example.

    The function reads the module-level DataFrame `data`.

    Parameters
    ----------
    query_indices : sequence of row labels of `data`
        The rows to find examples for. The caller is expected to pass rows that
        are not rated "Effective".
    effective_indices : array-like of row labels of `data`
        The candidate rows. Position ``j`` must correspond to column ``j`` of
        `similarity_matrix`.
    similarity_matrix : torch.Tensor, 2-D
        Similarity scores; row ``i`` belongs to ``query_indices[i]`` and column
        ``j`` to ``effective_indices[j]``.
    topic_index_mapping : dict
        Kept so existing calls keep working; it is not consulted (see Notes).
    context : bool, default False
        When True, additionally print the `context` column of the query row and
        of the selected example (the column must exist in `data`).

    Returns
    -------
    list
        ``data.loc[best_label, 'discourse_text']`` for each query, in query order.

    Notes
    -----
    The selection is a plain arg-max over *all* candidates; it is not restricted
    to the query's topic or discourse type. Earlier revisions computed the query
    topics and a same-type subset here but never used them for the selection, so
    that dead (and expensive) work has been removed without changing the result.
    """
    queries = data.loc[query_indices, :]
    # Read the text column by name once; the previous `queries.values[idx][2]`
    # rebuilt an object array of the whole frame on every iteration and relied
    # on `discourse_text` being the third column.
    query_texts = queries['discourse_text'].tolist()
    # One arg-max along the candidate axis instead of several per row.
    # Skipped for an empty query set so a zero-sized matrix cannot raise.
    best_columns = torch.argmax(similarity_matrix, dim=1).tolist() if len(queries) else []

    better_examples = []
    print("Mapped Query Topics")
    for idx, query_label in enumerate(queries.index):
        best_label = effective_indices[best_columns[idx]]
        print("\nQuery:", query_texts[idx])
        example = data.loc[best_label, 'discourse_text']
        better_examples.append(example)
        if context:
            print("Query context:", queries.loc[query_label, 'context'])
            print("Matching context:", data.loc[best_label, 'context'])
        print("Better example:", example)
    return better_examples

In [None]:
# Every row that is not rated "Effective" is treated as a query.
# Take an explicit copy: new columns are assigned to `test_set` later in the notebook,
# and assigning into a slice of `data` triggers pandas' SettingWithCopyWarning.
test_set = data.loc[data['discourse_effectiveness'] != "Effective"].copy()
q_indices = test_set.index
test_set.shape

In [None]:
query_embeddings = sentence_embedding_model.encode(list(test_set['discourse_text']))

In [None]:
embeddings.shape

In [None]:
# Pairwise cosine similarity: one row per query text, one column per effective text.
similarity_matrix = util.cos_sim(query_embeddings, embeddings)  #query embeddings vs effective embeddings

In [None]:
# For every query row, store the effective example whose text is most similar.
# `q_indices` is `test_set.index`, so the returned list lines up with the rows of `test_set`.
test_set['predictions'] = find_best_effective_example(q_indices, e_indices, similarity_matrix, topic_index_mapping)

In [None]:
# data['predictions'] = data.apply(lambda x : find_best_effective_example(x['discourse_text'], x['discourse_type']), axis = 1)

In [None]:
test_set.to_csv('/kaggle/working/dataset_with_best_example_and_topic.csv')

In [None]:

#test_data  = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')

In [None]:
#find_best_effective_example(test_data.loc[3, 'discourse_text'], test_data.loc[3, 'discourse_type'])
    

In [None]:
#test_set = data.loc[data['discourse_effectiveness'] != 'Effective'].sample(1000)
#embeddings 

In [None]:
test_set.head()

In [None]:
# test_set['predictions'] = test_set.apply(lambda x : find_best_effective_example(x['discourse_text'], x['discourse_type']), axis = 1)

In [None]:
import gc
gc.collect()

In [None]:
"""from pathlib import Path  
filepath = Path('/kaggle/working/test_predictions.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
test_set.to_csv(filepath) """ 

# What if we take the essays with the most similar context and then output the corresponding discourse element as an example?


In [None]:
# Print one essay element by element to see how its discourse elements follow each other.
# `Series.sample` draws from NumPy's random state, which the `random.seed(42)` call in the
# import cell does not seed; an explicit `random_state` shows the same essay on every run.
random_id = data['essay_id'].sample(1, random_state=42).values[0]
print(random_id)
new_data = data.loc[data['essay_id'] == random_id]
for index, row in new_data.iterrows():
    print("X ", row["discourse_text"]," ",row["discourse_type"], " ", row["discourse_effectiveness"])

In [None]:
data.isnull().sum()

In [None]:
def _evidence_run(labelled_types):
    """Return the labels of the first unbroken run of "Evidence" entries."""
    run = []
    for label, kind in labelled_types:
        if kind == "Evidence":
            run.append(label)
        elif run:
            # The run has ended; anything after it is not collected.
            break
    return run


def _first_of_type(labelled_types, wanted):
    """Return ``[label]`` of the first entry of type `wanted`, or ``[]`` if none."""
    for label, kind in labelled_types:
        if kind == wanted:
            return [label]
    return []


def _evidence_anchor(labelled_types):
    """Collect every "Rebuttal" passed until the first "Claim"/"Counterclaim" (inclusive)."""
    found = []
    for label, kind in labelled_types:
        if kind in ("Claim", "Counterclaim"):
            found.append(label)
            break
        if kind == "Rebuttal":
            found.append(label)
    return found


def get_context(row, dataset):
    """Collect the texts that give context to one discourse element.

    Parameters
    ----------
    row : pandas.Series
        A row of `dataset`; ``row.name`` must be its label in ``dataset.index``.
        The columns ``essay_id``, ``discourse_type`` and ``discourse_text`` are read.
    dataset : pandas.DataFrame
        All discourse elements, with the rows of an essay in reading order.
        The index is assumed to be unique.

    Returns
    -------
    list of str or pandas.Series
        * Lead, Position, Counterclaim (and any unrecognised type): a one-element
          list holding the element's own text.
        * Claim: Lead/Position of the essay plus the first unbroken run of
          Evidence that follows the claim.
        * Evidence: Lead/Position plus, walking backwards from the evidence,
          every Rebuttal passed until the nearest Claim or Counterclaim (inclusive).
        * Rebuttal: Lead/Position, the first run of Evidence that follows it and
          the nearest preceding Counterclaim.
        * Concluding Statement: every other non-concluding element of the essay.
        For these types a Series of ``discourse_text`` is returned.

    Notes
    -----
    All searches are confined to the element's own essay and start at the
    element's position *within that essay*. The previous implementation used
    ``row.name`` (a label of the full dataset) as a position inside the essay
    sub-frame, and scanned the whole dataset for rebuttals, so it could pick
    rows from the wrong place or from a different essay. It also relied on
    ``DataFrame.append``, which pandas 2.0 removed.
    """
    discourse_element = row['discourse_type']
    if discourse_element in ("Lead", "Position", "Counterclaim"):
        return [row['discourse_text']]

    essay = dataset.loc[dataset["essay_id"] == row['essay_id']]
    kinds = essay["discourse_type"]
    framing = list(kinds[kinds.isin(["Lead", "Position"])].index)

    if discourse_element == 'Concluding Statement':
        labels = list(kinds[kinds != discourse_element].index)
    elif discourse_element in ("Claim", "Evidence", "Rebuttal"):
        # Position of this element inside its own essay (not inside `dataset`).
        position = essay.index.get_loc(row.name)
        # Both walks start at the element itself, which is never collected
        # because its own type is not one the walks look for.
        following = list(kinds.iloc[position:].items())
        preceding = list(kinds.iloc[position::-1].items())
        if discourse_element == "Claim":
            labels = framing + _evidence_run(following)
        elif discourse_element == "Evidence":
            labels = framing + _evidence_anchor(preceding)
        else:
            labels = framing + _evidence_run(following) + _first_of_type(preceding, "Counterclaim")
    else:
        # Unknown discourse type: fall back to the element's own text instead of
        # failing on an unbound local.
        return [row['discourse_text']]

    # Non-mutating replacement for the old `drop_duplicates(inplace = True)` on a slice.
    return essay.loc[labels].drop_duplicates(keep = "first")['discourse_text']
        
                
            
        
        
        
        
    

In [None]:
get_context(data.iloc[5, :], data).values

In [None]:
# Build the `context` string of every row.
# `get_context` returns either a plain list or a pandas Series of texts; `str.join`
# accepts both, so no `.values` / bare-`except` fallback is needed (the bare `except`
# also silently retried on genuine errors).
# The strings are collected first and assigned in one step instead of writing the
# frame cell by cell inside the loop.
context_texts = []
for index, row in data.iterrows():
    retrieved_context = get_context(row, data)
    context_texts.append(' '.join(retrieved_context))
data['context'] = context_texts
    

In [None]:
data.to_csv('full_context.csv')

In [None]:
# Embed the context strings of the effective examples and of the queries,
# then score every (query, effective) pair by cosine similarity.
effective_contexts = list(data.loc[e_indices, 'context'])
effective_context_embeddings = sentence_embedding_model.encode(effective_contexts)

query_contexts = list(data.loc[q_indices, 'context'])
query_context_embeddings = sentence_embedding_model.encode(query_contexts)

context_matrix = util.cos_sim(query_context_embeddings, effective_context_embeddings)

## Extracting the elements with the most similar context

In [None]:
# Same lookup, but ranked by the similarity of the context strings.
# `context = True` additionally prints the context of each query and of its match.
test_set['context_predictions'] = find_best_effective_example(q_indices, e_indices, context_matrix, topic_index_mapping, context = True)

In [None]:
test_set.to_csv('context_examples.csv')

# Augmenting the matrix to focus on the combination of contextual similarity and prompt similarity


In [None]:
# Element-wise product of the text-similarity and context-similarity matrices, so a
# candidate scores high only when both its text and its context resemble the query.
# NOTE(review): cosine similarities can be negative, and the product of two negative
# scores is positive — confirm this cannot promote a pair that is dissimilar on both.
augmented_sim_matrix = torch.mul(similarity_matrix, context_matrix)

In [None]:
# Best example per query under the combined (text x context) score; contexts are printed too.
test_set['augmented_predictions'] = find_best_effective_example(q_indices, e_indices, augmented_sim_matrix, topic_index_mapping, context = True)

In [None]:
test_set.to_csv('augmented_predictions_all.csv')