In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
# Import necessary libraries
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import faiss
import os

# Load sentiment analysis pipeline with a state-of-the-art model
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# Load embedding model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Load and sample the dataset
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser('~/.kaggle')
!kaggle datasets download -d Cornell-University/arxiv --unzip
data_path = 'arxiv-metadata-oai-snapshot.json'
data = pd.read_json(data_path, lines=True)
sampled_data = data.sample(n=10000, random_state=42).reset_index(drop=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Dataset URL: https://www.kaggle.com/datasets/Cornell-University/arxiv
License(s): CC0-1.0
Downloading arxiv.zip to /content
100% 1.37G/1.37G [01:18<00:00, 21.6MB/s]
100% 1.37G/1.37G [01:18<00:00, 18.7MB/s]


In [None]:
# Turn every sampled abstract into a dense vector with the embedding model.
abstracts = sampled_data['abstract'].tolist()
abstract_embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Exact (brute-force) L2 index sized to the embedding width; vectors are added
# in sample order, so a FAISS id equals the row position in `sampled_data`.
dimension = abstract_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(abstract_embeddings)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
# Function to analyze sentiment of a given text
def analyze_sentiment(text, max_length=512):
    """Classify the sentiment of a single piece of text.

    Args:
        text: The string to classify.
        max_length: Maximum number of tokens handed to the model. Longer
            inputs are truncated instead of being passed through whole, which
            would otherwise fail once the text exceeds the encoder's position
            limit. The limit is given explicitly rather than relying on the
            tokenizer's configured default.

    Returns:
        A ``(label, score)`` tuple taken from the first prediction returned by
        ``sentiment_analyzer``.
    """
    # Tokenizer keyword arguments given at call time are forwarded by the
    # text-classification pipeline to its tokenizer.
    result = sentiment_analyzer(text, truncation=True, max_length=max_length)[0]
    return result['label'], result['score']

In [None]:
# Function to retrieve abstracts with opposite sentiment
def find_opposite_sentiment_abstracts(input_text, n=5, k=100):
    """Print up to ``n`` semantically similar abstracts whose sentiment differs.

    The input text is embedded, its ``k`` nearest neighbours are fetched from
    the FAISS ``index``, and the neighbours are scanned from closest to
    farthest. Each one whose predicted sentiment label differs from the
    input's label is kept, until ``n`` have been collected.

    Note that "opposite" here means "not the same label": if the sentiment
    model emits more than two labels, any other label qualifies.
    NOTE(review): the configured checkpoint appears to be three-class, in
    which case neutral abstracts count as opposite - confirm this is intended.

    Args:
        input_text: Query text to compare against.
        n: Maximum number of abstracts to print. Values <= 0 print nothing.
        k: Number of nearest neighbours to retrieve as candidates. It is
            clamped to the number of vectors stored in the index.

    Returns:
        None. The matching records are printed.
    """
    if n <= 0:
        return

    # Never ask FAISS for more neighbours than it holds: the surplus slots
    # come back as id -1, which `iloc` would silently read as "last row".
    k = min(k, index.ntotal)
    if k <= 0:
        return

    input_sentiment, _ = analyze_sentiment(input_text)
    input_embedding = embedding_model.encode(input_text, convert_to_numpy=True)

    # search() expects a 2-D batch of queries, hence the wrapping list.
    _, indices = index.search(np.array([input_embedding]), k=k)
    filtered_results = []

    for idx in indices[0]:
        if idx < 0:
            # Defensive: skip any padding id should the index return one.
            continue

        row = sampled_data.iloc[idx]
        candidate_sentiment, _ = analyze_sentiment(row['abstract'])

        if candidate_sentiment != input_sentiment:
            filtered_results.append(row)
            # Stop early so sentiment is only computed for as many
            # candidates as are needed to fill the quota.
            if len(filtered_results) >= n:
                break

    # Display results directly from the list
    for row in filtered_results:
        print(f"Title: {row['title']}")
        print(f"Authors: {row['authors']}")
        print(f"Categories: {row['categories']}")
        print("Abstract:\n" + row['abstract'])
        print("\n" + "="*100 + "\n")

In [None]:
# Demo query: an upbeat paragraph about machine learning. The search should
# surface related abstracts whose sentiment label differs from this text's.
input_text = """
Machine learning has revolutionized numerous industries by significantly
enhancing predictive accuracy and automating complex tasks. From personalized
recommendations to advanced medical diagnostics, machine learning algorithms
enable unprecedented levels of efficiency and accuracy. In fields like finance,
healthcare, and marketing, machine learning models provide valuable insights
by processing vast amounts of data quickly and accurately, which would be
impossible through traditional methods. Moreover, the adaptability of machine
learning models allows them to improve over time, leading to continuous
optimization. This technology not only saves time and resources but also opens
up new avenues for research and development, promising a future of smarter,
data-driven decisions.
"""

# Number of differing-sentiment abstracts to print.
n = 5

find_opposite_sentiment_abstracts(input_text, n=n)

Title: MARVIN: An Open Machine Learning Corpus and Environment for Automated
  Machine Learning Primitive Annotation and Execution
Authors: Chris A. Mattmann, Sujen Shah, Brian Wilson
Categories: cs.LG stat.ML
Abstract:
  In this demo paper, we introduce the DARPA D3M program for automatic machine
learning (ML) and JPL's MARVIN tool that provides an environment to locate,
annotate, and execute machine learning primitives for use in ML pipelines.
MARVIN is a web-based application and associated back-end interface written in
Python that enables composition of ML pipelines from hundreds of primitives
from the world of Scikit-Learn, Keras, DL4J and other widely used libraries.
MARVIN allows for the creation of Docker containers that run on Kubernetes
clusters within DARPA to provide an execution environment for automated machine
learning. MARVIN currently contains over 400 datasets and challenge problems
from a wide array of ML domains including routine classification and regression
to adv