In [5]:
!pip install transformers --quiet
!pip install accelerate --quiet
!pip install bitsandbytes --quiet
!pip install langchain --quiet

In [6]:
pip install langchain-community langchain-core --quiet

Note: you may need to restart the kernel to use updated packages.


In [7]:
# General packages
import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from textwrap import fill
from IPython.display import Markdown, display # for formating Python display folowing markdown language
import warnings
warnings.filterwarnings('ignore') # avoid warning messages importing packages

In [8]:
# Mistral and LangChain packages (prompt engineering)
import torch
from langchain import PromptTemplate, HuggingFacePipeline
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

2024-08-17 07:57:39.676433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 07:57:39.676594: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 07:57:39.819475: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Model version of Mistral
MODEL_NAME = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Quantization is a technique used to reduce the memory and computation requirements 
# of deep learning models, typically by using fewer bits, 4 bits
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Initialization of a tokenizer for the Mistral-7b model, 
# necessary to preprocess text data for input
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Initialization of the pre-trained language Mistral-7b
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config
)

# Configuration of some generation-related settings
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 20 # maximum number of new tokens that can be generated by the model
generation_config.temperature = 0.7 # randomness of the generated tex
generation_config.top_p = 0.95 # diversity of the generated text
generation_config.do_sample = True # sampling during the generation process
generation_config.repetition_penalty = 1.15 # the degree to which the model should avoid repeating tokens in the generated text

# A pipeline is an object that works as an API for calling the model
# The pipeline is made of (1) the tokenizer instance, the model instance, and
# some post-procesing settings. Here, it's configured to return full-text outputs
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# HuggingFace pipeline
llm = HuggingFacePipeline(pipeline=pipe)

  warn_deprecated(


In [51]:
import re
# Define the emotion labels
subject_labels =  ['computer science','physics', 'mathematics', 'statistics', 'quantitative biology', 'quantitative finance']

In [53]:

def extract_subject(text):
    """Extracts text outside [INST] and [/INST] tags."""
    last_inst_index = text.rfind("[/INST]")
    if last_inst_index != -1:
        my_text = text[last_inst_index + len("[/INST]") :]
        my_text= my_text.lower()
        #print(my_text, "%%%%%%%%")
        response_text = my_text.replace('[', ' ').replace('\n',' ').replace(']', ' ').replace('\'', ' ').replace('.',' ')
        detected_subject = response_text.split(" ")
        #print('***********',detected_emotions)
        return [1 if subject in detected_subject else 0 for subject in subject_labels]
    else:
        return [0] * len(subject_labels)



In [54]:
import pandas as pd
import csv
data= pd.read_csv('/kaggle/input/abstract-multi-label-dataset/Test_Sample.csv')
data_t = data.copy()
#Extract abstract text from dataset
test_texts = data_t['ABSTRACT'].tolist()
#Extract actual_values for subject
data_t.drop(columns=['TITLE']).values
test_labels = data_t.drop(columns=['ID','ABSTRACT','TITLE']).values

In [55]:
predictions = []
for text in test_texts:
    #text = "i want to go out  of this house"
    # In English

    query = f"""[INST] Being an expert who can detect subject in a given abstract
        enclosed in “” please read the provided abstract. 
        Identify and list the subject that the provided abstract might be of or closest to, list only one
        Consider the following subjects: 
        Computer Science
        Physics
        Mathematics
        Statistics
        Quantitative Biology
        Quantitative Finance
        Other
        If no subjects are detected or the abstract is unclear, return ['Other']. 
        your answer should be in the form of a Python list containing subject label only mentioned above and presented like as an example
        given below:
        Subject : ['Statistics']
            Text: {text}
            Subject: 
            .[/INST] """
    result = llm(query)
    #display(Markdown(f"<b>{query.removeprefix('[INST]').removesuffix('[/INST]')}</b>"))
    #display(Markdown(f"<p>{result}</p>"))
    r_label = extract_subject(result)
    print("****", r_label)
    predictions.append(r_label)

**** [0, 0, 0, 1, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 1, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 0, 0, 0, 0, 0]
**** [0, 1, 0, 0, 0, 0]
**** [0, 0, 0, 1, 0, 0]


In [56]:
results_df = pd.DataFrame(predictions, columns=subject_labels)
print(results_df)
actual_df = pd.DataFrame(test_labels, columns=subject_labels)
print(actual_df)

    computer science  physics  mathematics  statistics  quantitative biology  \
0                  0        0            0           1                     0   
1                  0        1            0           0                     0   
2                  0        1            0           0                     0   
3                  0        0            0           0                     0   
4                  0        1            0           0                     0   
5                  0        0            0           0                     0   
6                  0        0            1           0                     0   
7                  0        0            0           0                     0   
8                  0        0            0           0                     0   
9                  0        0            0           0                     0   
10                 0        0            0           0                     0   
11                 0        0           

In [57]:
for i in range(20):
    print(test_labels[i],"\n",predictions[i], "\n\n")
#print(test_labels, "\n\n", predictions)

[0 0 1 1 0 0] 
 [0, 0, 0, 1, 0, 0] 


[0 1 1 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 1, 0, 0, 0, 0] 


[0 0 1 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[1 0 1 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 0, 1, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 1 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 0 1 0 0 0] 
 [0, 0, 0, 0, 0, 0] 


[1 0 0 1 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 0 1 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[0 0 1 1 0 0] 
 [0, 0, 0, 0, 0, 0] 


[0 1 0 0 0 0] 
 [0, 1, 0, 0, 0, 0] 


[1 0 1 1 0 0] 
 [0, 0, 0, 0, 0, 0] 




In [58]:
from sklearn.metrics import f1_score, precision_score, recall_score
# Compute evaluation metrics
f1_micro = f1_score(test_labels, predictions, average='micro')
f1_macro = f1_score(test_labels, predictions, average='macro')
precision_micro = precision_score(test_labels, predictions, average='micro')
precision_macro = precision_score(test_labels, predictions, average='macro')
recall_micro = recall_score(test_labels, predictions, average='micro')
recall_macro = recall_score(test_labels, predictions, average='macro')

# Print the evaluation metrics
print(f"F1 Micro: {f1_micro}")
print(f"F1 Macro: {f1_macro}")
print(f"Precision Micro: {precision_micro}")
print(f"Precision Macro: {precision_macro}")
print(f"Recall Micro: {recall_micro}")
print(f"Recall Macro: {recall_macro}")

F1 Micro: 0.22727272727272727
F1 Macro: 0.150997150997151
Precision Micro: 0.5555555555555556
Precision Macro: 0.25
Recall Micro: 0.14285714285714285
Recall Macro: 0.11904761904761903
