In [None]:
import os
import torch
import logging
import spacy
import json
import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset, Dataset
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.callbacks import StdOutCallbackHandler
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, RagRetriever, RagSequenceForGeneration
from sentence_transformers import SentenceTransformer

In [None]:
# Load environment variables from .env file
load_dotenv()

# Root logger configuration: timestamp, level, source file and line, message
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d : %(message)s",
)
logger = logging.getLogger(__name__)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# spaCy pipeline used by preprocess_text for POS and DEP tagging
nlp = spacy.load("en_core_web_sm")

In [None]:
# Function for POS and DEP tagging
def preprocess_text(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and split the tags.

    Args:
        text: The string to analyse.

    Returns:
        tuple: ``(tokens, pos_tags, dep_tags)`` - three parallel lists holding
        each token's ``text``, ``pos_`` and ``dep_`` attribute, in token order.
    """
    tokens, pos_tags, dep_tags = [], [], []
    # One pass over the parsed document fills all three parallel lists.
    for tok in nlp(text):
        tokens.append(tok.text)
        pos_tags.append(tok.pos_)
        dep_tags.append(tok.dep_)
    return tokens, pos_tags, dep_tags

# Load and preprocess the dataset
def load_and_preprocess_dataset(dataset_name="squad", split="train[:10%]"):
    """Load a dataset split and attach POS/DEP annotations to every entry.

    Args:
        dataset_name: Name handed to ``load_dataset``.
        split: Split expression handed to ``load_dataset``.

    Returns:
        list[dict]: One dict per entry with the raw ``context`` and
        ``question`` plus the tokens, POS tags and DEP tags that
        ``preprocess_text`` produced for each of them.
    """
    def _annotate(record):
        # Read the fields first, then tag the context before the question.
        query = record['question']
        passage = record['context']
        passage_tokens, passage_pos, passage_dep = preprocess_text(passage)
        query_tokens, query_pos, query_dep = preprocess_text(query)
        return {
            "context": passage,
            "question": query,
            "context_tokens": passage_tokens,
            "context_pos": passage_pos,
            "context_dep": passage_dep,
            "question_tokens": query_tokens,
            "question_pos": query_pos,
            "question_dep": query_dep,
        }

    records = load_dataset(dataset_name, split=split)
    return [_annotate(record) for record in tqdm(records)]

# Load JSON files and extract documents
def load_json_files(dataset_dir):
    """Collect question documents from the JSON files of one dataset directory.

    The file names are read from the environment variables
    ``DEP_MAPPING_FILE``, ``POS_MAPPING_FILE``, ``TEST_FILE``, ``TRAIN_FILE``
    and ``VAL_FILE`` and resolved relative to ``dataset_dir``. Every JSON
    object that has a ``'question'`` key becomes a ``Document`` whose
    ``page_content`` is the question and whose metadata carries
    ``question_pos_tokens`` / ``question_dep_ids`` (empty lists when absent).

    Problems are reported with a printed message and skipped instead of
    raised: an unset directory or file variable, a missing file, a file that
    cannot be decoded as JSON, and items without a ``'question'`` key.

    Args:
        dataset_dir: Directory that contains the JSON files. ``None`` or an
            empty string yields an empty list.

    Returns:
        list: ``Document`` objects in file order, then item order.
    """
    file_env_vars = (
        'DEP_MAPPING_FILE',
        'POS_MAPPING_FILE',
        'TEST_FILE',
        'TRAIN_FILE',
        'VAL_FILE',
    )

    documents = []

    # os.getenv returns None for a missing variable and os.path.join raises
    # TypeError on None, so unset values are skipped instead of joined.
    if not dataset_dir:
        print("Warning: dataset directory is not set. Skipping.")
        return documents

    for env_var in file_env_vars:
        file_name = os.getenv(env_var)
        if not file_name:
            print(f"Warning: environment variable {env_var} is not set. Skipping.")
            continue

        file = os.path.join(dataset_dir, file_name)
        if not os.path.exists(file):
            print(f"Warning: {file} not found. Skipping.")
            continue

        try:
            # Explicit UTF-8 so the result does not depend on the platform's
            # default text encoding.
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error: Could not decode JSON from {file}. Skipping.")
            continue

        # A single top-level object is treated as a one-item list.
        items = data if isinstance(data, list) else [data]

        for item in items:
            # Only JSON objects can carry a 'question' field; anything else
            # (numbers, strings, nested lists) is reported and skipped.
            if isinstance(item, dict) and 'question' in item:
                documents.append(Document(
                    page_content=item['question'],
                    metadata={
                        'pos_tags': item.get('question_pos_tokens', []),
                        'dep_tags': item.get('question_dep_ids', []),
                    },
                ))
            else:
                print(f"Warning: No suitable field found in {file}. Skipping this item.")
    return documents

In [None]:
# Dataset directories are configured through environment variables (.env);
# os.getenv returns None for any variable that is not set.
dataset_dir_vars = ['DATASET_DIR_1', 'DATASET_DIR_2', 'DATASET_DIR_3']
dataset_dirs = [os.getenv(env_var) for env_var in dataset_dir_vars]
all_documents = []

# Loop through each configured dataset and load all the documents
for env_var, dataset_dir in zip(dataset_dir_vars, dataset_dirs):
    if not dataset_dir:
        # An unset directory would reach os.path.join inside load_json_files
        # as None and raise TypeError, so it is skipped here instead.
        print(f"Warning: environment variable {env_var} is not set. Skipping.")
        continue
    documents = load_json_files(dataset_dir)
    all_documents.extend(documents)

print(f"Total documents loaded: {len(all_documents)}")

In [None]:
# Embedding model name and the local file store backing the embedding cache
embed_model_id = 'multi-qa-distilbert-cos-v1'
store = LocalFileStore("./cache/")

# Underlying embedding model
core_embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id)

# Cache-backed wrapper around the model; the model id doubles as the cache namespace
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model,
    store,
    namespace=embed_model_id,
)

# Index every loaded document with FAISS and persist the index to disk
vector_store = FAISS.from_documents(all_documents, embedder)
vector_store.save_local('./vector_store/rag_task_vector_store')


In [None]:
# Load the Llama-2 chat model with 4-bit quantization and wrap it for LangChain
model_id = "meta-llama/Llama-2-13b-chat-hf"

# 4-bit NF4 weights with double quantization; compute dtype is bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # NOTE(review): trust_remote_code allows repository-supplied code to run;
    # kept as in the original - confirm it is actually required for this model.
    trust_remote_code=True,
    # The bitsandbytes settings must be passed as `quantization_config`.
    # `config` is the slot for the model's own PretrainedConfig, so passing
    # the BitsAndBytesConfig there does not enable 4-bit loading.
    quantization_config=bnb_config,
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text-generation pipeline; the full text (prompt + completion) is returned
generate_text = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    max_new_tokens=1000
)

# LangChain LLM wrapper around the transformers pipeline
llm = HuggingFacePipeline(pipeline=generate_text)


In [None]:
# Set up RetrievalQA with the vector store and LLAMA model
def ask_questions_from_df(df_questions, vector_store, llm, k=5):
    """Answer every question in a DataFrame with a RetrievalQA chain.

    The chain is built once and reused for all rows, because neither the
    retriever nor the LLM depends on the individual question.

    Args:
        df_questions: DataFrame with a ``'Question'`` column. It is modified
            in place: an ``'Answer'`` column is added (or overwritten).
        vector_store: Vector store whose ``as_retriever`` supplies the
            retriever for the chain.
        llm: Language model passed to ``RetrievalQA.from_chain_type``.
        k: Value passed as ``search_kwargs={"k": k}`` to ``as_retriever``.
            Defaults to 5, the previously hard-coded value.

    Returns:
        The same ``df_questions`` object, with ``'Answer'`` filled from each
        response's ``'result'`` entry.
    """
    df_questions['Answer'] = None
    handler = StdOutCallbackHandler()

    # Loop-invariant: build the retriever and chain once, not once per row.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vector_store.as_retriever(search_kwargs={"k": k}),
        callbacks=[handler],
        return_source_documents=True
    )

    for index, row in df_questions.iterrows():
        response = qa_chain({"query": row['Question']})
        df_questions.at[index, 'Answer'] = response['result']

    return df_questions

# Example of asking questions (replace with your own dataframe)
# df_questions = pd.read_csv("path_to_your_questions.csv")
# df_questions_with_answers = ask_questions_from_df(df_questions, vector_store, llm)
