In [None]:
!pip install -q streamlit PyPDF2 langchain sentence_transformers faiss-cpu accelerate bitsandbytes gdown

In [None]:
# Download the shared Google Drive folder into /content/.
# NOTE(review): presumably this provides the /content/paper_target/ files referenced by the configs below — confirm.
!gdown --folder '1VTPGDVWpjgIt96K1gfe5MKUV-8ekW9cx' --output /content/

For further information, see the links below:
1. https://colab.research.google.com/drive/12ADneM3JbJVKYTVWUqjGuYqlYs8mCnZ2?usp=sharing
2. https://docs.google.com/presentation/d/1sefa-ettZEsD6N6kOnp4AZx3ErVOrD4tqajCyuUPdTU/edit?usp=sharing
3. https://www.overleaf.com/read/bpbcxnszhjbk

# Scenario 1
This scenario consists of the following steps:
1. Read the PDF file using PdfReader.
2. Set up LangChain LLMs based on OpenAI and HuggingFaceHub.
3. Split the text into chunks using a text splitter.
4. Run the chain on the documents and each query with both models (OpenAI and HuggingFaceHub).


*If you want to use an OpenAI key, see [this link](https://platform.openai.com/account/api-keys) and don't forget to create a new OpenAI token.

In [None]:
%%writefile config_scenario1.json
{
  "pdf_param": "/content/paper_target/CLOUDITY_Cloud_Supply_Chain_Framework_Design_based_on_JUGO_and_Blockchain.pdf",
  "save_param": "/content/paper_target/data.json",
  "huggingface_token_param": "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",
  "openapi_token_param": "your open api key",
  "chunk_size_param": 1000,
  "chunk_overlap_param": 200,
  "huggingface_active": false,
  "openai_active": true,
  "queries": [
    "who are the authors of the article?",
    "What is the title from this article?",
    "What are Theoretical/ Conceptual Framework from this article?",
    "What are Research Question(s)/ Hypotheses from this article?",
    "How is the methodology works from this article?",
    "What is Analysis & Results study from this article?",
    "What is conclusion from this research?",
    "What is Implications for Future research from this research?",
    "What is Implication for practice from this research?",
    "How many relevancy to the research topic from that article From 1 to 10 ?",
    "What is inclusion in that article?",
    "What is justification (if applicable) from that article?"
  ]
}

Writing config_scenario1.json


In [None]:
%%writefile /content/scenario1.py
import argparse
import json
import os

from PyPDF2 import PdfReader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI, HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

def load_config(config_path):
    """Parse the JSON file at ``config_path`` and return the decoded object.

    Args:
        config_path: path of the JSON configuration file.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict for the configs in this notebook).
    """
    with open(config_path, 'r') as config_file:
        return json.load(config_file)

def main():
    """Answer the configured questions about one PDF and save the answers as JSON.

    Command line:
        --config_path: path of the JSON config file (default ``config_scenario1.json``).

    Required config keys: ``pdf_param``, ``save_param``, ``huggingface_token_param``,
    ``openapi_token_param``, ``chunk_size_param``, ``chunk_overlap_param``,
    ``huggingface_active``, ``openai_active`` and ``queries``.
    Optional config keys: ``repo_id_param`` (HuggingFaceHub model, default
    ``MBZUAI/LaMini-Neo-1.3B``) and ``chain_type_param`` (default ``stuff``).

    When both backends are enabled, OpenAI is used.

    Raises:
        ValueError: if neither ``huggingface_active`` nor ``openai_active`` is enabled.
    """
    parser = argparse.ArgumentParser(description='Final Project: Scenario1')
    parser.add_argument('--config_path', type=str, default='config_scenario1.json', help='path to the JSON config file')
    args = parser.parse_args()

    config = load_config(args.config_path)

    pdf_param = config['pdf_param']
    huggingface_token_param = config['huggingface_token_param']
    openapi_token_param = config['openapi_token_param']
    save_param = config['save_param']
    chunk_size_param = config['chunk_size_param']
    chunk_overlap_param = config['chunk_overlap_param']
    # These two switches were used below but never read from the config (NameError).
    huggingface_active = config['huggingface_active']
    openai_active = config['openai_active']
    queries = config['queries']
    # Not present in config_scenario1.json, so fall back to defaults.
    repo_id_param = config.get('repo_id_param', 'MBZUAI/LaMini-Neo-1.3B')
    chain_type_param = config.get('chain_type_param', 'stuff')

    # Fail before any expensive work if no backend is selected.
    if not (huggingface_active or openai_active):
        raise ValueError("Enable 'huggingface_active' or 'openai_active' in the config.")

    # Read the PDF and concatenate the text of every page that yields any.
    doc_reader = PdfReader(pdf_param)
    raw_text = ''
    for page in doc_reader.pages:
        text = page.extract_text()
        if text:
            raw_text += text

    print(raw_text[:100])

    # Split the text into overlapping chunks for indexing.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size_param,
        chunk_overlap=chunk_overlap_param,  # striding over the text
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)

    # Pick the embedding model and the question-answering chain for the chosen backend.
    # OpenAI takes precedence when both switches are on.
    if openai_active:
        os.environ["OPENAI_API_KEY"] = openapi_token_param
        embeddings = OpenAIEmbeddings()
        llm = OpenAI()
    else:
        os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_token_param
        embeddings = HuggingFaceEmbeddings()
        llm = HuggingFaceHub(repo_id=repo_id_param)
    chain = load_qa_chain(llm=llm, chain_type=chain_type_param)

    # The embedding objects have no from_texts()/similarity_search(); those live on the
    # vector store, so the chunks are indexed with FAISS.
    docsearch = FAISS.from_texts(texts, embeddings)

    responses = []
    for query in queries:
        docs = docsearch.similarity_search(query)
        response = chain.run(input_documents=docs, question=query)
        responses.append(response)
        print(response)

    # Map the answers to their fields in query order. The original indexed `response`
    # (the last answer string), which stored single characters instead of answers.
    # zip() also tolerates a query list shorter than the field list.
    field_names = [
        "author",
        "title",
        "Theoretical/ Conceptual Framework",
        "Research Question(s)/ Hypotheses",
        "methodology",
        "Analysis & Results study",
        "conclusion",
        "Implications for Future research",
        "Implication for practice",
    ]
    data = dict(zip(field_names, responses))

    # Save the JSON object to a file
    with open(save_param, 'w') as f:
        json.dump(data, f)

if __name__ == "__main__":
   main()

Writing /content/scenario1.py


# Scenario 2
This scenario consists of the following steps:
1. Read the PDF files using PdfReader.
2. Set up LangChain LLMs based on OpenAI and HuggingFaceHub.
3. Split the text into chunks using a text splitter.
4. Build a vector store (optional):
    - ElasticVectorSearch
    - Pinecone
    - Weaviate
    - FAISS
5. Use load_qa_chain to answer each query with both models (OpenAI and HuggingFaceHub).

*If you want to use an OpenAI key, see [this link](https://platform.openai.com/account/api-keys) and don't forget to create a new OpenAI token.

In [None]:
%%writefile config_scenario2.json
{
  "data_dir_param": "/content/paper_target/",
  "save_param": "/content/paper_target/data.json",
  "repo_id_param": "MBZUAI/LaMini-Neo-1.3B",
  "huggingface_token_param": "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",
  "openapi_token_param": "your open api key",
  "openapi_model_param": "text-davinci-003",
  "chain_type_param": "stuff",
  "device_param": "cuda",
  "temperature_param": 0.7,
  "max_length_param": 500,
  "pad_token_id_param": 50256,
  "top_p_param": 0.95,
  "repetation_penalty_param": 1.15,
  "chunk_size_param": 1000,
  "chunk_overlap_param": 200,
  "huggingface_active": true,
  "openapi_active": false,
  "queries": [
    "who are the authors of the article?",
    "What is the title from this article?",
    "What are Theoretical/ Conceptual Framework from this article?",
    "What are Research Question(s)/ Hypotheses from this article?",
    "How is the methodology works from this article?",
    "What is Analysis & Results study from this article?",
    "What is conclusion from this research?",
    "What is Implications for Future research from this research?",
    "What is Implication for practice from this research?",
    "How many relevancy to the research topic from that article From 1 to 10 ?",
    "What is inclusion in that article?",
    "What is justification (if applicable) from that article?"
  ]
}

Writing config_scenario2.json


In [None]:
%%writefile /content/scenario2.py
import os, json, argparse, torch
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import  FAISS #, ElasticVectorSearch, Pinecone, Weaviate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI, HuggingFaceHub
from langchain.callbacks import get_openai_callback
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def load_config(config_path):
    """Return the decoded contents of the JSON config file at ``config_path``."""
    with open(config_path, 'r') as handle:
        parsed = json.load(handle)
        return parsed

def main():
    """Answer the configured questions for every PDF in a directory and save all answers.

    Command line:
        --config_path: path of the JSON config file (default ``config_scenario2.json``).

    Each ``*.pdf`` file in ``data_dir_param`` is read, split into chunks, indexed with
    FAISS and queried with every entry of ``queries`` through a ``load_qa_chain`` chain.
    When both backends are enabled, OpenAI is used.

    Output: ``save_param`` receives one JSON object keyed by PDF file name; each value
    maps the answer fields (author, title, ...) to the answers for that paper. The
    original rewrote the file on every iteration, so only the last paper survived.

    Raises:
        ValueError: if neither ``huggingface_active`` nor ``openapi_active`` is enabled.
        FileNotFoundError: if ``data_dir_param`` contains no PDF file.
    """
    parser = argparse.ArgumentParser(description='Final Project: scenario2')
    parser.add_argument('--config_path', type=str, default='config_scenario2.json', help='path to the config.json file')
    args = parser.parse_args()

    config = load_config(args.config_path)

    # Extract parameters from the config dictionary
    data_dir_param = config['data_dir_param']
    save_param = config['save_param']
    repo_id_param = config['repo_id_param']
    huggingface_token_param = config['huggingface_token_param']
    openapi_token_param = config['openapi_token_param']
    openapi_model_param = config['openapi_model_param']
    chain_type_param = config['chain_type_param']
    device_param = config['device_param']
    temperature_param = config['temperature_param']
    max_length_param = config['max_length_param']
    pad_token_id_param = config['pad_token_id_param']
    top_p_param = config['top_p_param']
    repetation_penalty_param = config['repetation_penalty_param']
    chunk_size_param = config['chunk_size_param']
    chunk_overlap_param = config['chunk_overlap_param']
    huggingface_active = config['huggingface_active']
    openapi_active = config['openapi_active']
    queries = config['queries']

    if not (huggingface_active or openapi_active):
        raise ValueError("Enable 'huggingface_active' or 'openapi_active' in the config.")

    # Only PDFs are processed: the directory also receives non-PDF files (for example the
    # JSON written to save_param), which PdfReader cannot open.
    pdf_names = sorted(name for name in os.listdir(data_dir_param) if name.lower().endswith('.pdf'))
    if not pdf_names:
        raise FileNotFoundError("No PDF files found in {}".format(data_dir_param))

    # The embedding model and the QA chain do not depend on the file, so build them once.
    # OpenAI takes precedence when both switches are on.
    if openapi_active:
        os.environ["OPENAI_API_KEY"] = openapi_token_param
        embeddings = OpenAIEmbeddings()
        chain = load_qa_chain(llm=OpenAI(model=openapi_model_param), chain_type=chain_type_param)
    else:
        os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_token_param
        embeddings = HuggingFaceEmbeddings()
        llm_hf = HuggingFaceHub(
            repo_id=repo_id_param,
            model_kwargs={'temperature': temperature_param,
                          'max_length': max_length_param,
                          'pad_token_id': pad_token_id_param,
                          'top_p': top_p_param,
                          'device': device_param,
                          'repetition_penalty': repetation_penalty_param}
        )
        chain = load_qa_chain(llm=llm_hf, chain_type=chain_type_param)

    # Chunking keeps each retrieved passage small enough for the model's token limit.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size_param,
        chunk_overlap=chunk_overlap_param,
        length_function=len,
    )

    field_names = [
        "author",
        "title",
        "Theoretical/ Conceptual Framework",
        "Research Question(s)/ Hypotheses",
        "methodology",
        "Analysis & Results study",
        "conclusion",
        "Implications for Future research",
        "Implication for practice",
    ]

    results = {}
    for file_name in pdf_names:
        reader = PdfReader(os.path.join(data_dir_param, file_name))

        # Concatenate the text of every page that yields any.
        raw_text = ''
        for page in reader.pages:
            text = page.extract_text()
            if text:
                raw_text += text

        texts = text_splitter.split_text(raw_text)
        print(len(texts))
        if not texts:
            # Nothing to index (e.g. a PDF without extractable text).
            print("No extractable text in {}; skipping.".format(file_name))
            continue

        docsearch = FAISS.from_texts(texts, embeddings)

        responses = []
        for query in queries:
            docs = docsearch.similarity_search(query)
            response = chain.run(input_documents=docs, question=query)
            responses.append(response)
            print(response)

        # Pair answers with fields in query order. The original indexed `response`
        # (the last answer string), which stored single characters instead of answers.
        results[file_name] = dict(zip(field_names, responses))

    # Save the answers of all papers in one JSON object.
    with open(save_param, 'w') as f:
        json.dump(results, f)

if __name__ == "__main__":
   main()


Writing /content/scenario2.py


# Approach
This approach consists of the following steps:
1. Load multiple files from directory
2. Divide and conquer using text splitter
3. Get embedding based on our documents
    - Huggingface Instructor Embeddings
    - Open AI's Embeddings
4. Testing Both Models

*If you want to use an OpenAI key, see [this link](https://platform.openai.com/account/api-keys) and don't forget to create a new OpenAI token.


In [None]:
%%writefile approach.json
{
  "pdf_param": "/content/paper_target/paper1.pdf",
  "folder_param": "/content/paper_target",
  "db_param": "/content/paper_target/db",
  "save_param": "/content/paper_target/data.json",

  "checkpoint_loader_param": "MBZUAI/LaMini-Neo-1.3B",
  "model_name_param": "hkunlp/instructor-xl",
  "chain_type_param": "stuff",
  "device_param": "cuda",
  "huggingface_token_param": "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",
  "openapi_token_param": "your open api key",
  "method_param": "chroma_instructor",
  "max_length_param": 512,
  "pad_token_id_param": 50256,
  "temperature_param": 0.7,
  "top_p_param": 0.95,
  "repetation_penalty_param": 1.15,
  "k_param": 3,
  "chunk_size_param": 1000,
  "chunk_overlap_param": 200,
  "load_one_file_active": true,
  "load_one_folder_active": false,
  "local_llm_active": false,
  "openapi_llm_active": true,

  "queries": [
    "who are the authors of the article?",
    "What is the title from this article?",
    "What are Theoretical/ Conceptual Framework from this article?",
    "What are Research Question(s)/ Hypotheses from this article?",
    "How is the methodology works from this article?",
    "What is Analysis & Results study from this article?",
    "What is conclusion from this research?",
    "What is Implications for Future research from this research?",
    "What is Implication for practice from this research?",
    "How many relevancy to the research topic from that article From 1 to 10 ?",
    "What is inclusion in that article?",
    "What is justification (if applicable) from that article?"
  ]
}

Writing approach.json


In [None]:
%%writefile approach.py
import torch, transformers, os, json, textwrap, pickle, faiss, textwrap
from InstructorEmbedding import INSTRUCTOR
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

def load_config(config_path):
    """Open ``config_path`` for reading and return its JSON content decoded by ``json.load``."""
    with open(config_path, 'r') as json_file:
        settings = json.load(json_file)
    return settings

def store_embeddings(docs, embeddings, store_name, path):
    """Build a FAISS vector store from ``docs`` and pickle it to ``{path}/faiss_{store_name}.pkl``.

    Args:
        docs: documents passed to ``FAISS.from_documents``.
        embeddings: embedding model passed to ``FAISS.from_documents``.
        store_name: name embedded in the pickle file name.
        path: directory that receives the pickle file.
    """
    vector_store = FAISS.from_documents(docs, embeddings)
    pickle_path = f"{path}/faiss_{store_name}.pkl"
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(vector_store, pickle_file)

def load_embeddings(store_name, path):
    """Unpickle and return the object stored at ``{path}/faiss_{store_name}.pkl``.

    Note: unpickling can execute code contained in the file, so only load files
    written by a trusted process (such as ``store_embeddings``).
    """
    pickle_path = f"{path}/faiss_{store_name}.pkl"
    with open(pickle_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

def get_prompt(human_prompt):
    """Wrap ``human_prompt`` in the ``### Human: ... ### Assistant:`` chat template and return it."""
    return f"### Human: {human_prompt} \n### Assistant:"

def remove_human_text(text):
    """Return the part of ``text`` before the first ``### Human:`` marker (all of it if absent)."""
    before_marker, _marker, _after = text.partition('### Human:')
    return before_marker

def parse_text(data):
    """Print the assistant reply found in each generated text, wrapped at 100 columns.

    Args:
        data: iterable of mappings that each have a ``'generated_text'`` string.

    For every item, the text after the first ``### Assistant:`` marker is stripped,
    cut at any following ``### Human:`` marker and printed. Items without the
    assistant marker are skipped.
    """
    marker = '### Assistant:'
    for item in data:
        generated = item['generated_text']
        marker_at = generated.find(marker)
        if marker_at == -1:
            continue
        reply = remove_human_text(generated[marker_at + len(marker):].strip())
        print(textwrap.fill(reply, width=100))

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Args:
        text: string to wrap; it is split on newline characters.
        width: maximum line width passed to ``textwrap.fill`` (default 110).

    Returns:
        str: every input line wrapped on its own and re-joined with newlines.
    """
    # textwrap.fill('') returns '', so blank lines in the input stay blank.
    return '\n'.join(textwrap.fill(line, width=width) for line in text.split('\n'))


def process_llm_response(llm_response):
    """Print the answer and its sources, then return the answer text.

    Both helpers used to be defined twice in this file; only one copy is kept, with
    the output format of the later definition (the one that was in effect).

    Args:
        llm_response: mapping with a ``'result'`` string and a ``'source_documents'``
            iterable whose items expose ``metadata['source']``.

    Returns:
        str: ``llm_response['result']``. The function previously returned ``None``
        although its caller stores the return value as the answer.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))
    print('\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])
    return answer


def main():
    """Answer the configured questions with a retrieval QA chain and save the answers.

    Command line:
        --config_path: path of the JSON config file (default ``approach.json``).

    Behaviour controlled by the config:
        * ``load_one_folder_active`` / ``load_one_file_active``: load every PDF in
          ``folder_param`` or only ``pdf_param`` (the folder wins if both are set).
        * ``openapi_llm_active`` / ``local_llm_active``: answer with the OpenAI LLM or with
          the local Hugging Face model ``checkpoint_loader_param`` (OpenAI wins if both are set).
        * ``method_param``: ``faiss_instructor`` (Instructor embeddings in a pickled FAISS
          store), ``chroma_instructor`` (Instructor embeddings in Chroma, persisted to
          ``db_param``) or ``openai`` (OpenAI embeddings in a pickled FAISS store).

    Output: ``save_param`` receives a JSON object mapping the answer fields to the answers.

    Raises:
        ValueError: if no loader or no LLM is enabled, or ``method_param`` is unknown.
    """
    # Imported here because approach.py's top-level imports provide neither name.
    import argparse
    from langchain.llms import OpenAI

    parser = argparse.ArgumentParser(description='Final Project: Approach')
    # The default used to point at config_scenario2.json, which lacks this script's keys.
    parser.add_argument('--config_path', type=str, default='approach.json', help='path to the config.json file')
    args = parser.parse_args()

    config = load_config(args.config_path)

    # Extract parameters from the config dictionary
    pdf_param = config['pdf_param']
    folder_param = config['folder_param']
    db_param = config['db_param']
    save_param = config['save_param']
    checkpoint_loader_param = config['checkpoint_loader_param']
    model_name_param = config['model_name_param']
    chain_type_param = config['chain_type_param']
    device_param = config['device_param']
    huggingface_token_param = config['huggingface_token_param']
    openapi_token_param = config['openapi_token_param']
    method_param = config['method_param']
    max_length_param = config['max_length_param']
    pad_token_id_param = config['pad_token_id_param']
    temperature_param = config['temperature_param']
    top_p_param = config['top_p_param']
    repetation_penalty_param = config['repetation_penalty_param']
    k_param = config['k_param']
    chunk_size_param = config['chunk_size_param']
    chunk_overlap_param = config['chunk_overlap_param']
    # The switches below were used but never read from the config (NameError).
    load_one_file_active = config['load_one_file_active']
    load_one_folder_active = config['load_one_folder_active']
    local_llm_active = config['local_llm_active']
    openapi_llm_active = config['openapi_llm_active']
    queries = config['queries']

    # Validate the switches before any model is downloaded or document is read.
    known_methods = ('faiss_instructor', 'chroma_instructor', 'openai')
    if method_param not in known_methods:
        raise ValueError("method_param must be one of {}, got {!r}".format(known_methods, method_param))
    if not (load_one_file_active or load_one_folder_active):
        raise ValueError("Enable 'load_one_file_active' or 'load_one_folder_active' in the config.")
    if not (local_llm_active or openapi_llm_active):
        raise ValueError("Enable 'local_llm_active' or 'openapi_llm_active' in the config.")

    os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_token_param
    os.environ["OPENAI_API_KEY"] = openapi_token_param

    # Load the documents. pdf_param is a PDF, so it needs PyPDFLoader rather than TextLoader.
    if load_one_folder_active:
        loader = DirectoryLoader(folder_param, glob="./*.pdf", loader_cls=PyPDFLoader)
    else:
        loader = PyPDFLoader(pdf_param)

    documents = loader.load()
    print("length of Documents: {}".format(len(documents)))

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size_param, chunk_overlap=chunk_overlap_param)
    texts = text_splitter.split_documents(documents)
    print("length texts: {}".format(len(texts)))

    # Select the LLM that answers the questions.
    if openapi_llm_active:
        llm = OpenAI(temperature=temperature_param)
    else:
        # `checkpoint` was undefined; the tokenizer belongs to the same checkpoint as the model.
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_loader_param)
        base_model = AutoModelForCausalLM.from_pretrained(checkpoint_loader_param,
                                                          device_map='auto',
                                                          torch_dtype=torch.float16,
                                                          load_in_8bit=True)
        pipe = pipeline('text-generation',
                        model=base_model,
                        tokenizer=tokenizer,
                        max_length=max_length_param,
                        do_sample=True,
                        pad_token_id=pad_token_id_param,
                        temperature=temperature_param,
                        top_p=top_p_param,
                        repetition_penalty=repetation_penalty_param)
        llm = HuggingFacePipeline(pipeline=pipe)

    # Smoke test with the first question. `queries[0][1]` sent only its second character.
    if queries:
        print(llm(queries[0]))

    # Build the vector store for the selected method.
    if method_param == 'openai':
        embeddings = OpenAIEmbeddings()
    else:
        embeddings = HuggingFaceInstructEmbeddings(model_name=model_name_param,
                                                   model_kwargs={"device": device_param})

    if method_param == 'chroma_instructor':
        # Supplying a persist_directory stores the embeddings on disk.
        vector_store = Chroma.from_documents(documents=texts,
                                             embedding=embeddings,
                                             persist_directory=db_param)
    else:
        # FAISS stores are pickled into folder_param and read back from there.
        store_name = 'instructEmbeddings' if method_param == 'faiss_instructor' else 'openAIEmbeddings'
        store_embeddings(texts, embeddings, store_name=store_name, path=folder_param)
        vector_store = load_embeddings(store_name=store_name, path=folder_param)

    retriever = vector_store.as_retriever(search_kwargs={"k": k_param})

    # One chain serves every method. The original always built it with OpenAI, ignoring the
    # local model, and later chose between chains through undefined names.
    qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                           chain_type=chain_type_param,
                                           retriever=retriever,
                                           return_source_documents=True)

    responses = []
    for query in queries:
        print('-------------------Instructor Embeddings------------------\n')
        llm_response = qa_chain(query)
        process_llm_response(llm_response)
        # Take the answer from the chain output rather than from the display helper.
        responses.append(llm_response['result'])

    # Pair answers with fields in query order. The original indexed `response`
    # (a single value), not the list of answers; zip() also tolerates fewer queries than fields.
    field_names = [
        "author",
        "title",
        "Theoretical/ Conceptual Framework",
        "Research Question(s)/ Hypotheses",
        "methodology",
        "Analysis & Results study",
        "conclusion",
        "Implications for Future research",
        "Implication for practice",
    ]
    data = dict(zip(field_names, responses))

    # Save the JSON object to a file
    with open(save_param, 'w') as f:
        json.dump(data, f)



if __name__ == "__main__":
   main()


Writing approach.py


# Evaluation Table

In [None]:
# Install the `similarity` package imported by the evaluation cells below.
!pip install similarity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting similarity
  Downloading similarity-0.0.1-py3-none-any.whl (8.3 kB)
Collecting jellyfish (from similarity)
  Downloading jellyfish-0.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting interaction (from similarity)
  Downloading interaction-1.3-py3-none-any.whl (7.2 kB)
Installing collected packages: jellyfish, interaction, similarity
Successfully installed interaction-1.3 jellyfish-0.11.2 similarity-0.0.1


In [None]:
# %%writefile evaluation_onefeature.py
#@title Evaluation Expert Data One Feature
# Path of the Excel workbook; its 'response' and 'system' sheets are read below.
evaluation_data = '/content/paper_target/SLR Assessment.xlsx' #@param {type:"raw"}
# variable_data = "Implication for Future Research" #@param ["Title", "Abstract", "Introduction", "Methods", "Results", "Discussion/Conclusion", "Theoritical/ Conceptual Framework", "Implication for Future Research", "Implication for Practice from this Research"]
# variable_data and mod_data are interpolated into the column names looked up below.
variable_data = "Abstract" #@param ["Title", "Abstract", "Introduction", "Methods", "Results", "Discussion/Conclusion", "Theoritical/ Conceptual Framework", "Implication for Future Research", "Implication for Practice from this Research"]
mod_data = "Justification" #@param ["Justification", "Relevancy", "Inclusion"]
# Cut-off read by highlight_below_standard.
custom_threshold = 0.55 #@param {type:"number"}
import similarity, pandas as pd

# Define a custom style function
def highlight_below_standard(value, threshold=None, yellow_margin=0.0):
    """Return the CSS background colour for a similarity score.

    Args:
        value: score to classify.
        threshold: scores below it are red. Defaults to the notebook-level
            ``custom_threshold``.
        yellow_margin: width of the yellow band directly above ``threshold``.
            The default 0.0 gives an empty band, which matches the original
            behaviour (its yellow condition could never be true).

    Returns:
        str: ``'background-color: red'`` below the threshold, ``'background-color: yellow'``
        inside the band, ``'background-color: green'`` otherwise.
    """
    if threshold is None:
        threshold = custom_threshold
    if value < threshold:
        return 'background-color: red'
    if value < threshold + yellow_margin:
        return 'background-color: yellow'
    return 'background-color: green'

def first_answer_similarity(system_answers, expert_answers):
    """Score the first system answer against every expert answer.

    Args:
        system_answers: pandas Series of system answers; only its first entry is used.
        expert_answers: pandas Series (or other iterable) of expert answers.

    Returns:
        list: one ``similarity.get_string_similarity`` score per expert answer, in the
        order of ``expert_answers``.

    Raises:
        IndexError: if ``system_answers`` is empty.
    """
    # The original computed the full system x expert matrix and kept only row 0;
    # scoring just the first system answer yields the same list without the discarded work.
    reference_answer = system_answers.iloc[0]
    return [similarity.get_string_similarity(reference_answer, expert_answer)
            for expert_answer in expert_answers]


# Read the two sheets of the workbook: expert responses and system output.
df_response = pd.read_excel(evaluation_data, sheet_name='response')
df_system = pd.read_excel(evaluation_data, sheet_name='system')

# For each paper, data1_* is the system column and data2_* the expert column of the
# selected section (variable_data) and aspect (mod_data).
data1_paper1 = df_system['Study {} {} of Paper 1 Software Requirement Engineering'.format(variable_data, mod_data)]
data2_paper1 = df_response['Study {} {} of Paper 1 Software Requirement Engineering'.format(variable_data, mod_data)]
data1_paper2 = df_system['Study {} {} of Paper 2 Software Quality Assurance'.format(variable_data, mod_data)]
data2_paper2 = df_response['Study {} {} of Paper 2 Software Quality Assurance'.format(variable_data, mod_data)]
data1_paper3 = df_system['Study {} {} of Paper 3 Network-based Computing'.format(variable_data, mod_data)]
data2_paper3 = df_response['Study {} {} of Paper 3 Network-based Computing'.format(variable_data, mod_data)]
data1_paper4 = df_system['Study {} {} of Paper 4 Visual Image Computation'.format(variable_data, mod_data)]
data2_paper4 = df_response['Study {} {} of Paper 4 Visual Image Computation'.format(variable_data, mod_data)]
data1_paper5 = df_system['Study {} {} of Paper 5 Visual Image Computation'.format(variable_data, mod_data)]
data2_paper5 = df_response['Study {} {} of Paper 5 Visual Image Computation'.format(variable_data, mod_data)]

# One list of scores per paper: system answer vs. each expert answer.
data_paper1 = first_answer_similarity(data1_paper1, data2_paper1)
data_paper2 = first_answer_similarity(data1_paper2, data2_paper2)
data_paper3 = first_answer_similarity(data1_paper3, data2_paper3)
data_paper4 = first_answer_similarity(data1_paper4, data2_paper4)
data_paper5 = first_answer_similarity(data1_paper5, data2_paper5)

print("\nData System")
print(data1_paper1)
print("\nData Respondend")
print(data2_paper1)
print("\nMatrix Evaluation")
print(data_paper1)

# Rows are papers ("article N"), columns are expert answers ("author N").
df_sim = pd.DataFrame([data_paper1, data_paper2, data_paper3, data_paper4, data_paper5])
print("Evaluation Study {} {} w/ one feature".format(variable_data, mod_data))
# df_sim.columns = [num for id, num in enumerate(df_response.Name)]
df_sim.columns = ["author {}".format(position) for position in range(1, len(df_sim.columns) + 1)]
df_sim.index = ["article {}".format(position) for position in range(1, len(df_sim.index) + 1)]
df_sim
# df_sim.T.describe()
# df_sim.T.max()

# Apply the style function to the DataFrame
# styled_df = df_sim.style.applymap(highlight_below_standard, subset=['Score'])
# styled_df = df_sim.style.applymap(highlight_below_standard)
# print(styled_df)



Data System
0    This section provides an overview of the prima...
Name: Study Abstract Justification of Paper 1 Software Requirement Engineering, dtype: object

Data Respondend
0         abstract has fulfilled scientific principles
1    clearly and concisely contains an abstract com...
2                                              correct
3           Study Abstract is very good and applicable
4    It is concise and shows high clarity and accur...
5                                          applicable 
6    After read the abstract I know some important ...
7                                                   no
8    It is better to provide a clear drawback and i...
Name: Study Abstract Justification of Paper 1 Software Requirement Engineering, dtype: object

Matrix Evaluation
[0.5367519078859285, 0.5319379932781995, 0.5272459499263623, 0.5162561713982382, 0.5385052805155898, 0.5205516133351185, 0.5654147434105533, 0.5009818360333824, 0.5368550835555532]
Evaluation Study Abstract Justif

Unnamed: 0,author 1,author 2,author 3,author 4,author 5,author 6,author 7,author 8,author 9
article 1,0.536752,0.531938,0.527246,0.516256,0.538505,0.520552,0.565415,0.500982,0.536855
article 2,0.556195,0.500729,0.523175,0.546508,0.545967,0.540627,0.57222,0.557744,0.543831
article 3,0.5237,0.667325,0.551566,0.547528,0.543588,0.539921,0.561395,0.557532,0.588116
article 4,0.535647,0.0,0.521758,0.519017,0.500099,0.536933,0.564167,0.556635,0.556451
article 5,0.515576,0.667406,0.523282,0.522024,0.472,0.507391,0.534779,0.668145,0.529758


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# /content/gdrive/MyDrive/2ndSemester/wimu_assignment/final_project/paper_target/SLR Assessment.xlsx

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# %%writefile evaluation_moreFeatures.py
#@title Evaluation Expert Data More Features
evaluation_data = '/content/paper_target/SLR Assessment.xlsx' #@param {type:"string"}
variable_data = "Abstract" #@param ["Title", "Abstract", "Introduction", "Methods", "Results", "Discussion/Conclusion", "Theoritical/ Conceptual Framework", "Implication for Future Research", "Implication for Practice from this Research"]
# condition_data = "default" #@param ["default", "maximum", "threshold"]
custom_threshold = 0.65 #@param {type:"number"}
import similarity, pandas as pd, numpy as np

# Define a custom style function
def highlight_below_standard(value):
    """Return the CSS background rule for a single similarity score.

    The commented-out code at the end of this cell passes this function to
    ``DataFrame.style.applymap``, which calls it once per cell.

    Args:
        value: The score to classify; compared against the notebook-level
            ``custom_threshold``.

    Returns:
        ``'background-color: red'`` when ``value`` is below
        ``custom_threshold``, otherwise ``'background-color: green'``.
    """
    # The previous version also had a 'background-color: yellow' branch guarded
    # by `value >= custom_threshold and value < custom_threshold`. That
    # condition can never be true, so the branch was dead code and is removed.
    # NOTE(review): if a yellow "borderline" band is wanted, it needs its own
    # upper bound; none is defined in this notebook.
    if value < custom_threshold:
        return 'background-color: red'
    return 'background-color: green'

# Load the two sheets that are compared: the experts' answers and the system's answers.
df_response = pd.read_excel(evaluation_data, sheet_name='response')
df_system = pd.read_excel(evaluation_data, sheet_name='system')

# The five assessed papers, spelled exactly as in the spreadsheet column names.
PAPER_LABELS = [
    'Paper 1 Software Requirement Engineering',
    'Paper 2 Software Quality Assurance',
    'Paper 3 Network-based Computing',
    'Paper 4 Visual Image Computation',
    'Paper 5 Visual Image Computation',
]
# Every paper has one column per aspect for the selected section.
ASSESSMENT_ASPECTS = ['Justification', 'Relevancy', 'Inclusion']

# One list of three column names per paper, e.g.
# 'Study Abstract Justification of Paper 1 Software Requirement Engineering'.
feature_names_by_paper = [
    ['Study {} {} of {}'.format(variable_data, aspect, paper) for aspect in ASSESSMENT_ASPECTS]
    for paper in PAPER_LABELS
]

# Column values as plain lists: "data1" comes from the system sheet,
# "data2" from the response sheet.
system_by_paper = [
    [df_system[name].values.tolist() for name in names]
    for names in feature_names_by_paper
]
response_by_paper = [
    [df_response[name].values.tolist() for name in names]
    for names in feature_names_by_paper
]

# For each paper, compare every system column with every response column.
# The response column is the outer loop, so the order of the scores is the
# same as in the original hand-written comprehensions.
similarity_by_paper = [
    [
        similarity.get_string_similarity(system_values, response_values)
        for response_values in response_columns
        for system_values in system_columns
    ]
    for system_columns, response_columns in zip(system_by_paper, response_by_paper)
]

# Keep the per-paper names this cell has always defined, in case other cells use them.
feature_names1, feature_names2, feature_names3, feature_names4, feature_names5 = feature_names_by_paper
data1_paper1, data1_paper2, data1_paper3, data1_paper4, data1_paper5 = system_by_paper
data2_paper1, data2_paper2, data2_paper3, data2_paper4, data2_paper5 = response_by_paper
data_paper1, data_paper2, data_paper3, data_paper4, data_paper5 = similarity_by_paper

print("\nData System")
print(data1_paper1)
print("\nData Respondend")
print(data2_paper1)
# Was "\Matrix Evaluation": "\M" is not a valid escape and printed a literal backslash.
print("\nMatrix Evaluation")
print(data_paper1)

print("Evaluation w/ more than one feature")

# One row per paper, one column per (response column, system column) pair.
# NOTE(review): each row holds len(ASSESSMENT_ASPECTS) ** 2 = 9 pairwise scores,
# so the "respondent N" labels do not identify individual respondents - confirm
# the intended labelling.
df_sim = pd.DataFrame(similarity_by_paper)
df_sim.columns = ["respondent {}".format(position) for position in range(1, df_sim.shape[1] + 1)]
df_sim.index = ["article {}".format(position) for position in range(1, df_sim.shape[0] + 1)]
df_sim

# if condition_data == "default":
#   print(df_sim)

# if condition_data == "maximum":
#   print(df_sim.T.max())

# if condition_data == "threshold":
#   #Apply the style function to the DataFrame
#   styled_df = df_sim.style.applymap(highlight_below_standard, subset=['Score'])
#   styled_df = df_sim.style.applymap(highlight_below_standard)
#   print(styled_df)


Data System
[['This section provides an overview of the primary basis of reference for the study and explains how it builds upon previous research. The authors mention that three preliminary studies form the primary basis of reference in this study to contain the stages of making research methodology to seek new dependencies among the SRS documents. They then explain how their proposed method builds upon previous research by explicitly explaining each abstraction of previous studies revealed. The continuation framework proposed in a previous study is used as a basis for identifying dependencies among requirements, and Natural Language Processing is used to extract dependency relations.'], [9], ['Yes']]

Data Respondend
[['abstract has fulfilled scientific principles', 'clearly and concisely contains an abstract component', 'correct', 'Study Abstract is very good and applicable', 'It is concise and shows high clarity and accuracy, indicating clarity of results', 'applicable ', 'After r

Unnamed: 0,respondent 1,respondent 2,respondent 3,respondent 4,respondent 5,respondent 6,respondent 7,respondent 8,respondent 9
article 1,0.684308,0.44514,0.508241,0.444869,0.579365,0.392857,0.399909,0.449735,0.650794
article 2,0.779274,0.445308,0.50908,0.416483,0.580247,0.393298,0.391533,0.449735,0.650794
article 3,0.693781,0.445402,0.576218,0.429993,0.580247,0.393298,0.436632,0.449735,0.650794
article 4,0.618989,0.445229,0.508683,0.412472,0.580247,0.393298,0.405376,0.449821,0.540195
article 5,0.630218,0.445712,0.444432,0.426374,0.456349,0.392857,0.411614,0.449909,0.540723


In [None]:
# Export the 'response' sheet to data_response.json in the current working
# directory, using pandas' orient='records' layout.
df_response.to_json('data_response.json', orient='records')
print("data_response JSON file saved successfully.")

# Same export for the 'system' sheet, written to data_system.json.
df_system.to_json('data_system.json', orient='records')
print("data_system JSON file saved successfully.")

data_response JSON file saved successfully.
data_system JSON file saved successfully.


# Another Tunnel Scenario

In [None]:
%%writefile scenario1_streamlit.py
import json
import os

import streamlit as st
from PyPDF2 import PdfReader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI, HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

def load_config(config_path):
    """Read a JSON configuration file and return its parsed content.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON files are UTF-8; do not depend on the platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    return config

# Keys of the saved record. The i-th query box feeds the i-th key.
OUTPUT_FIELDS = [
    "author",
    "title",
    "Theoretical/ Conceptual Framework",
    "Research Question(s)/ Hypotheses",
    "methodology",
    "Analysis & Results study",
    "conclusion",
    "Implications for Future research",
    "Implication for practice",
]


def _extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of a PDF (pages without text are skipped)."""
    reader = PdfReader(pdf_file)
    return ''.join(page.extract_text() or '' for page in reader.pages)


def _build_backend(huggingface_token_param, openapi_token_param):
    """Return ``(embeddings, chain)`` for the first provider that has a token, else ``None``.

    The Hugging Face token wins when both are given, as in the original branch order.
    """
    # NOTE(review): the original script never created an LLM or a QA chain, so
    # there is no model choice to preserve. Both LLMs are built with the
    # library's default model and the "stuff" chain type; pass an explicit
    # repo_id / model here to pick another one.
    if huggingface_token_param:
        os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_token_param
        return HuggingFaceEmbeddings(), load_qa_chain(llm=HuggingFaceHub(), chain_type="stuff")
    if openapi_token_param:
        os.environ["OPENAI_API_KEY"] = openapi_token_param
        return OpenAIEmbeddings(), load_qa_chain(llm=OpenAI(), chain_type="stuff")
    return None


def main():
    """Streamlit page: upload one PDF, ask up to nine questions, save the answers.

    The answers are shown on the page and written to ``output.json`` as a
    mapping from the names in ``OUTPUT_FIELDS`` to the answer of the query box
    at the same position.
    """
    st.title('Final Project: Scenario 1')

    # Sidebar configuration
    st.sidebar.title('Configuration')
    # The widget is kept for UI compatibility; its value is not used by this page.
    st.sidebar.text_input('Config Path', 'config_scenario1.json')
    # type='password' keeps the tokens from being displayed in clear text.
    huggingface_token_param = st.sidebar.text_input('Huggingface Token Param', '', type='password')
    openapi_token_param = st.sidebar.text_input('OpenAPI Token Param', '', type='password')

    # PDF file selection: parse once, show a short preview, reuse the text on "Run".
    st.header('PDF File Selection')
    pdf_param = st.file_uploader('Upload PDF file', type=['pdf'])
    raw_text = ''
    if pdf_param is not None:
        raw_text = _extract_pdf_text(pdf_param)
        st.text(raw_text[:100])

    # Text Splitting parameters
    st.header('Text Splitting Parameters')
    chunk_size_param = st.slider('Chunk Size', min_value=100, max_value=10000, value=1000)
    chunk_overlap_param = st.slider('Chunk Overlap', min_value=0, max_value=1000, value=200)

    # Query parameters: one input box per output field.
    st.header('Query Parameters')
    queries = [st.text_input(f'Query {i}', '') for i in range(len(OUTPUT_FIELDS))]

    # Run button
    if not st.button('Run'):
        return

    if pdf_param is None:
        st.warning('Please upload a PDF file first.')
        return
    if chunk_overlap_param > chunk_size_param:
        # The text splitter rejects an overlap larger than the chunk size.
        st.error('Chunk Overlap must not be larger than Chunk Size.')
        return

    backend = _build_backend(huggingface_token_param, openapi_token_param)
    if backend is None:
        st.error('Please enter a Huggingface or OpenAI token in the sidebar.')
        return
    embeddings, chain = backend

    # Splitting up the text into smaller chunks for indexing
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size_param,
        chunk_overlap=chunk_overlap_param,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)
    if not texts:
        st.error('No text could be extracted from the uploaded PDF.')
        return

    # Embedding objects have no from_texts(); the vector index is built by FAISS.
    docsearch = FAISS.from_texts(texts, embeddings)

    responses = []
    for query in queries:
        if not query.strip():
            # Keep the position aligned with OUTPUT_FIELDS without calling the LLM.
            responses.append('')
            continue
        docs = docsearch.similarity_search(query)
        response = chain.run(input_documents=docs, question=query)
        responses.append(response)
        st.text(response)

    # One answer per field. The previous responses[0][i] indexing stored single
    # characters of the first answer instead of the i-th answer.
    data = dict(zip(OUTPUT_FIELDS, responses))

    # Save the JSON object to a file
    with open('output.json', 'w') as f:
        json.dump(data, f)
    st.success('Processing complete! Output saved to output.json')


if __name__ == '__main__':
    main()


In [None]:
%%writefile scenario2_streamlit.py
import os, json, torch, streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI, HuggingFaceHub
from langchain.callbacks import get_openai_callback
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def load_config(config_path):
    """Parse the JSON file at ``config_path`` and return the result.

    Args:
        config_path: Location of the JSON configuration file.

    Returns:
        The object decoded by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the locale's default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    return config

# Keys of each saved record. The i-th configured query feeds the i-th key.
OUTPUT_FIELDS = [
    "author",
    "title",
    "Theoretical/ Conceptual Framework",
    "Research Question(s)/ Hypotheses",
    "methodology",
    "Analysis & Results study",
    "conclusion",
    "Implications for Future research",
    "Implication for practice",
]


def _drop_blank(settings):
    """Return ``settings`` without entries whose value is ``''`` or ``None``.

    Unset options are left out of the calls below so the callee's own defaults
    apply instead of an empty string.
    """
    return {key: value for key, value in settings.items() if value not in ('', None)}


def _read_pdf_text(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``."""
    reader = PdfReader(pdf_path)
    return ''.join(page.extract_text() or '' for page in reader.pages)


def _resolve_config(config_path, data_dir_param, huggingface_token_param, openapi_token_param):
    """Merge blank defaults, the JSON file at ``config_path`` (if present) and the UI values.

    Non-empty UI values take precedence over the file. Entering a token in the
    UI also activates that provider.
    """
    config = {
        'data_dir_param': '',
        'save_param': 'output.json',
        'repo_id_param': '',
        'huggingface_token_param': '',
        'openapi_token_param': '',
        'openapi_model_param': '',
        'chain_type_param': '',
        'device_param': '',
        'temperature_param': '',
        'max_length_param': '',
        'pad_token_id_param': '',
        'top_p_param': '',
        'repetation_penalty_param': '',
        'chunk_size_param': '',
        'chunk_overlap_param': '',
        'huggingface_active': False,
        'openapi_active': False,
        'queries': []
    }
    # NOTE(review): assumes the file uses the same key names as the dict above.
    if config_path and os.path.isfile(config_path):
        config.update(load_config(config_path))
    if data_dir_param:
        config['data_dir_param'] = data_dir_param
    if huggingface_token_param:
        config['huggingface_token_param'] = huggingface_token_param
        config['huggingface_active'] = True
    if openapi_token_param:
        config['openapi_token_param'] = openapi_token_param
        config['openapi_active'] = True
    return config


def _build_backend(config):
    """Return ``(embeddings, chain)`` for the active provider, or ``None`` if none is active.

    Hugging Face is preferred when both providers are active, as in the
    original branch order.
    """
    chain_kwargs = _drop_blank({'chain_type': config['chain_type_param']})
    if config['huggingface_active']:
        if config['huggingface_token_param']:
            os.environ['HUGGINGFACEHUB_API_TOKEN'] = config['huggingface_token_param']
        model_kwargs = _drop_blank({
            'temperature': config['temperature_param'],
            'max_length': config['max_length_param'],
            'pad_token_id': config['pad_token_id_param'],
            'top_p': config['top_p_param'],
            'device': config['device_param'],
            'repetition_penalty': config['repetation_penalty_param']
        })
        llm_hf = HuggingFaceHub(
            model_kwargs=model_kwargs,
            **_drop_blank({'repo_id': config['repo_id_param']})
        )
        return HuggingFaceEmbeddings(), load_qa_chain(llm=llm_hf, **chain_kwargs)
    if config['openapi_active']:
        if config['openapi_token_param']:
            os.environ["OPENAI_API_KEY"] = config['openapi_token_param']
        llm_openai = OpenAI(**_drop_blank({'model': config['openapi_model_param']}))
        return OpenAIEmbeddings(), load_qa_chain(llm=llm_openai, **chain_kwargs)
    return None


def main():
    """Streamlit page: answer the configured queries for every PDF in a directory.

    Settings come from the JSON file named in the sidebar, overridden by the
    values typed into the page. The result is written to the configured
    ``save_param`` path (``output.json`` by default) as a mapping from PDF file
    name to a record whose keys are ``OUTPUT_FIELDS``.
    """
    st.title('Final Project: Scenario 2')

    # Sidebar configuration
    st.sidebar.title('Configuration')
    config_path = st.sidebar.text_input('Config Path', 'config_scenario2.json')
    # type='password' keeps the tokens from being displayed in clear text.
    huggingface_token_param = st.sidebar.text_input('Huggingface Token Param', '', type='password')
    openapi_token_param = st.sidebar.text_input('OpenAPI Token Param', '', type='password')

    # Data directory selection
    st.header('Data Directory Selection')
    data_dir_param = st.text_input('Data Directory Path', '')

    # Run button
    if not st.button('Run'):
        return

    # Previously every pipeline setting was hard-coded to '' and the config
    # file was never read, so the pipeline could not run.
    config = _resolve_config(config_path, data_dir_param, huggingface_token_param, openapi_token_param)

    data_dir = config['data_dir_param']
    if not data_dir or not os.path.isdir(data_dir):
        st.error('Data directory not found: {!r}'.format(data_dir))
        return

    queries = config['queries']
    if not queries:
        st.error('No queries configured; add a "queries" list to the config file.')
        return

    # Only PDF files are handed to PdfReader; other directory entries are ignored.
    pdf_names = sorted(name for name in os.listdir(data_dir) if name.lower().endswith('.pdf'))
    if not pdf_names:
        st.error('No PDF files found in {!r}'.format(data_dir))
        return

    # The embeddings and the QA chain do not depend on the file, so build them once.
    backend = _build_backend(config)
    if backend is None:
        st.error('Please enter a Huggingface or OpenAI token, or activate a provider in the config file.')
        return
    embeddings, chain = backend

    splitter_kwargs = {
        key: int(value)
        for key, value in _drop_blank({
            'chunk_size': config['chunk_size_param'],
            'chunk_overlap': config['chunk_overlap_param'],
        }).items()
    }
    text_splitter = CharacterTextSplitter(
        separator="\n",
        length_function=len,
        **splitter_kwargs
    )

    results = {}
    for pdf_name in pdf_names:
        raw_text = _read_pdf_text(os.path.join(data_dir, pdf_name))
        texts = text_splitter.split_text(raw_text)
        if not texts:
            st.warning('Skipped {}: no extractable text.'.format(pdf_name))
            continue

        docsearch = FAISS.from_texts(texts, embeddings)
        responses = []
        for query in queries:
            docs = docsearch.similarity_search(query)
            response = chain.run(input_documents=docs, question=query)
            responses.append(response)

        # One answer per field. The previous responses[0][i] indexing stored
        # single characters of the first answer instead of the i-th answer.
        results[pdf_name] = dict(zip(OUTPUT_FIELDS, responses))

    # Written once after the loop. Writing inside the loop overwrote the file
    # for every PDF, so only the last document survived.
    save_path = config['save_param'] or 'output.json'
    with open(save_path, 'w') as f:
        json.dump(results, f)

    st.success('Processing complete! Output saved to {}'.format(save_path))


if __name__ == "__main__":
    main()

In [None]:
# !npm install localtunnel
# !pip install -q ngrok streamlit
# !python --version
# !streamlit run scenario1.py & npx localtunnel --p 8501

In [None]:
# !streamlit run main.py & npx localtunnel --p 8501