# MindMender Backend

## Install Libraries

In [1]:
# Packages for running the server: Flask, its CORS extension, and the ngrok client

!pip install flask flask_cors
!pip install pyngrok

# Packages for generating the answer (versions pinned so the notebook is reproducible)
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0



In [2]:
# for GPU use
# !CT_CUBLAS=1 pip install ctransformers --no-binary ctransformers
# choose your champion
#model_id = "TheBloke/Llama-2-7B-GGML"
#model_id = "TheBloke/Llama-2-7B-chat-GGML"
#model_id = "TheBloke/Llama-2-13B-GGML"
# model_id = "TheBloke/Llama-2-13B-chat-GGML"

In [3]:
# from ctransformers import AutoModelForCausalLM
# def ask_llama(text):

#   # check ctransformers doc for more configs
#   config = {'max_new_tokens': 256, 'repetition_penalty': 1.1,
#             'temperature': 0.1, 'stream': True}

#   llm = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         model_type="llama",
#         #lib='avx2', for cpu use
#         gpu_layers=130, #110 for 7b, 130 for 13b
#         **config
#         )

#   prompt="""You are now a therapist (you are not a patient so you have to answer from the point of view of the therapist) who's goal is to answer messages from patients with empathy. In your answers, you can also use metaphors to make the response deeper. The conversation should not be too formal. Do not start with "Hey there" or "Hello", etc. Go straight to the corpus of the message. Only write the response from the therapist point of view (yours) and nothing else. Do not write a disclaimer at the beginning of the message either. This is the message you should answer: '"""+text+"""'"""

#   tokens = llm.tokenize(prompt)

#   return llm(prompt, stream=False)

In [4]:
# ask_llama("I have been feeling depressed recently")

## Importing Libraries

In [5]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from torch import cuda, bfloat16
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
import transformers
import os
import pinecone
import time

##############################
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

## Initializing a RetrievalQA Chain
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA

## Response Generation

In [9]:
# Pinecone credentials: the API key comes from app.pinecone.io and the
# environment name from the Pinecone console. Here both are read from the
# Colab secret store; the environment-variable variant is kept for reference.
# pinecone.init(
    # api_key=os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY',
    # environment=os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENV'
# )
from google.colab import userdata

pinecone.init(
    api_key=userdata.get('PINECONE_API_KEY'),
    environment=userdata.get('PINECONE_ENVIRONMENT'),
)

# Name of the Pinecone index this notebook queries.
index_name = 'llama-2-rag'

# Index creation is disabled here (kept for reference only):
# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(
#         index_name,
#         dimension=len(embeddings[0]),
#         metric='cosine'
#     )
#     # wait for index to finish initialization
#     while not pinecone.describe_index(index_name).status['ready']:
#         time.sleep(1)

# Open a handle on the index and show its statistics as the cell output.
index = pinecone.Index(index_name)
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.00507,
 'namespaces': {'': {'vector_count': 507}},
 'total_vector_count': 507}

In [11]:
def get_llama():
    """Load Llama 2 13B chat in 4-bit and wrap it as a LangChain LLM.

    Reads the Hugging Face access token from the Colab secret store, downloads
    (or loads from cache) ``meta-llama/Llama-2-13b-chat-hf`` with a
    bitsandbytes 4-bit quantization config, builds a ``text-generation``
    pipeline around it and wraps that pipeline in ``HuggingFacePipeline``.

    Returns:
        HuggingFacePipeline: the LangChain LLM wrapper around the pipeline.
    """
    # The token is a credential: it must never be printed, because notebook
    # outputs are saved with the file and would expose it to every reader.
    HF_AUTH_TOKEN = userdata.get('HF_AUTH_TOKEN')

    model_id = 'meta-llama/Llama-2-13b-chat-hf'

    # Only used for the status message below; actual placement is decided by
    # device_map='auto' when the model is loaded.
    device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

    # set quantization configuration to load large model with less GPU memory
    # this requires the `bitsandbytes` library
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    # begin initializing HF items, need auth token for these
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        use_auth_token=HF_AUTH_TOKEN
    )

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
        use_auth_token=HF_AUTH_TOKEN
    )

    # Switch to inference mode before building the pipeline.
    model.eval()
    print(f"Model loaded on {device}")

    # The pipeline requires a tokenizer which handles the translation of human readable plaintext
    # to LLM readable token IDs. It is loaded from the same model repository as the weights.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        use_auth_token=HF_AUTH_TOKEN
    )

    # Now we're ready to initialize the HF pipeline.
    generate_text = transformers.pipeline(
        model=model, tokenizer=tokenizer,
        return_full_text=True,  # langchain expects the full text
        task='text-generation',
        # we pass model parameters here too
        temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=512,  # max number of tokens to generate in the output
        repetition_penalty=1.1  # without this output begins repeating
    )

    return HuggingFacePipeline(pipeline=generate_text)

In [12]:
def get_embed_model():
  """Create the sentence-transformers embedding model used for retrieval.

  The model runs on the current CUDA device when one is available and on the
  CPU otherwise; encoding uses a batch size of 32.

  Returns:
      HuggingFaceEmbeddings: wrapper around
      'sentence-transformers/all-MiniLM-L6-v2'.
  """
  target_device = 'cpu'
  if cuda.is_available():
    target_device = f'cuda:{cuda.current_device()}'

  return HuggingFaceEmbeddings(
      model_name='sentence-transformers/all-MiniLM-L6-v2',
      model_kwargs={'device': target_device},
      encode_kwargs={'device': target_device, 'batch_size': 32},
  )

In [13]:
# Build the embedding model once at module level; get_rag() captures it as a default argument.
embed_model=get_embed_model()

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [14]:
def get_rag(query, llm, embed_model=embed_model, index=index):
  """Build a RetrievalQA ("stuff") chain over the Pinecone index.

  Args:
      query: Unused. Kept so existing callers that pass ``query=...`` keep
          working; the question is supplied when the returned chain is called.
      llm: LangChain LLM used to generate the answer.
      embed_model: Embedding model whose ``embed_query`` vectorises questions.
          The default is the module-level ``embed_model`` as it existed when
          this function was defined.
      index: Pinecone index to search. The default is the module-level
          ``index`` as it existed when this function was defined.

  Returns:
      The chain created by ``RetrievalQA.from_chain_type``.
  """
  text_source = 'text'  # field in metadata that contains text content

  vectorstore = Pinecone(index, embed_model.embed_query, text_source)

  # The previous version also ran vectorstore.similarity_search(query, k=3)
  # here and threw the result away, costing an extra embedding pass and a
  # Pinecone round-trip on every request. Retrieval happens inside the chain.
  return RetrievalQA.from_chain_type(
      llm=llm, chain_type='stuff',
      retriever=vectorstore.as_retriever()
  )

In [17]:
# Load the quantized model once at module level; ask_llama() captures `llm` as a default argument.
llm=get_llama()    # returns HuggingFacePipeline(pipeline=generate_text)
# rag_pipeline=get_rag(query=insert_prompt, llm=llm)

[output redacted: this line contained a Hugging Face access token; revoke and rotate it]


config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [18]:
def ask_llama(text,llm=llm, index=index):
    """Answer ``text`` with the retrieval-augmented Llama pipeline.

    Args:
        text: The user's message, used as the question for the chain.
        llm: LangChain LLM to generate with. The default is the module-level
            ``llm`` as it existed when this function was defined.
        index: Pinecone index to retrieve from. The default is the
            module-level ``index`` as it existed when this function was defined.

    Returns:
        Whatever the RetrievalQA chain returns when called with ``text``
        (presumably a dict holding the query and the generated result —
        TODO confirm against the installed langchain version).
    """
    # Forward `index` explicitly: it used to be accepted but ignored, so a
    # caller-supplied index had no effect.
    rag_pipeline = get_rag(query=text, llm=llm, index=index)

    return rag_pipeline(text)

# Starting the server

In [19]:
from datetime import datetime

def getTime():
  """Print the current local time as HH:MM:SS and return the same string."""
  stamp = datetime.now().strftime("%H:%M:%S")
  print(stamp)
  return stamp
getTime()

00:13:52


'00:13:52'

In [20]:
import getpass
import os
import threading
from datetime import datetime
from flask import Flask, request,jsonify
from pyngrok import ngrok, conf
from flask_cors import CORS

# Ask for the ngrok authtoken interactively; getpass does not echo the typed
# value, so the token does not end up in the notebook source or output.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
conf.get_default().auth_token = getpass.getpass()

# Create the Flask application and enable CORS on it.
app = Flask(__name__)
CORS(app)
# Open a ngrok tunnel to the HTTP server on local port 5000
# (NOTE(review): app.run() below is started without an explicit port — confirm it listens on 5000).
public_url = ngrok.connect(5000).public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}/\"".format(public_url, 5000))

# Record the public ngrok URL in the app config under "BASE_URL"
app.config["BASE_URL"] = public_url

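# ... Update inbound traffic via APIs to use the public-facing ngrok URL
# NOTE: do not paste the ngrok authtoken into the notebook; it is entered via getpass above.
# A token was previously written here and has been removed; revoke and rotate it.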

# Define Flask routes

@app.route('/process_text', methods=['POST'])
def process_text():
    """Answer a patient message posted as JSON.

    Expects a JSON object body with a non-empty string field ``text``.

    Returns:
        200 with ``{'answer': <ask_llama result>}`` on success, or 400 with
        ``{'error': <message>}`` when the body is missing, is not a JSON
        object, or carries no usable ``text``.
    """
    # silent=True yields None instead of raising on a wrong content type or
    # malformed JSON, so bad requests get a clean 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': "Request body must be a JSON object with a 'text' field."}), 400

    text = data.get('text', '')
    if not isinstance(text, str) or not text.strip():
        # Reject empty input up front instead of spending a model call on it.
        return jsonify({'error': "The 'text' field must be a non-empty string."}), 400

    answer = ask_llama(text)
    return jsonify({'answer': answer})

@app.route("/")
def home():
    """Landing route: return the backend banner followed by the current server time."""
    return "This is the MindMender Backend." + getTime()

# Start the Flask server in a new thread so this cell returns instead of blocking
# the notebook. use_reloader=False is passed to app.run (presumably because the
# reloader is not meant to run outside the main thread — confirm).
threading.Thread(target=app.run, kwargs={"use_reloader": False}).start()

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken
··········
 * ngrok tunnel "https://7b78-34-105-88-98.ngrok-free.app" -> "http://127.0.0.1:5000/"
 * Serving Flask app '__main__'
