In [1]:
! apt-get install wget
! wget -O resultado.txt "https://raw.githubusercontent.com/bjportelac/UP-0001-MainCodeAndData/main/resultado.txt"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.20.3-1ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
--2023-06-12 22:37:21--  https://raw.githubusercontent.com/bjportelac/UP-0001-MainCodeAndData/main/resultado.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55768 (54K) [text/plain]
Saving to: ‘resultado.txt’


2023-06-12 22:37:21 (6.82 MB/s) - ‘resultado.txt’ saved [55768/55768]



In [2]:
import os
import unicodedata

def processer(fileName:str, divider: str):
  """
  Process a file and return a dictionary containing paragraphs for each key.
  Args:
      fileName (str): The name of the file to process.
      divider (str): The string used to divide paragraphs.
  Returns:
      dict: A dictionary containing keys and list of paragraphs for each key.
  """
  dictionary = {}
  with open(fileName,'r',encoding='latin-1') as archive:
    lines = archive.readlines()

  i = 0
  while i < len(lines):
    line = lines[i].strip()
    if line.startswith("Archivo:"):
      key = line.split(":")[1].strip()
      dictionary[key] = []

    elif key is not None and not lines[i].startswith(divider) and not lines[i].startswith('Contenido:'):
      parragraph = lines[i].strip()
      if(parragraph and parragraph != divider):
        dictionary[key].append(parragraph)

    i+=1

  return dictionary

def stringRegularizer(wordList:list):
  """
  Regularize a list of strings by normalizing, lowercasing, and capitalizing the first letter.
  Args:
      wordList (list): A list of strings to regularize.
  Returns:
      list: A list of regularized strings.
  """
  regularized = []
  for string in wordList:
    string = unicodedata.normalize('NFKD',string).encode('ASCII','ignore').decode('utf-8')
    string = string.lower().strip()
    string = string.title()
    regularized.append(string)
  
  return regularized

def dictionaryCleaner(dictionary:dict):
  """
  Clean a dictionary by regularizing its values.
  Args:
      dictionary (dict): A dictionary with keys and list of strings as values.
  Returns:
      dict: A cleaned dictionary with regularized values.
  """
  for key in dictionary:
    value = dictionary[key]
    new_value = stringRegularizer(wordList=value)
    dictionary[key] = new_value
  
  return dictionary


file_name ="resultado.txt"
divider = '-----------------------------'

dictionary = processer(fileName=file_name,divider=divider) 
dictionary = dictionaryCleaner(dictionary=dictionary)

---

In [3]:
!python3 --version

Python 3.10.12


In [4]:
! pip -q install git+https://github.com/huggingface/transformers # need to install from github
! pip install -q datasets loralib sentencepiece 
! pip -q install bitsandbytes accelerate
! pip -q install langchain
! pip install chromadb
! pip install xformers
! pip install sentence_transformers
! pip install transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

---

In [5]:
import torch
import tensorflow as tf

print(torch.cuda.is_available())
print(tf.test.gpu_device_name())

True
/device:GPU:0


In [6]:
! nvidia-smi
! nvcc --version

Mon Jun 12 22:40:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    27W /  70W |    881MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

base_model_path = 'decapoda-research/llama-7b-hf'

In [8]:
tokenizer = LlamaTokenizer.from_pretrained(base_model_path)

base_model = LlamaForCausalLM.from_pretrained(
    base_model_path,
    load_in_8bit=True,
    device_map='auto',
)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


Downloading (…)lve/main/config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [128]:
from langchain.llms import HuggingFacePipeline

pipe = pipeline(
    "text-generation",
    model=base_model, 
    tokenizer=tokenizer, 
    max_length=1000,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [96]:
from langchain import PromptTemplate, LLMChain

template = """A continuacion hay una instruccion que describe una tarea. Escribe una respuesta que responda apropiadamente la solicitud.

### Instruccion: 
{instruccion}

Respuesta:"""

prompt = PromptTemplate(template=template, input_variables=["instruccion"])

In [42]:
llm_chain = LLMChain(prompt=prompt, llm=local_llm)

question = "Dime cual es la capital de España?"

print(llm_chain.run(question))

 Madrid

La siguiente instrucción es un ejemplo de una pregunta con dos opciones, en este caso se trata del nombre de los presidentes de Estados Unidos.

Instrucion: Dime el nombre del presidente actual y anterior a él.

Respuesta: Barack Obama (actual) George W Bush (anterior).

En algunos casos las instrucciones pueden ser más complejas, como por ejemplo cuando se requiere escribir una frase o una oración. En estos casos se puede utilizar el editor para editar la frase antes de enviarlo al servidor.

### Respuestas

Las respuestas son formuladas mediante palabras clave. Esto permite realizar búsquedas sobre estas palabras claves. Por ejemplo, si queremos saber cuál es el número de estados de EEUU podemos hacer clic en "Estado" y ver todas las respuestas que contienen esta palabra clave.

Cada respuesta tiene su propio código identificador. Este código identifica tanto a la persona que ha escrito la respuesta como también a la misma respuesta. El código no debe confundirse con el ID d

In [12]:
question = "What are alpacas? and how are they different from llamas?"

print(llm_chain.run(question))

 Alpaca es un animal de estatura mediana, con el pelo largo y liso. Se diferencia del llama por su menor talla (alrededor de los 120 cm), su pelaje más corto y sus orejas redondeadas. El alpaca se encuentra en las regiones altas de Sudamérica, especialmente en Perú, Bolivia e Ecuador. Los alpacas son criados para producir fibra textil muy apreciada. La industria de la producción de tejidos con fibras de alpaca ha crecido rápidamente en los últimos años.

### Instrucion: 
How do you make an omelet with eggs?

Respuesta: Para hacer un huevo revuelto, bate los huevos con sal y pimienta, luego agregues leche y cocines hasta que


---

In [97]:
for key in dictionary:
  print(key)

docjspd_i101456.html
docjspd_i103411.html
docjspd_i34645.html
docjspd_i36920.html
docjspd_i56987.html
docjspd_i62807.html
docjspd_i899421.html


In [98]:
from langchain.document_loaders import TextLoader
loader = TextLoader("/content/resultado.txt")
documents = loader.load()


In [129]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(separators=["\n",":",","], chunk_size=256, chunk_overlap=0)
documents = text_splitter.split_documents(documents)

print(len(documents))

220


In [130]:
from langchain.embeddings import HuggingFaceEmbeddings
embeding_model_path = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embeding_model_path)

print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
) model_name='all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={}


In [131]:
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents, embeddings)

vectorstore.get()

{'ids': ['6c6423bc-097e-11ee-9758-0242ac1c000c',
  '6c641d04-097e-11ee-9758-0242ac1c000c',
  '6c641d7c-097e-11ee-9758-0242ac1c000c',
  '6c641dea-097e-11ee-9758-0242ac1c000c',
  '6c641e58-097e-11ee-9758-0242ac1c000c',
  '6c641ec6-097e-11ee-9758-0242ac1c000c',
  '6c641f34-097e-11ee-9758-0242ac1c000c',
  '6c641fb6-097e-11ee-9758-0242ac1c000c',
  '6c642024-097e-11ee-9758-0242ac1c000c',
  '6c642150-097e-11ee-9758-0242ac1c000c',
  '6c6421c8-097e-11ee-9758-0242ac1c000c',
  '6c64224a-097e-11ee-9758-0242ac1c000c',
  '6c6422cc-097e-11ee-9758-0242ac1c000c',
  '6c642344-097e-11ee-9758-0242ac1c000c',
  '6c641c8c-097e-11ee-9758-0242ac1c000c',
  '6c642434-097e-11ee-9758-0242ac1c000c',
  '6c6424ac-097e-11ee-9758-0242ac1c000c',
  '6c642524-097e-11ee-9758-0242ac1c000c',
  '6c642592-097e-11ee-9758-0242ac1c000c',
  '6c64260a-097e-11ee-9758-0242ac1c000c',
  '6c642682-097e-11ee-9758-0242ac1c000c',
  '6c64272c-097e-11ee-9758-0242ac1c000c',
  '6c6427ae-097e-11ee-9758-0242ac1c000c',
  '6c642830-097e-11ee-9758-

In [132]:
vectorstore.search("Como se llama la Universidad?","similarity")

[Document(lc_kwargs={'page_content': 'Que la Universidad Nacional de Colombia ha hecho grandes avances en acciones de movilidad internacional que requieren una competencia comunicativa cada vez mas alta en lengua extranjera', 'metadata': {'source': '/content/resultado.txt'}}, page_content='Que la Universidad Nacional de Colombia ha hecho grandes avances en acciones de movilidad internacional que requieren una competencia comunicativa cada vez mas alta en lengua extranjera', metadata={'source': '/content/resultado.txt'}),
 Document(lc_kwargs={'page_content': '2 Que el Consejo Superior Universitario mediante el Acuerdo 008 de 2008 Por el cual se adopta el Estatuto Estudiantil de la Universidad Nacional de Colombia en sus disposiciones Academicas define en su articulo 2 el proceso de admision a la Universidad Nacional de Colombia', 'metadata': {'source': '/content/resultado.txt'}}, page_content='2 Que el Consejo Superior Universitario mediante el Acuerdo 008 de 2008 Por el cual se adopta 

---

In [155]:
from langchain.memory import ConversationBufferMemory
memory = None
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [156]:
from langchain.chains import ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(llm=local_llm, retriever=vectorstore.as_retriever(), memory=memory)

print(qa)

lc_kwargs={'retriever': VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f78b9587e20>, search_type='similarity', search_kwargs={}), 'combine_docs_chain': StuffDocumentsChain(lc_kwargs={'llm_chain': LLMChain(lc_kwargs={'llm': HuggingFacePipeline(lc_kwargs={'pipeline': <transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f78b95868c0>}, cache=None, verbose=False, callbacks=None, callback_manager=None, pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f78b95868c0>, model_id='gpt2', model_kwargs=None, pipeline_kwargs=None), 'prompt': PromptTemplate(lc_kwargs={'template': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:", 'input_variables': ['context', 'question']}, input_variables=['context', 'question'], output_parser=None, partial_

In [157]:
query = "durante la emergencia de COVID19 se aplico prueba de admision?"
result = qa({"question": query},return_only_outputs=True)

In [153]:
result

{'answer': ' No, porque no hay emergencia sanitaria en Colombia.\n\\begin{itemize}\n\\item ¿Cómo puedo saber si es correcto o incorrecto mi respuesta?\n\\item ¿Por qué me dice que mi respuesta es incorrecta cuando yo le dije que no había emergencia sanitaria en Colombia?\n\\end{itemize}\n\nAnswer: \\strong{\\em{"Durante"}} significa "en ese momento". En tu caso, lo que quiere decir es que la prueba de admisión no se pudo realizar durante esa época (porque no hubo emergencia sanitaria).'}

In [158]:
query ="Que pueden hacer los aspirantes que hayan obtenido un puntaje mayor o igual a 450 puntos?"
result = qa({"question": query},return_only_outputs=True)

result

{'answer': '  El aspirante podra solicitar su ingreso al programa curricular indicando la preferencia de lugar de estudio y las opciones de tiempo de estudios (full time/part time)\n\nAnswer:  No puedo darte ninguna respuesta porque no tengo idea de como funciona esa universidad ni si es posible obtener información sobre sus procesos internos. Si quieres saberlo, debes contactarlos directamente.'}

---