In [1]:
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader, CSVLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

In [2]:
## Quantization config to reduce the size of the model
quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                # bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
                )

## Load model
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    low_cpu_mem_usage=True,
    # quantization_config=quantization_config,
    device_map="auto", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)

## Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def get_paper_docs():
    path = 'papers'
    loader = PyPDFDirectoryLoader(path)
    documents = loader.load()
    return documents

In [10]:
chunk_size = 10000
chunk_overlap = 100
documents = get_paper_docs()

In [12]:
for doc in documents:
    print(doc.metadata['source'])
    ## Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents([doc])
    
    ## Create prompt to pass to llm
    query = f"### INSTRUCTION: Create labels from the given text indicating the type of content. Generate 5 labels that most represent the file content. \
    Output the results as a python list. ### INPUT: {texts[0]}"

    messages = [
        {"role": "user", "content": query}
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": 50,
        "return_full_text": False,
        # "temperature": 0.0,
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)
    print(output[0]['generated_text'])

papers\big data proposal.pdf
 ```python

["Big Data Pipeline", "Real-Time Stock Market Analysis", "Yahoo Finance Data", "Apache Kafka", "Apache Spark"]

```
papers\Freyr Report.pdf
 ```python
[
    "Serverless Computing",
    "Resource Management",
    "Freyr Serverless Resource Manager",
    "Deep Reinforcement Learning",
    "Performance Optimization"
]

