In [25]:
import torch
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import pandas as pd
from transformers import LlamaForSequenceClassification
import json
import tempfile
import os

In [4]:
#Testing applications document extraction method
def _get_page_contents_from_pdf_file_path(pdf_path):
    pdf_loader = PyPDFLoader(pdf_path)
    raw_documents= pdf_loader.load()
    return raw_documents

def _get_page_contents_from_pdf_in_memory(pdf_bytes):
    # Write bytes to a temporary file
    with tempfile.NamedTemporaryFile(delete=False,suffix='.pdf') as temp_pdf_file:
        temp_pdf_file.write(pdf_bytes)
        temp_pdf_file_path = temp_pdf_file.name
    # Load temp file
    pdf_loader = PyPDFLoader(temp_pdf_file_path)
    raw_documents= pdf_loader.load()
    # Delete temporary file
    os.remove(temp_pdf_file_path)
    return raw_documents

In [5]:
def read_docs(pdf_file, from_file_path=True):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    if from_file_path:
        data = _get_page_contents_from_pdf_file_path(pdf_file)
    else:
        data = _get_page_contents_from_pdf_in_memory(pdf_file)


    documents = text_splitter.split_documents(data)
    return documents

In [6]:
file_path = "C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerator/assets/data/Security Evidence Docs/SOC 2/GoogleCloud/Audit-Reports-1720774833381-81ba2e/GCP-[FALL-2023] GCP SOC 2..pdf"
documents = read_docs(file_path)

In [9]:
documents_metadata = []
for i, document in enumerate(documents):
    metadata = document.__dict__
    metadata["id"] = i
    metadata["embedding_id"] = i
    documents_metadata.append(metadata)

In [11]:
with open("documents.json", "w") as f:
    json.dump(documents_metadata, f, indent=4, sort_keys=True)

In [12]:
# Load the JSON data from the file
with open('documents.json') as f:
    data = json.load(f)

# Create a list to store the documents
documents = []

# Iterate over the JSON data and extract the relevant information
for doc in data:
    documents.append({
        'id': doc['id'],
        'metadata_page': doc['metadata']['page'],
        'metadata_source': doc['metadata']['source'],
        'page_content': doc['page_content'],
        'type': doc['type']
    })

# Create a Pandas DataFrame from the list of documents
dataset = pd.DataFrame(documents)

print(dataset.head())  # Print the first few rows of the DataFrame

   id  metadata_page                                    metadata_source  \
0   0              0  C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerato...   
1   1              1  C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerato...   
2   2              1  C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerato...   
3   3              1  C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerato...   
4   4              2  C:/Users/ansutton/Desktop/TPRM/TPRM-Accelerato...   

                                        page_content      type  
0  System and Organization Controls (SOC) 2 Type ...  Document  
1  Table of Contents  \nSECTION I - Google's Mana...  Document  
2  E. Proced ures  .................................  Document  
3  SECTION V - Other Information Provided by Goog...  Document  
4  1 SECTION I - Google's Management Assertion  \...  Document  


In [26]:
llm_transformers = LlamaForSequenceClassification.from_pretrained("llama")

OSError: llama is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [21]:
# Prepare the dataset for training
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # Use the llm model to tokenize and prepare the input
        input_ids = llm.encode(text, return_tensors="pt")
        attention_mask = llm.encode(text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

# Create the TextDataset object
train_dataset = TextDataset(dataset["page_content"])

In [24]:
# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the desired device
llm.to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(llm.parameters(), lr=1e-5)

for epoch in range(5):
    llm.train()
    total_loss = 0
    for batch in train_dataset:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        optimizer.zero_grad()

        outputs = llm(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, outputs)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataset)}")

llm.eval()

AttributeError: 'ChatOllama' object has no attribute 'to'