In [0]:
# INSTALLS
# Install the third-party packages this notebook relies on (-q keeps pip's output short).
# NOTE(review): no versions are pinned, so a re-run may resolve different releases — consider pinning.
%pip install langchain accelerate transformers ctransformers PyPDF2 pymupdf unstructured[pdf] faiss-cpu -q
# Restart the Python interpreter after the install (see the "Python interpreter will be restarted" output).
dbutils.library.restartPython()

Python interpreter will be restarted.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
qpd 0.4.4 requires antlr4-python3-runtime<4.12,>=4.11.1, but you have antlr4-python3-runtime 4.9.3 which is incompatible.
httpx 0.13.3 requires idna==2.*, but you have idna 3.6 which is incompatible.
botocore 1.34.34 requires urllib3<1.27,>=1.25.4; python_version < "3.10", but you have urllib3 2.2.0 which is incompatible.
boto3 1.21.18 requires botocore<1.25.0,>=1.24.18, but you have botocore 1.34.34 which is incompatible.
Python interpreter will be restarted.


In [0]:
# IMPORTS
import ast
import os
import pickle
from io import BytesIO

import boto3
import pandas as pd
from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader, S3FileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.retrievers import BM25Retriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

In [0]:
# Model identifier and weights file name handed to CTransformers when the LLM is loaded.
MODEL_NAME = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
MODEL_FILE_NAME = "mistral-7b-instruct-v0.2.Q6_K.gguf"

# Generation settings: cap on generated tokens and sampling temperature.
MAX_NEW_TOKENS = 1024
TEMP = 0.2

# Workspace directory for models and the model's folder inside it.
# Note that MODEL_FILE_PATH points at that folder, not at the .gguf file itself.
CACHE_DIR = "/Workspace/Users/ajay.kumar@miqdigital.com/models"
MODEL_FOLDER_NAME = "models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF"
MODEL_FILE_PATH = os.path.join(CACHE_DIR, MODEL_FOLDER_NAME)

### Load LLM

In [0]:
# Load the quantised Mistral model through LangChain's CTransformers wrapper.
# Generation settings have to go in `config`: the wrapper forwards only that dict
# to ctransformers, so passing them as top-level keyword arguments does not apply
# them (they are ignored or rejected, depending on the LangChain version).
llm = CTransformers(
    model=MODEL_NAME,
    model_file=MODEL_FILE_NAME,
    config={
        'max_new_tokens': MAX_NEW_TOKENS,
        'temperature': TEMP,
    },
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.2.Q6_K.gguf:   0%|          | 0.00/5.94G [00:00<?, ?B/s]

### Load Data

In [0]:
# Bucket and object key of the PDF version of the Jira dump
# (presumably S3 — neither value is used by the other visible cells).
bucket_name = "prod-ai-and-automation"
item_name = "proj_pptrag_jiradump_ppts/inputdocs/processed_jiradump_docformat.pdf"

# Path of the processed Jira dump CSV; despite the name this is a file, not a directory.
SOURCE_DIR = "/Workspace/Users/ajay.kumar@miqdigital.com/inputdocs/processed_jiradump.csv"

In [0]:
# Read the processed Jira dump into a DataFrame (pandas is imported in the imports cell).
df = pd.read_csv(SOURCE_DIR)
# Empty frame that collects one row per request: the request text and the generated response.
log = pd.DataFrame(columns=['Question', 'Response'])



In [0]:
# Preview the first rows of the loaded CSV to check its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Issue key,Description,Module_list,SubModule_list
0,0,CCM-5501,,,"[{'slide_number': 1, 'slide_text': 'Cavendish ..."
1,1,CCM-5500,,,"[{'slide_number': 1, 'slide_text': 'ATV POST-C..."
2,2,CCM-5427,"Hi team, \n\nRequest you to provide us with a ...",,"[{'slide_number': 1, 'slide_text': 'ATV POST-C..."
3,3,CCM-5425,Hey Team! Looking to receive the following in ...,,"[{'slide_number': 1, 'slide_text': 'POST - CAM..."
4,4,CCM-5424,"Hi team,\n\n\nRequest you to please help us wi...","['OLV', 'CTV']","[{'slide_number': 1, 'slide_text': 'EDITABLE T..."


### PROMPT TEMPLATE

In [0]:
# Prompt text for the LLM. `{requirement}` and `{deliverable}` are the two
# placeholders that the PromptTemplate built from this string fills in.
template = """
You are a transcriber for my documents. I will give you the requirements of an Account Manager during an ad campaign, and the deliverables of the DnA Team in the form of a PPT. Your job is to transcribe the deliverables of the team in the sequence of slides for which the "slide_text" are mentioned.

The format of the ppt is like this : 'slide_number' and 'slide_text'.

Account Manager's requirement : {requirement}

PPT Deliverable : {deliverable}

For each 'slide_number' please summarize the 'slide_text'

"""

In [0]:
# Declare the template's two placeholders, then couple the prompt with the LLM.
prompt = PromptTemplate(
    input_variables=['requirement', 'deliverable'],
    template=template,
)
chain = LLMChain(llm=llm, prompt=prompt)

### Generate narration

In [0]:
# Positional window of `df` rows to process; `end_index` is exclusive (iloc slicing).
start_index = 1
end_index = 5

# Keep only the rows in the window that have a Description, then expose the
# request text and the slide content as two parallel lists.
rows_with_request = df.iloc[start_index:end_index]
rows_with_request = rows_with_request[rows_with_request['Description'].notna()]
manager_requests = rows_with_request['Description'].tolist()
deliverables = rows_with_request['SubModule_list'].tolist()

In [0]:
# For every request, prompt the LLM with two slides at a time and concatenate the
# answers into a single response that is appended to `log`.


def _parse_slides(raw_deliverable):
    """Return a deliverable as a list of slide dicts.

    Values read from the CSV arrive as the string repr of a list
    (e.g. "[{'slide_number': 1, 'slide_text': ...}]"). Slicing that string would
    hand the LLM two characters instead of two slides, so it is parsed first.
    Lists are returned unchanged; anything else (e.g. a missing NaN cell) yields
    an empty list. Malformed strings raise from `ast.literal_eval`.
    """
    if isinstance(raw_deliverable, str):
        return ast.literal_eval(raw_deliverable)
    if isinstance(raw_deliverable, list):
        return raw_deliverable
    return []


for req, raw_deliverable in zip(manager_requests, deliverables):
    slides = _parse_slides(raw_deliverable)
    answers = []
    # NOTE(review): the `- 3` bound is kept from the original; it leaves the last
    # 2-3 slides un-narrated and produces nothing for decks of three slides or
    # fewer — confirm whether that is intentional.
    for j in range(0, len(slides) - 3, 2):
        response = chain.invoke({'requirement': req, 'deliverable': slides[j:j + 2]})
        answers.append(response['text'])
    log.loc[len(log)] = {'Question': str(req), 'Response': ''.join(answers)}

[{'slide_number': 1, 'slide_text': 'ATV POST-CAMPAIGN INSIGHTS Maple Leaf\xa0\n18/09/2023 to 07/12/2023'}, {'slide_number': 2, 'slide_text': 'CTV\nPerformance'}, {'slide_number': 3, 'slide_text': 'CAMPAIGN\n Households reached\n Video Completion \nRate (VCR)\n 976,874\n 90,987\n 98%\n Advanced TV with ACR\n Impressions\n Performance Summary'}, {'slide_number': 4, 'slide_text': 'CTV Campaign Insights'}, {'slide_number': 5, 'slide_text': 'Incremental Reach for CTV The CTV campaign helped drive an additional 30.15%           %\xa0\nIncremental Reach on CTV\n\x0b LTV-only \nReach CTV + LTV Overlap *\x0b CTV Reach Only *\x0b Shared\n           %    63.56K 27.43K 69.85 30.15'}, {'slide_number': 6, 'slide_text': 'Potential Reach Comparison *Projected to Number of CTV HHs in Canada: 6.5M\n**Projected to CA LTV households: 7.7M (Source: Environics)          %\xa0\nIncremental Reach on CTV\n\x0b Projected LTV-only \nReach CTV + LTV Overlap *\x0b CTV Reach Only *\x0b Shared\n           %    3.95M

In [0]:
# Show the first collected question/response rows.
log.head()

Unnamed: 0,Question,Response
0,Request you to provide us with a FSA list of p...,Slide 1 : Maple Leaf post-campaign insights fo...
1,['Looking to receive the following in the Post...,Slide 1: The first slide of the PPT report is ...


In [0]:
# Write the question/response log to a CSV file, omitting the DataFrame index.
LOG_CSV_PATH = '/Workspace/Users/ajay.kumar@miqdigital.com/logs/jira_log3.csv'
log.to_csv(LOG_CSV_PATH, index=False)