In [1]:
import pandas as pd
from pprint import pprint 
import json
import openai
import re


import langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings


import os
import pinecone


from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

  from tqdm.autonotebook import tqdm


True

In [2]:
# Define the path to the JSON file
file_path = "./Data/jira-conversations2.json"

# Read the JSON file with utf-8 encoding
with open(file_path, "r", encoding='utf-8') as f:
    data = json.load(f)

In [3]:
json_data = data[0:9]

In [4]:
def clean_text(text):
    text = ' '.join(text.split())  # Remove extra whitespaces and newlines
    
    # Remove different kinds of unwanted patterns from text
    # Remove unwanted patterns
    text = re.sub(r'~+\+~+', '', text)  # Remove ~+~+
    text = re.sub(r'\+\~', '', text)  # Remove +~
    text = re.sub(r'----', '', text)  # Remove ----
    text = re.sub(r'\+\+', '', text)    # Remove ++
    

    text = re.sub(r'~accountid:[a-zA-Z0-9]+', '', text)
    text = re.sub(r'\{"type".*?\}\]', '', text)
    text = re.sub(r'\{adf\}.*?\{adf\}', '', text)  # Remove "{adf} ... {adf}" and content within
    text = re.sub(r'\[~~email~~-?\d+~~\]', '', text)  # Remove `[~~email~~-numbers~~]`
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    text = re.sub(r'<\[\[~~email~~-?\d+~~\]\]', '', text)  # Remove `<[[~~email~~-numbers~~]]>`
    text = re.sub(r'\|mailto:\[~~email~~-?\d+~~\]', '', text)  # Remove email tags
    text = re.sub(r'—-—-—-—', '', text)  # Remove "—-—-—-—"
    text = re.sub(r'\w{3}, \d{1,2} \w{3} \d{4}, \d{1,2}:\d{2} [apmAPM]{2}', '', text)  # Remove timestamps
    text = re.sub(r'\|\s+\|\s+You don\'t often get email from', '', text)  # Remove headers
    text = re.sub(r'\[Powered by Service Management.*?\]', '', text)  # Remove "[Powered by...]"
    text = re.sub(r'\[View request.*?&reserved=0\]', '', text)  # Remove "[View request...]"
    text = re.sub(r'\*\*\*Please reply above this line\*\*\*', '', text)  # Remove reply line
    text = re.sub(r'\|', '', text)  # Remove "|"
    text = re.sub(r'_', '', text)  # Remove "_"
    text = re.sub(r'\[mailto:\]', '', text)  # Remove "[mailto:]"
    text = re.sub(r'\[|\]', '', text)  # Remove "[" and "]"
    text = re.sub(r'<|>', '', text)  # Remove "<" and ">"
    text = re.sub(r'\*', '', text)  # Remove "*"
    text = re.sub(r'!jira[-a-zA-Z0-9 ()]+!', '', text)  # Remove Jira text
    
    return text

# Clean text in conversations
for conv in json_data:
    for key, value in conv.items():
        try:
            conv[key] = clean_text(value)
        except Exception as e:
            print(f"Error cleaning text in '{key}': {e}")

In [5]:
pprint(json_data)

[{'question000001': 'Hii This is  gadipally UID-U6331114. In process of '
                    'submitting DS160. I need to send address and phone number '
                    'of point of contact(school official).So can you please '
                    'send address and phone number of IRIS BRITO(School '
                    'official to contact uon arrival).Can i also know the '
                    'first and last name of IRIS BRITO. Thank you. ',
  'response000002': 'Thank you for reaching out. You are able to add  at . '
                    'Best, '},
 {'question000001': 'Hello, I am  with ID - U20309912. I have not received any '
                    'mail regarding academic integrity course . Can you please '
                    'send the link for academic integrity course and details '
                    'regarding it?! Thanks in advance. ',
  'question000003': 'Hello, USF mail Id is  Thanks in advance   ',
  'question000005': 'Hello , I didn’t get any canvas invite to my mail. C

In [6]:
# Import utility for splitting up texts and split up the explanation given above into document chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 0,
)

In [7]:
dic_sp = []
dic_key = []

for x in json_data:
    for key,value in x.items():
        dic_key.append(key)
        dic_sp.append(text_splitter.create_documents([x[key]]))

In [8]:
dic_key

['question000001',
 'response000002',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000005',
 'response000006',
 'question000001',
 'response000002',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000005',
 'question000001',
 'response000002',
 'question000003',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000005',
 'response000006',
 'question000007',
 'response000008',
 'question000009',
 'response000010',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000005',
 'response000006',
 'question000007',
 'response000008',
 'question000009',
 'response000010',
 'question000011',
 'response000012',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000001',
 'response000002',
 'question000003',
 'response000004',
 'question000005']

In [9]:
# Assuming dic_sp contains the list of Document objects as described
vectors = []  # To store the vectors for each page_content
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002")

# Iterate through each list of Document objects
for doc_list in dic_sp[1:3]:
    # Iterate through each Document and extract the page_content
    for doc in doc_list:
        page_content = doc.page_content
        # Convert the page_content to a vector using embeddings
        vector = embeddings.embed_query(page_content)
        vectors.append(vector)

# Print the vectors
for i, vector in enumerate(vectors):
    print(f"Vector for chunk {i + 1}: {vector}")

Vector for chunk 1: [-0.0023755780147651753, -0.005913672900649822, -0.013774982494940574, -0.040597110519237534, -0.01498804410424756, 0.046797200498541386, -0.021579010163616623, -0.012083436223305677, -0.007716416758106941, 0.010102103347980593, 0.02776562018013071, 0.0038346210053084345, 0.00793207148269522, -0.026876043218843196, 0.008080335240899036, -0.015540661111338663, 0.008875563638528972, 0.016214582882376618, -0.0029383035969647547, -0.004720829372882355, -0.01049971754679556, 0.0030343376216323374, 0.01548674684811499, 0.01287192650218586, -0.017077205506019988, -0.01951680496213347, 0.017225468332901242, -0.02023116289625046, 0.012231699981509494, 0.0008247130804725125, 0.0003767650007192627, 0.006065305718228516, -0.02973347495531116, -0.026350382412041365, -0.012885404602330494, -0.01785895580350529, -0.007581632031370324, -0.0102975413880132, 0.031054364159742925, 0.006307917574428631, 0.04027362866518575, -0.0006103213456961907, 0.003955927166239133, -0.0217811853910

In [10]:
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

In [11]:
pinecone.delete_index("langchain-index")

In [12]:
pinecone.create_index("langchain-index", dimension=1536) # 1536 is openai ada embedding dimension
index_name = "langchain-index"

In [13]:
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002") #EXPENSIVE - - - USE CAREFULLY

In [14]:
print(f"Total number of Documents is: {len(dic_sp)}")

for i in range(len(dic_sp)):
    try:
        document = dic_sp[i]
        print(f"Processing document {i + 1}")
        search = Pinecone.from_documents(document, embeddings, index_name=index_name)
    except Exception as e:
        print(f"Error processing document {i + 1}: {e}")


Total number of Documents is: 49
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing document 39
Processing document 40
Processing document 41
Processing document 42
Processing

In [15]:
# Do a simple vector similarity search
query = "I am writing this mail to inquire"

result = search.similarity_search(query)
result

[Document(page_content='Sir/Madam, Greetings of the day!!! This is , Thanks for giving the admission and i-20 for BAIS (Fall 2023) . I am writing this mail to inquire about the prerequisite courses that I need to take up before the start of my First class in Aug 2023 ? Can you please verify my profile and do the needful. Below are the details for your reference: -  USF ID: U58683362 Undergraduate Major : Computer Science Job Experience : Yes (Software Engineer) I look forward to hearing from you. ,'),
 Document(page_content='Hello, USF mail Id is  Thanks in advance'),
 Document(page_content='Hello, provide me with your USF email.'),
 Document(page_content='Hi  Can you please process my request as soon as possible. Regards, .')]

In [16]:
dic_sp

[[Document(page_content='Hii This is  gadipally UID-U6331114. In process of submitting DS160. I need to send address and phone number of point of contact(school official).So can you please send address and phone number of IRIS BRITO(School official to contact uon arrival).Can i also know the first and last name of IRIS BRITO. Thank you.', metadata={'text': 'Hii This is  gadipally UID-U6331114. In process of submitting DS160. I need to send address and phone number of point of contact(school official).So can you please send address and phone number of IRIS BRITO(School official to contact uon arrival).Can i also know the first and last name of IRIS BRITO. Thank you.'})],
 [Document(page_content='Thank you for reaching out. You are able to add  at . Best,', metadata={'text': 'Thank you for reaching out. You are able to add  at . Best,'})],
 [Document(page_content='Hello, I am  with ID - U20309912. I have not received any mail regarding academic integrity course . Can you please send the 