In [23]:
#read pdf file 
import PyPDF2

# Open the HR policy PDF and collect the extracted text of every page, in page order.
pdf_obj=PyPDF2.PdfReader('./data/HR Policy Manual.pdf')

# One string per page; the pages are joined with '\n' by the cells that consume `data`.
data=[pdf_page.extract_text() for pdf_page in pdf_obj.pages]

# print('\n'.join(data))

In [39]:
from langchain.docstore.document import Document
# Wrap the whole PDF text (pages joined by newlines) in a single LangChain Document.
full_text='\n'.join(data)
docs=Document(page_content=full_text)
docs



# chunking 

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [40]:
#method-1: split the raw text into strings first, then wrap the pieces as Documents
text_splitter=RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=20,
)
joined_pages='\n'.join(data)
split_text=text_splitter.split_text(joined_pages)
texts=text_splitter.create_documents(split_text)
# number of chunks produced
len(texts)

7

In [41]:
#method-2: split the already-built Document directly
text_splitter=RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=20,
)
source_documents=[docs]
texts=text_splitter.split_documents(source_documents)
# number of chunks produced
len(texts)

7

## embedding instance

In [72]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from dotenv import load_dotenv

# Load environment variables from a .env file.
load_dotenv()

# Sentence-transformer embedding model used for the PDF chunks.
embedding_model_name='all-MiniLM-L6-V2'
embdd=SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Dimensionality of the vectors this model produces (needed to size the vector index).
embdd.client.get_sentence_embedding_dimension()

384

## Vector store: Pinecone

In [88]:
from langchain.vectorstores import Pinecone
from pinecone import Pinecone as pc, PodSpec

load_dotenv()

def store_embeddings(chunk_text,embeddings):
  """Embed the document chunks and upload them to the 'pdf-store' Pinecone index.

  Steps:
    1. Delete every index in the project whose name is not 'pdf-store'
       (presumably because the 'gcp-starter' environment only permits a
       single index -- TODO confirm; note that this is destructive).
    2. Create 'pdf-store' if it is not present, sized to the embedding
       dimension reported by ``embeddings.client``.
    3. Embed ``chunk_text`` with ``embeddings`` and upsert into the index.

  Args:
    chunk_text: list of LangChain Documents (the split chunks).
    embeddings: embedding object exposing
      ``client.get_sentence_embedding_dimension()``.

  Returns:
    None.

  NOTE(review): an already existing 'pdf-store' index is reused as-is, so
  calling this repeatedly with the same chunks likely stores duplicates --
  confirm whether the index should be cleared first.
  """
  pc_config=pc()
  index_name='pdf-store'

  for name in pc_config.list_indexes().names():
    if name!=index_name:
      try:
        pc_config.delete_index(name)
        # Report the index that was actually removed, not the target index.
        print(f'delete index {name}')
      except Exception as e:
        # Best effort: keep going, but surface the real failure reason.
        print(f'could not delete index {name}: {e}')

  # Create the target index whenever it is missing, even if some other
  # index could not be deleted above; otherwise the upload below would
  # target an index that does not exist.
  if index_name not in pc_config.list_indexes().names():
    print('creating new index')
    pc_config.create_index(name=index_name,
                    dimension=embeddings.client.get_sentence_embedding_dimension(),
                    metric='dotproduct',
                    spec=PodSpec(environment='gcp-starter'))

  Pinecone.from_documents(chunk_text,embeddings,index_name=index_name)

In [89]:
# Upload the PDF chunks using the sentence-transformer embeddings.
store_embeddings(chunk_text=texts,embeddings=embdd)

# retrieve

In [90]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Pinecone 

def get_embedding(model_name='all-MiniLM-L6-V2'):
  """Build the sentence-transformer embedding object used for retrieval.

  Args:
    model_name: sentence-transformers model to load. Defaults to the model
      used elsewhere in this notebook; it must match the model that
      produced the vectors stored in the index being queried.

  Returns:
    A ``SentenceTransformerEmbeddings`` instance.
  """
  return SentenceTransformerEmbeddings(model_name=model_name)


def get_index(embedding,index_name='pdf-store'):
  """Connect to an existing Pinecone index as a LangChain vector store.

  Args:
    embedding: embedding object used to embed queries against the index.
    index_name: name of the Pinecone index to open. Defaults to
      'pdf-store', the index name used by ``store_embeddings``.

  Returns:
    The vector store returned by ``Pinecone.from_existing_index``.
  """
  return Pinecone.from_existing_index(index_name,embedding)

def get_relevant_docs(index,query,k=2):
  """Return the result of ``index.similarity_search`` for ``query``.

  Args:
    index: object exposing ``similarity_search(query, k)``.
    query: the search text.
    k: number of results requested (default 2).
  """
  matches=index.similarity_search(query,k)
  return matches

In [100]:
# Question to answer from the indexed HR manual.
query='how many hours does driver trained?'

# Build the embedding object, open the index, and fetch the closest chunks.
embedding=get_embedding()
index=get_index(embedding)
# NOTE: `docs` previously held the whole-PDF Document; from here on it holds
# the retrieved chunks for `query`.
docs=get_relevant_docs(index,query,k=2)

In [111]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
# Question-answering chain of type 'refine' over the retrieved chunks.
llm=OpenAI()
chain=load_qa_chain(llm,chain_type='refine',verbose=True)
chain.run(input_documents=docs,question=query)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mContext information is below. 
------------
drivers completed over 2,000 hours of driving training to enhance their skills and knowledge.  
 
Route Planning and Optimization  
Efficient route planning is essential for timely transportation services. Our department utilizes 
advanced routing software to optimize routes and minimize travel time. In the past year, we reduced 
our average route duration  by 15% through effective route planning and optimization strategies.  
 
Customer Service  
We prioritize exceptional customer service. Our drivers are trained to provide a friendly and 
respectful experience to all passengers. In the past year, we received an average customer 
satisfaction rating of 4.5 out of 5, demonstrating our commitment to meeting customer needs and 
exceeding their expectations.  
 
Incident Reporting and Investigation  
Accidents o

'\n\nIt is stated that drivers completed over 2,100 hours of driving training in the past year, with a focus on defensive driving, customer service, and emergency preparedness. Additionally, they completed ongoing professional development training and participated in 20 compliance audits to ensure adherence to regulations.'

# Ticket classification: model training and evaluation

In [113]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
import pandas as pd

In [118]:
# Load the support tickets; the column names are supplied explicitly.
# NOTE: this rebinds `data`, which earlier held the list of PDF page texts.
ticket_columns=['Query','Class']
data=pd.read_csv('./data/Tickets.csv',names=ticket_columns)
data.head()

Unnamed: 0,Query,Class
0,"The bus arrived late again, causing me to miss...",Transportation
1,"The taxi driver took a longer route, resulting...",Transportation
2,There's no proper signage at the train station...,Transportation
3,The flight was delayed for hours without any p...,Transportation
4,The bus driver was rude and unprofessional dur...,Transportation


In [125]:
# Tickets per class, smallest class first.
data['Class'].value_counts(ascending=True)

Class
Transportation    48
HR                64
IT                65
Name: count, dtype: int64

In [133]:
# Balance the classes: down-sample every class to the size of the rarest one,
# then attach an embedding vector to each sampled query.

# size of the smallest class
min_sample_value=data['Class'].value_counts().min()

# one fixed-seed sample per class, in order of first appearance in `data`
sample_data=[
  data.loc[data['Class']==label].sample(n=min_sample_value,random_state=42)
  for label in data['Class'].unique()
]

sample_df=pd.concat(sample_data)
# `embedding` is the embedding object created in the retrieval section above.
sample_df['embeddings']=sample_df['Query'].apply(embedding.embed_query)
sample_df['Class'].value_counts()

Class
Transportation    48
IT                48
HR                48
Name: count, dtype: int64

In [134]:
# Preview the first rows of the balanced, embedded sample.
sample_df.head(n=5)

Unnamed: 0,Query,Class,embeddings
27,The airline canceled my flight without any pri...,Transportation,"[0.10006428509950638, 0.004293533973395824, 0...."
40,The bus driver was constantly honking unnecess...,Transportation,"[0.07638011872768402, 0.07700242102146149, 0.0..."
26,"The train platform was overcrowded, and there ...",Transportation,"[0.07457822561264038, 0.03086264245212078, -0...."
43,The taxi driver dropped me off at the wrong lo...,Transportation,"[0.00734744081273675, 0.05630889907479286, 0.0..."
24,"The taxi meter was tampered with, resulting in...",Transportation,"[-0.01720990426838398, 0.0691705197095871, -0...."


In [145]:
#train and test split 
from sklearn.model_selection import train_test_split

# 80/20 split of the raw query text and its class label, with a fixed seed.
x_train,x_test,y_train,y_test=train_test_split(
  sample_df.Query,
  sample_df.Class,
  test_size=0.2,
  random_state=42,
)

In [154]:
# training model 
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


# Text baseline: TF-IDF features -> scaling without centering -> SVM with
# class-balanced weights.
tfidf_steps=(
  TfidfVectorizer(),
  StandardScaler(with_mean=False),
  SVC(class_weight='balanced'),
)
pipeline=make_pipeline(*tfidf_steps)

pipeline.fit(x_train,y_train)

In [156]:
# Score of the fitted pipeline on the held-out split.
pipeline.score(x_test,y=y_test)

0.9655172413793104

In [169]:
# another approach: classify on the precomputed sentence embeddings instead of TF-IDF.
# NOTE: this rebinds x_train/x_test/y_train/y_test and `pipeline` from the cells above.
embedding_features=sample_df['embeddings'].tolist()
class_labels=sample_df['Class'].tolist()
x_train,x_test,y_train,y_test=train_test_split(
  embedding_features,
  class_labels,
  test_size=0.2,
  random_state=42,
)

# Standardize the embedding features, then fit an SVM with class-balanced weights.
pipeline=make_pipeline(
  StandardScaler(),
  SVC(class_weight='balanced'),
)

pipeline.fit(x_train,y_train)

pipeline.score(x_test,y_test)

1.0

In [171]:
# Classify one new ticket: embed it, then predict with the embedding-based pipeline.
query='Rude driver with scary driving'
query_embedd=embedding.embed_query(query)
predicted_classes=pipeline.predict([query_embedd])
predicted_classes[0]

'Transportation'

In [172]:
#model save
import os
import joblib

# Persist the most recently fitted `pipeline` (the embedding-based one when the
# cells are run top to bottom), so inputs at load time must be embedding vectors.
model_path='./models/modelsvm.pkl'
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_path),exist_ok=True)
joblib.dump(pipeline,model_path)

['./models/modelsvm.pkl']