In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant\\research'

In [3]:
# Move from the notebook folder (research/) up to the project root so that the
# relative paths used below (config/, artifacts/) resolve. Guarded so that
# re-running this cell does not climb one directory higher each time.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

# Data Ingestion

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the loaded config.
    """
    root_dir: Path  # stage directory; created by the configuration manager
    github_url: str  # endpoint requested by DataIngestion.get_repos
    linkedin_url: str  # URL fetched by DataIngestion.downloadFiles
    local_data_file: Path  # download target; also the archive extractZip opens
    save_dir: Path  # receives data.json and the extracted archive contents

In [6]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [7]:
type(CONFIG_PATH)

pathlib.WindowsPath

In [8]:
class ConfigurationManager:
    """Loads the config and params YAML files and builds stage config objects."""

    def __init__(
            self,
            config_path = CONFIG_PATH,
            params_path = PARAMS_PATH
        ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_path: path of the main config YAML file.
            params_path: path of the params YAML file.
        """
        self.config = read_yaml(config_path)
        # ``params`` is the attribute name used by the data-transformation
        # ConfigurationManager later in this notebook; ``param`` is kept as an
        # alias so code written against the original name keeps working.
        self.params = read_yaml(params_path)
        self.param = self.params

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self)->DataIngestionConfig:
        """Build the ingestion settings from the ``data_ingestion`` config section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataIngestionConfig: path fields are converted to ``Path`` so the
            object matches the types its dataclass declares.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            github_url=config.github_url,
            linkedin_url=config.linkedin_url,
            local_data_file=Path(config.local_data_file),
            save_dir=Path(config.save_dir),
        )

In [9]:
import os
import requests
from bot import logger
from dotenv import load_dotenv
load_dotenv()
import json
import zipfile
from urllib import request

In [10]:
class DataIngestion:
    """Fetches the raw inputs: GitHub repository metadata and a zipped download."""

    # Upper bound, in seconds, on how long the GitHub request may block.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def get_repos(self):
        """Request ``github_url`` and store the JSON response as data.json.

        The file is written to ``<save_dir>/data.json``. The request is
        authenticated with the ``TOKEN`` environment variable.

        Raises:
            requests.HTTPError: when the server answers with an error status,
                so an error payload is never saved in place of the repo list.
        """
        logger.info("Starting writing data from git hub repos")
        headers = {"Authorization": f"token {os.getenv('TOKEN')}"}
        response = requests.get(
            self.config.github_url,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        repos = response.json()

        save_dir = self.config.save_dir
        # Nothing else creates save_dir before this point (extractZip does,
        # but it runs after get_repos), so make sure it exists.
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "data.json"), 'w', encoding='utf-8') as f:
            json.dump(repos, f, indent=4)
        logger.info("Completed writing data from git hub repos")

    def downloadFiles(self):
        """Download ``linkedin_url`` to ``local_data_file`` unless it already exists."""
        target = self.config.local_data_file
        if not os.path.exists(target):
            parent = os.path.dirname(target)
            if parent:
                # urlretrieve cannot create missing parent directories.
                os.makedirs(parent, exist_ok=True)
            filename, headers = request.urlretrieve(
                url=self.config.linkedin_url,
                filename=target,
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            # Report the size the message always announced but never included.
            logger.info(f"file already exists of size: {os.path.getsize(target)} bytes")

    def extractZip(self):
        """Extract the archive at ``local_data_file`` into ``save_dir``."""
        unzip_path = self.config.save_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [11]:
# Run the ingestion stage end to end. A failure is logged and then re-raised
# with a bare ``raise`` so the original traceback reaches the notebook as-is.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.get_repos()
    data_ingestion.downloadFiles()
    data_ingestion.extractZip()
except Exception:
    logger.exception("Data ingestion stage failed")
    raise

[2025-03-17 19:40:41,451: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-17 19:40:41,460: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-17 19:40:41,466: INFO: common: created directory at: artifacts]
[2025-03-17 19:40:41,472: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-17 19:40:41,475: INFO: 1269584388: Starting writing data from git hub repos]


[2025-03-17 19:40:42,148: INFO: 1269584388: Completed writing data from git hub repos]
[2025-03-17 19:40:42,152: INFO: 1269584388: file already exists of size]


# Data Transformation

In [12]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTrasformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Built by ``ConfigurationManager.get_data_transformation_config`` from the
    ``data_transformation`` section of the loaded config.
    """
    root_dir: Path  # stage directory; created by the configuration manager
    file_dir: Path  # input directory holding data.json and the source PDFs
    save_dir: Path  # output directory for the copied and generated PDFs

In [14]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [15]:
class ConfigurationManager:
    """Loads the config and params YAML files for the data-transformation stage.

    Note: this reuses the name of the ConfigurationManager defined in the
    data-ingestion section. Once this cell has run, the name refers to this
    class, which does not provide ``get_data_ingestion_config``.
    """

    def __init__(
            self,
            config_path = CONFIG_PATH,
            params_path = PARAMS_PATH
        ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_path: path of the main config YAML file.
            params_path: path of the params YAML file.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self)->DataTrasformationConfig:
        """Build the settings from the ``data_transformation`` config section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTrasformationConfig: fields are converted to ``Path`` so the
            object matches the types its dataclass declares.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        return DataTrasformationConfig(
            root_dir=Path(config.root_dir),
            file_dir=Path(config.file_dir),
            save_dir=Path(config.save_dir),
        )
    
    

In [16]:
import os
from reportlab.lib.pagesizes import letter,A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Preformatted
from bot import logger
import json
import markdown2
import html2text
import base64
import shutil
from dotenv import load_dotenv
load_dotenv()

True

In [17]:
import json
from datetime import datetime

In [18]:
# Scratch check: read the ingested repository list and order it newest-first
# by each entry's "updated_at" timestamp.
with open(os.path.join("artifacts/data_ingestion",'data.json'),'r',encoding='utf-8') as f:
    data = json.loads(f.read())

# Sort a copy so that ``data`` keeps the order it had in the file.
sorted_repos = list(data)
sorted_repos.sort(
    key=lambda repo: datetime.strptime(repo["updated_at"], "%Y-%m-%dT%H:%M:%SZ"),
    reverse=True,
)

In [19]:
class DataTransformation:
    """Builds the PDF files of the transformation stage from the ingested data.

    ``get_pdf_from_data`` is the entry point; the other methods are the
    individual steps it relies on.
    """

    # Upper bound, in seconds, on each GitHub API call.
    REQUEST_TIMEOUT = 30
    # Layout of the "updated_at" values parsed by this class.
    TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, config: "DataTrasformationConfig"):
        self.config = config

    def _read_repos(self):
        """Return the parsed contents of ``<file_dir>/data.json``."""
        with open(os.path.join(self.config.file_dir, 'data.json'), 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_data(self):
        """Copy the PDFs from ``file_dir`` to ``save_dir`` and load data.json.

        Returns:
            The parsed contents of ``<file_dir>/data.json``.
        """
        file_dir = self.config.file_dir
        save_dir = self.config.save_dir
        # shutil.copy2 does not create missing directories.
        os.makedirs(save_dir, exist_ok=True)
        for file_name in os.listdir(file_dir):
            if file_name.lower().endswith(".pdf"):  # match the extension in any case
                shutil.copy2(
                    os.path.join(file_dir, file_name),
                    os.path.join(save_dir, file_name),
                )
        return self._read_repos()

    def get_last_updated(self):
        """Return the repositories from data.json, most recently updated first."""
        return sorted(
            self._read_repos(),
            key=lambda repo: datetime.strptime(repo["updated_at"], self.TIMESTAMP_FORMAT),
            reverse=True,
        )

    def get_lang(self, repo_name, owner="appsbotta"):
        """Return up to three language names of a repository, largest value first.

        Args:
            repo_name: repository name.
            owner: account that owns the repository. Defaults to the account
                this method was originally hard-wired to.

        Returns:
            list[str]: language names, or an empty list when the request does
            not succeed.
        """
        url = f'https://api.github.com/repos/{owner}/{repo_name}/languages'
        headers = {"Authorization": f"token {os.getenv('TOKEN')}"}
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            # A non-200 body is not a language map; ranking its keys would
            # report them as languages.
            logger.info(
                f"Could not fetch languages for {owner}/{repo_name} "
                f"(HTTP {response.status_code})"
            )
            return []
        lang = response.json()
        return sorted(lang, key=lang.get, reverse=True)[:3]

    def get_timings(self, repo):
        """Return the date part (first 10 characters) of the repo's timestamps.

        A timestamp whose value is ``None`` is reported as an empty string.
        """
        return {
            key: (repo[key] or "")[:10]
            for key in ("updated_at", "created_at", "pushed_at")
        }

    def get_readme(self, OWNER, REPO):
        """Fetch a repository README and return it as text.

        Returns:
            str: the decoded README passed through markdown2 and html2text, or
            the placeholder "No Readme FIle" when the request does not return
            HTTP 200.
        """
        readme = f"https://api.github.com/repos/{OWNER}/{REPO}/readme"
        headers = {"Authorization": f"token {os.getenv('TOKEN')}"}
        response = requests.get(readme, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return "No Readme FIle"
        data = response.json()
        # The API response carries the file body base64-encoded in "content".
        readme_content = base64.b64decode(data["content"]).decode("utf-8")
        readme_content = markdown2.markdown(readme_content, extras=["strip"])
        return html2text.html2text(readme_content)

    def get_pdf_from_data(self):
        """Write ``last.pdf`` and ``repo.pdf`` into ``save_dir``.

        ``last.pdf`` lists every repository with its last update time, newest
        first. ``repo.pdf`` holds a numbered index of repositories followed by
        one section per repository (timestamps, top languages, README).
        """
        logger.info("Saving data to a pdf file started")
        save_dir = self.config.save_dir
        data = self.get_data()

        doc = SimpleDocTemplate(os.path.join(save_dir, 'repo.pdf'), pagesize=A4)
        doc1 = SimpleDocTemplate(os.path.join(save_dir, 'last.pdf'), pagesize=A4)
        styles = getSampleStyleSheet()
        custom_style = ParagraphStyle(
            'Custom',
            parent=styles['Normal'],
            fontName='Courier',
            fontSize=10,
            leading=14,  # Line height
            spaceAfter=5,
        )
        heading_style = ParagraphStyle(
            "HeadingStyle",
            parent=styles["Heading1"],
            fontName="Courier-Bold",
            fontSize=14,
            leading=22,
        )

        # last.pdf: one line per repository, newest update first.
        content = []
        for repo in self.get_last_updated():
            updated = datetime.strptime(repo["updated_at"], self.TIMESTAMP_FORMAT)
            text = f"{repo['name']} last Updated at {updated}"
            content.append(Paragraph(text, custom_style))
        doc1.build(content)

        # repo.pdf, part 1: numbered index of repository names and URLs.
        content = []
        for i, repo in enumerate(data):
            text = f"{i+1}. <b>{repo['name']}</b>  --->   {repo['html_url']} \n"
            content.append(Paragraph(text, custom_style))
        content.append(Spacer(1, 5))

        # repo.pdf, part 2: one section per repository.
        for repo in data:
            OWNER = repo['owner']['login']
            REPO = repo['name']
            readme = self.get_readme(OWNER, REPO)
            timings = self.get_timings(repo)
            # Look the languages up under the repository's real owner.
            lang = self.get_lang(REPO, OWNER)

            content.append(Paragraph(REPO, heading_style))
            for key, value in timings.items():
                content.append(Paragraph(f"{key} -> {value}", custom_style))
            # join() avoids the dangling ", " after the last language.
            content.append(Paragraph("   " + ", ".join(lang), custom_style))
            content.append(Preformatted(readme, custom_style))
            content.append(Spacer(1, 5))

        doc.build(content)
        logger.info("saving data to a pdf file is completed")
    
    
        

In [20]:
# Run the transformation stage. A failure is logged and then re-raised with a
# bare ``raise`` so the original traceback reaches the notebook as-is.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.get_pdf_from_data()
except Exception:
    logger.exception("Data transformation stage failed")
    raise

[2025-03-17 19:41:16,177: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-17 19:41:16,183: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-17 19:41:16,188: INFO: common: created directory at: artifacts]
[2025-03-17 19:41:16,194: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-17 19:41:16,198: INFO: 538437088: Saving data to a pdf file started]
[2025-03-17 19:41:38,885: INFO: 538437088: saving data to a pdf file is completed]


# Bot

In [21]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [42]:
# NOTE(review): this cell's execution count (42) is out of sequence with the
# cells around it, and its output shows the working directory ending up one
# level above the project root. The relative "artifacts/..." path used by the
# next cell would not resolve from there, so this looks like a leftover
# navigation cell rather than a step of a top-to-bottom run; confirm and drop.
import os
os.chdir('../')
%pwd

'c:\\Users\\Appsb\\Desktop'

In [22]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
# Load the PDFs in the transformation stage's output folder. The path is
# relative, so the working directory must be the project root.
loader = PyPDFDirectoryLoader("artifacts/data_transformation")
docs = loader.load()
# Displays the full list of loaded documents (metadata and page text).
docs

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-03-17T19:41:16+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-03-17T19:41:16+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'artifacts\\data_transformation\\last.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Portifolio last Updated at 2025-03-17 10:53:09\nAI_Assistant last Updated at 2025-03-17 06:23:46\nDataSets last Updated at 2025-03-16 16:15:56\nText-Summarizer last Updated at 2025-03-16 14:30:46\nLlamaindex last Updated at 2025-03-14 14:54:48\nCrewAI last Updated at 2025-03-13 10:41:10\nLangChain last Updated at 2025-03-12 07:26:04\nQ-A_ChatBot_with_Implementation last Updated at 2025-03-11 14:50:44\nAgenticAi last Updated at 2025-03-10 01:52:18\nAWS_BEDROCK last Updated at 2025-03-07 03:27:03\nKidney_diease_end_to_end last Updated at 2025-03-05 13:55:46\nDL-End_to_End la

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks for embedding, using the size and
# overlap configured here.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 10000,chunk_overlap=100)
documents = text_splitter.split_documents(docs)
# Displays every resulting chunk.
documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-03-17T19:41:16+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-03-17T19:41:16+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'artifacts\\data_transformation\\last.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Portifolio last Updated at 2025-03-17 10:53:09\nAI_Assistant last Updated at 2025-03-17 06:23:46\nDataSets last Updated at 2025-03-16 16:15:56\nText-Summarizer last Updated at 2025-03-16 14:30:46\nLlamaindex last Updated at 2025-03-14 14:54:48\nCrewAI last Updated at 2025-03-13 10:41:10\nLangChain last Updated at 2025-03-12 07:26:04\nQ-A_ChatBot_with_Implementation last Updated at 2025-03-11 14:50:44\nAgenticAi last Updated at 2025-03-10 01:52:18\nAWS_BEDROCK last Updated at 2025-03-07 03:27:03\nKidney_diease_end_to_end last Updated at 2025-03-05 13:55:46\nDL-End_to_End la

In [24]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
# Embed the chunks with OpenAI and index them in FAISS. This issues a request
# to the OpenAI embeddings endpoint (see the log output of this cell).
db = FAISS.from_documents(documents=documents,embedding=OpenAIEmbeddings())

[2025-03-17 19:42:12,077: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]
[2025-03-17 19:42:13,051: INFO: loader: Loading faiss.]
[2025-03-17 19:42:13,259: INFO: loader: Successfully loaded faiss.]
[2025-03-17 19:42:13,276: INFO: __init__: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes.]


In [25]:
# Quick check of the index: look up the chunks closest to a sample query.
query = "list all projects"
res = db.similarity_search(query)
res

[2025-03-17 19:42:17,812: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]


[Document(id='f2a3ab13-6312-485b-b1c3-be6320eb4266', metadata={'producer': 'Microsoft® Excel® 2016', 'creator': 'Microsoft® Excel® 2016', 'creationdate': '2025-03-16T21:36:55+05:30', 'author': 'Lokesh Apparao Botta', 'moddate': '2025-03-16T21:36:55+05:30', 'source': 'artifacts\\data_transformation\\Skills.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Skills\nEnd-to-End Project Management\nContinuous Integration and Continuous Delivery (CI/CD)\nAmazon Web Services (AWS)\nMLOps\nArtificial Intelligence (AI)\nLarge Language Models (LLM)\nAgentic AI\nGoogle Gemini\nGenerative AI\nLangChain\nNatural Language Processing (NLP)\nData Science\nTensorFlow\nDeep Learning\nMachine Learning\nPython (Programming Language)\nObject-Oriented Programming (OOP)\nSQL\nDjango\npandas\nNumPy\nScikit-Learn\nSoftware Development\nData Structures\nAlgorithms\nMatplotlib\nC++\nReact.js\nComputer networks\nNode.js\nCommunication\nProject Management'),
 Document(id='8e637168-9460-42af-914c-

In [26]:
import os
from dotenv import load_dotenv
load_dotenv()
# Fail early with a clear message when the key is missing. The previous
# self-assignment (os.environ[...] = os.getenv(...)) changed nothing when the
# key was set and raised an opaque TypeError when it was not.
if "OPENAI_API_KEY" not in os.environ:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to the environment or to the .env file"
    )


In [27]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import mlflow
# Chat model that generates the answers; the model name is pinned here.
llm = ChatOpenAI(model='gpt-3.5-turbo')

In [34]:
# Prompt for the answer step. It has two placeholders: {context}, filled with
# the retrieved documents, and {input}, the user's question. The template
# instructs the model to answer only from the context and to reply with a
# fixed sentence when the context is insufficient.
prompt = ChatPromptTemplate.from_template(
    """
You are Lokesh's personal AI assistant. You have been provided with a context that includes detailed information about Lokesh’s projects (project names, project URLs, updated_at, created_at, and pushed_at timestamps) as well as his personal details such as education, skills, and work experience.

When a user asks a question related to Lokesh—whether it's about his projects, education, skills, or work experience—follow these guidelines:

Use Only Provided Information:
Answer strictly based on the context provided. Do not assume or add any external information.

Detail and Accuracy:
Extract and present accurate details from the context. For projects, include names, URLs, and date details as available.

Step-by-Step Reasoning (if needed):
If the question requires it, explain your reasoning by referring to the specific fields in the context.

Insufficient Data Response:
If the context does not have enough information to answer the question, reply exactly with:
"Sorry, I don't have access to that data yet."

Now, answer the user's question based solely on the provided context.

<context>
{context}
</context>
question:{input}
"""
)


In [35]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# Chain that formats the retrieved documents into the prompt's {context} slot
# and sends the filled prompt to the chat model.
document_chain = create_stuff_documents_chain(
    llm,
    prompt,
)


In [36]:
# Retriever over the FAISS index, configured with k=30 in its search kwargs.
retriever = db.as_retriever(search_kwargs={"k": 30})
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019D96F936D0>, search_kwargs={'k': 30})

In [37]:
from langchain.chains import create_retrieval_chain
# Full question-answering pipeline: the retriever fills "context" from the
# "input" question, then the document chain produces "answer" (see the
# structure printed below this cell).
retriever_chain = create_retrieval_chain(retriever,document_chain)
retriever_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019D96F936D0>, search_kwargs={'k': 30}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nYou are Lokesh\'s personal AI assistant. You have been provided with a context that includes detailed information about Lo

In [13]:
def model(input_df):
    """Run the retrieval chain on every question in ``input_df``.

    Args:
        input_df: DataFrame with a "questions" column.

    Returns:
        list: one chain result per row, in row order. Each result is what
        ``retriever_chain.invoke`` returns for that question.
    """
    # The chain is run with .invoke and an {"input": ...} payload, the same
    # way the rest of this notebook uses it; it is not called like a function.
    return [
        retriever_chain.invoke({"input": question})
        for question in input_df["questions"]
    ]

In [None]:
import pandas as pd

In [None]:
# Evaluation set: one question per row in a single "questions" column.
eval_df = pd.DataFrame(
    [
        "who are you?",
        "who is lokesh?",
        "what is the highest education done by lokesh",
        "what experience does lokesh have",
    ],
    columns=["questions"],
)

In [None]:
from mlflow.metrics.genai import EvaluationExample, faithfulness

# Create a good and bad example for faithfulness in the context of this problem
# Two graded examples for the faithfulness metric, both scored against the
# same context: a low-scoring answer that contradicts the context and a
# high-scoring answer whose every claim the context supports.
faithfulness_examples = [
  EvaluationExample(
      input="What is latest project lokesh worked on?",
      output="The latest project Lokesh worked on is the 'StockPrediction' project. This project was updated, created, and pushed to GitHub on August 24, 2023.",
      score=2,
      justification="The output names 'StockPrediction' as the latest project, but the context shows 'Portifolio' was updated more recently (2025-03-17). The creation and push dates it states do not appear in the context either.",
      grading_context={
          "context": """
          Portifolio updated_at -> 2025-03-17
          StockPrediction updated_at -> 2023-08-24
          """
      },
  ),
  EvaluationExample(
      input="What is latest project lokesh worked on?",
      # The answer must agree with the context below to deserve the top score.
      output="The latest project Lokesh worked on is the 'Portifolio' project. It was last updated on March 17, 2025.",
      score=5,
      justification="Every claim in the output is supported by the context: 'Portifolio' has the most recent updated_at value, 2025-03-17.",
      grading_context={
          "context": """
          Portifolio updated_at -> 2025-03-17
          StockPrediction updated_at -> 2023-08-24
          """
      },
  ),
]

faithfulness_metric = faithfulness(model="openai:/gpt-4", examples=faithfulness_examples)
print(faithfulness_metric)

In [38]:
# Ask one question through the full pipeline; the generated text is stored
# under the "answer" key of the result.
res = retriever_chain.invoke({"input":"what are his 2 latest projects"})
print(res['answer'])

[2025-03-17 20:08:40,714: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]
[2025-03-17 20:08:46,880: INFO: _client: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"]
Lokesh's 2 latest projects based on the provided context are:

1. **Portifolio**
   - Project URL: https://github.com/appsbotta/Portifolio
   - Last Updated at: 2025-03-17

2. **AI_Assistant**
   - Project URL: https://github.com/appsbotta/AI_Assistant
   - Last Updated at: 2025-03-17

These are the 2 most recently updated projects in Lokesh's portfolio.


In [29]:
import requests

# Scratch cell: request the repository list with the token from the TOKEN
# environment variable and inspect one entry of the response.
TOKEN = os.getenv("TOKEN")
url = "https://api.github.com/user/repos?per_page=100"

headers = {"Authorization": f"token {TOKEN}"}
# NOTE(review): the response status is not checked before indexing below.
repos = requests.get(url, headers=headers).json()

# for repo in repos:
#     print(repo)
repos[1]

{'id': 945554174,
 'node_id': 'R_kgDOOFwC_g',
 'name': 'AI_Assistant',
 'full_name': 'appsbotta/AI_Assistant',
 'private': False,
 'owner': {'login': 'appsbotta',
  'id': 75985363,
  'node_id': 'MDQ6VXNlcjc1OTg1MzYz',
  'avatar_url': 'https://avatars.githubusercontent.com/u/75985363?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/appsbotta',
  'html_url': 'https://github.com/appsbotta',
  'followers_url': 'https://api.github.com/users/appsbotta/followers',
  'following_url': 'https://api.github.com/users/appsbotta/following{/other_user}',
  'gists_url': 'https://api.github.com/users/appsbotta/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/appsbotta/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/appsbotta/subscriptions',
  'organizations_url': 'https://api.github.com/users/appsbotta/orgs',
  'repos_url': 'https://api.github.com/users/appsbotta/repos',
  'events_url': 'https://api.github.com/users/appsbotta/events{/privac

In [None]:
import requests

# Scratch cell: fetch the language map of one repository and keep the three
# entries with the largest values.
# Read the token here so the cell does not depend on another cell's TOKEN.
TOKEN = os.getenv("TOKEN")
url = 'https://api.github.com/repos/appsbotta/ImageCaption/languages'

headers = {"Authorization": f"token {TOKEN}"}
# Send the auth header built above; it was previously created but never used.
lang = requests.get(url, headers=headers).json()

top_three_keys = sorted(lang, key=lang.get, reverse=True)[:3]
top_three_keys

In [None]:
# Scratch cell: request one repository's README endpoint and show the raw JSON.
# Note that ``readme`` holds the request URL, not the README text, and that
# this request is sent without the Authorization header used elsewhere.
readme = f"https://api.github.com/repos/appsbotta/LangChain/readme"
response = requests.get(readme)
response.json()

# Agents

In [6]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [9]:
# NOTE(review): this relative chdir only succeeds when the working directory
# is the parent of AI_Assistant. The %pwd output just above this cell already
# shows the project root, so the cell depends on the order in which cells were
# run; confirm whether it is still needed.
import os
os.chdir('AI_Assistant')
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

[Document(metadata={'source': 'https://www.linkedin.com/in/lokesh5489/', 'language': 'No language found.'}, page_content='\n\n\n')]

In [None]:
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
# Rebinds ``llm`` (first created in the Bot section with an explicit model
# name). Here no model is named, the API key is passed explicitly and the
# temperature is set to 0.
llm = ChatOpenAI(api_key=os.getenv('OPENAI_API_KEY'),temperature=0)
