In [None]:
import os

In [None]:
%pwd

In [None]:
os.chdir('../')

In [None]:
%pwd

# Data Ingestion

In [106]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    github_url: str
    save_dir: Path

In [107]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [108]:
type(CONFIG_PATH)

pathlib._local.WindowsPath

In [109]:
class ConfigurationManager:
    def __init__(
            self,
            config_path = CONFIG_PATH,
            params_path = PARAMS_PATH
        ):
        self.config = read_yaml(config_path)
        self.param = read_yaml(params_path)

        create_directories([self.config.artifacts_root])
    
    def get_data_ingestion_config(self)->DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            github_url=config.github_url,
            save_dir=config.save_dir
        )

        return data_ingestion_config

In [110]:
import os
import requests
from bot import logger
from dotenv import load_dotenv
load_dotenv()
import json

In [111]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):
        self.config = config
    
    def get_repos(self):
        logger.info(f"Starting writing data from git hub repos")
        url = self.config.github_url
        headers = {"Authorization": f"token {os.getenv("TOKEN")}"}
        repos = requests.get(url, headers=headers).json()
        save_dir = self.config.save_dir
        with open(os.path.join(save_dir,"data.json"),'w',encoding='utf-8') as f:
            json.dump(repos,f,indent=4)
        logger.info(f"Completed writing data from git hub repos")

In [112]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.get_repos()
except Exception as e:
    raise e

[2025-03-10 23:22:08,838: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-10 23:22:08,842: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-10 23:22:08,846: INFO: common: created directory at: artifacts]
[2025-03-10 23:22:08,849: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-10 23:22:08,851: INFO: 3120334065: Starting writing data from git hub repos]
[2025-03-10 23:22:09,376: INFO: 3120334065: Completed writing data from git hub repos]


# Data Transformation

In [35]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [None]:
import os

In [None]:
os.chdir('../')

In [None]:
%pwd

In [36]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTrasformationConfig:
    root_dir: Path
    file_dir: Path
    save_dir: Path

In [37]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [38]:
class ConfigurationManager:
    def __init__(
            self,
            config_path = CONFIG_PATH,
            params_path = PARAMS_PATH
        ):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self)->DataTrasformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_trasformation_config = DataTrasformationConfig(
            root_dir=config.root_dir,
            file_dir=config.file_dir,
            save_dir=config.save_dir,
        )

        return data_trasformation_config
    
    

In [None]:
import os
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Preformatted
from bot import logger
import json
import markdown2
import html2text
import base64

In [None]:
class DataTransformation:
    def __init__(self,config:DataTrasformationConfig):
        self.config = config

    def get_data(self):
        file_dir = self.config.file_dir
        with open(os.path.join(file_dir,'data.json'),'r',encoding='utf-8') as f:
            data = json.load(f)
        return data
    
    def get_lang(self,repo_name):
        url = f'https://api.github.com/repos/appsbotta/{repo_name}/languages'
        headers = {"Authorization": f"token {os.getenv("TOKEN")}"}
        lang = requests.get(url,headers=headers).json()
        top_three_keys = sorted(lang, key=lang.get, reverse=True)[:3]
        # logger.info(f"Got top 3 languages used in {repo_name}")
        return top_three_keys
    
    def get_timings(self,repo):
        update = repo['updated_at'][:10]
        created = repo['created_at'][:10]
        pushed = repo['pushed_at'][:10]
        times = {
            "updated_at": update,
            "created_at":created,
            "pushed_at": pushed,
        }
        # logger.info(f"secured 3 timings {times.keys()}")
        return times
    
    def get_readme(self,OWNER,REPO):
        # logger.info(f"requeting readme file for the repo {REPO} ")
        readme = f"https://api.github.com/repos/{OWNER}/{REPO}/readme"
        headers = {"Authorization": f"token {os.getenv("TOKEN")}"}
        response = requests.get(readme,headers=headers)
        readme_content = "No Readme FIle"
        if response.status_code == 200:
            data = response.json()
            readme_content = base64.b64decode(data["content"]).decode("utf-8")
            readme_content =  markdown2.markdown(readme_content, extras=["strip"])
            readme_content = html2text.html2text(readme_content)
        # logger.info(f"Request for README file for the repo {REPO} completed")
        return readme_content
    
    
    def get_pdf_from_data(self):
        logger.info("Saving data to a pdf file started")
        save_dir = self.config.save_dir
        data = self.get_data()

        doc = SimpleDocTemplate(os.path.join(save_dir,'repo.pdf'), pagesize=letter)
        styles = getSampleStyleSheet()
        custom_style = ParagraphStyle(
            'Custom',
            parent=styles['Normal'],
            fontName='Courier',
            fontSize=10,
            leading=14,  # Line height
            spaceAfter=5,
        )
        heading_style = ParagraphStyle(
            "HeadingStyle",
            parent=styles["Heading1"],
            fontName="Courier-Bold",
            fontSize=14,
            leading=22,

        )

        content = []



        for i,repo in enumerate(data):
            text = f"{i+1}. <b>{repo['name']}</b>  --->   {repo['html_url']} \n"
            # text = text + str(i+1) + ". " +  str(repo["name"]) + " -> " + str(repo["html_url"]) +"\n"
            paragraph = Paragraph(text,custom_style)
            content.append(paragraph)
        content.append(Spacer(1,5))
        
        for i,repo in enumerate(data):
            OWNER = repo['owner']['login']
            REPO = repo['name']
            readme = self.get_readme(OWNER,REPO)
            timings = self.get_timings(repo)
            lang = self.get_lang(REPO)
            heading = Paragraph(REPO,heading_style)
            content.append(heading)
            for key,value in timings.items():
                text = f"{key} -> {value}"
                paragraph = Paragraph(text,custom_style)
                content.append(paragraph)
            lang_text = "   "
            for it in lang:
                lang_text = lang_text + it +", "
            text = Paragraph(lang_text,custom_style)
            content.append(text)
            readme = Preformatted(readme,custom_style)
            content.append(readme)
            content.append(Spacer(1, 5))


        doc.build(content)
        logger.info("saving data to a pdf file is completed")
    
    
        

In [105]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.get_pdf_from_data()
except Exception as e:
    raise e

[2025-03-10 23:14:05,366: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-10 23:14:05,372: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-10 23:14:05,379: INFO: common: created directory at: artifacts]
[2025-03-10 23:14:05,387: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-10 23:14:05,389: INFO: 3422072289: Saving data to a pdf file started]
[2025-03-10 23:14:28,716: INFO: 3422072289: saving data to a pdf file is completed]


In [99]:
def get_readme(OWNER,REPO):
        readme = f"https://api.github.com/repos/{OWNER}/{REPO}/readme"
        headers = {"Authorization": f"token {os.getenv("TOKEN")}"}
        response = requests.get(readme,headers=headers)
        readme_content = "No Readme FIle"
        print(response.status_code)
        if response.status_code == 200:
            data = response.json()
            readme_content = base64.b64decode(data["content"]).decode("utf-8")
            readme_content =  markdown2.markdown(readme_content, extras=["strip"])
            readme_content = html2text.html2text(readme_content)
        return readme_content

content = get_readme('appsbotta','DL-End_to_End')
content

200


"# DL-End _to_ End\n\n## Workflow\n\n  1. Update config.yaml in config dir\n  2. Update secrets.yaml [optional]\n  3. Update params.yaml\n  4. Update the entity\n  5. Update the configuration manager in src config\n  6. Update the components\n  7. Update the pipeline\n  8. Update the main.py\n  9. Update the dvc.yaml\n\n# To run the repo\n\n## steps\n\nClone the repository `bash https://github.com/appsbotta/DL-End_to_End `\n\n## step - 01 Create a conda env after opening the repo\n\n`bash conda create -n ckn python=3.13 -y ` `bash conda activate ckn `\n\n## step - 02 Install Requirements.txt\n\n`bash pip install -r requirements.txt `\n\n## step - 03 Run app.py\n\n`bash python app.py `\n\n## DVC cmd\n\n  * dvc init\n  * dvc repro\n  * dvc dag\n\n# AWS CI-CD Deployment using Github Actions\n\n## 1\\. Login to AWS Console\n\n## 2\\. Create IAM user for deployment\n\n```bash\n\n# give these access to IAM user\n\n  1. EC2 access -> this is virtual machine\n  2. ECR: Elastic COntainer regist

In [None]:
        # with open(os.path.join(file_path,'repo.txt'), "r", encoding="utf-8") as f:
        #     lines = f.readlines()
        #     for line in lines:
        #         if line.strip() == "<=================================================================================================>":
        #             paragraph = Preformatted(line.strip(), custom_style)
        #         else:
        #             paragraph = Paragraph(line.strip(), custom_style)     # Wrap text automatically
        #         content.append(paragraph)
        #         content.append(Spacer(1, 5))  # Preserve spacing

import os
import base64
import requests
import markdown2
import html2text
from bot import logger
from dotenv import load_dotenv
load_dotenv()
from bot.entity.config_entity import DataIngestionConfig

class DataIngestion:
    def __init__(self,config:DataIngestionConfig):
        self.config = config
    
    def get_lang(self,repo_name):
        url = f'https://api.github.com/repos/appsbotta/{repo_name}/languages'
        lang = requests.get(url).json()
        top_three_keys = sorted(lang, key=lang.get, reverse=True)[:3]
        return top_three_keys
    
    def get_timings(self,repo):
        update = repo['updated_at'][:10]
        created = repo['created_at'][:10]
        pushed = repo['pushed_at'][:10]
        return [update,created,pushed]
    
    def get_readme(self,OWNER,REPO):
        readme = f"https://api.github.com/repos/{OWNER}/{REPO}/readme"
        response = requests.get(readme)
        readme_content = "No Readme FIle"
        if response.status_code == 200:
            data = response.json()
            readme_content = base64.b64decode(data["content"]).decode("utf-8")
            readme_content =  markdown2.markdown(readme_content, extras=["strip"])
            readme_content = html2text.html2text(readme_content)
        return readme_content

    def get_repos(self):
        url = self.config.github_url
        headers = {"Authorization": f"token {os.getenv("TOKEN")}"}
        repos = requests.get(url, headers=headers).json()
        save_dir = self.config.save_dir
        with open(os.path.join(save_dir,'repo.txt'),'w',encoding='utf-8') as f:
            f.write(str("\n"))
            for i,repo in enumerate(repos):
                OWNER = repo['owner']['login']
                REPO = repo['name']
                timings = self.get_timings(REPO)
                readme_content = self.get_readme(OWNER,REPO)
                f.write( str(i+1) + ". " +  str(repo["name"])+"\n")
                f.write("Description \n")
                f.write(readme_content)
                f.write(str("\n"))
                f.write(str("\n"))
                f.write("<=================================================================================================>")
                f.write(str("\n"))
                f.write(str("\n"))
            logger.info(f"Completed writing data from git hub repos")

# Bot

In [113]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [2]:
import os
os.chdir('../')
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [114]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("artifacts/data_transformation/repo.pdf")
docs = loader.load()
docs

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-03-10T23:20:37+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-03-10T23:20:37+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'artifacts/data_transformation/repo.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='1. AgenticAi ---> https://github.com/appsbotta/AgenticAi\n2. AI_Assistant ---> https://github.com/appsbotta/AI_Assistant\n3. AWS_BEDROCK ---> https://github.com/appsbotta/AWS_BEDROCK\n4. DataSets ---> https://github.com/appsbotta/DataSets\n5. DL-End_to_End ---> https://github.com/appsbotta/DL-End_to_End\n6. FaceDetection_React ---> https://github.com/appsbotta/FaceDetection_React\n7. FitnessTracker ---> https://github.com/appsbotta/FitnessTracker\n8. Gemini ---> https://github.com/appsbotta/Gemini\n9. ImageCaption ---> https://github.com/appsbotta/ImageCaption\n10. JosaaDat

In [115]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,chunk_overlap=100)
documents = text_splitter.split_documents(docs)
documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-03-10T23:20:37+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-03-10T23:20:37+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'artifacts/data_transformation/repo.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='1. AgenticAi ---> https://github.com/appsbotta/AgenticAi\n2. AI_Assistant ---> https://github.com/appsbotta/AI_Assistant\n3. AWS_BEDROCK ---> https://github.com/appsbotta/AWS_BEDROCK\n4. DataSets ---> https://github.com/appsbotta/DataSets\n5. DL-End_to_End ---> https://github.com/appsbotta/DL-End_to_End\n6. FaceDetection_React ---> https://github.com/appsbotta/FaceDetection_React\n7. FitnessTracker ---> https://github.com/appsbotta/FitnessTracker\n8. Gemini ---> https://github.com/appsbotta/Gemini\n9. ImageCaption ---> https://github.com/appsbotta/ImageCaption\n10. JosaaDat

In [116]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
db = FAISS.from_documents(documents=documents,embedding=OpenAIEmbeddings())

[2025-03-10 23:22:56,828: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]


In [117]:
query = "list all projects"
res = db.similarity_search(query)
res

[2025-03-10 23:23:06,951: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]


[Document(id='22a27696-a407-46cb-85c1-b80c22736833', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-03-10T23:20:37+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-03-10T23:20:37+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'artifacts/data_transformation/repo.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='15. Networks ---> https://github.com/appsbotta/Networks\n16. OlX_CLONE ---> https://github.com/appsbotta/OlX_CLONE\n17. Portifolio ---> https://github.com/appsbotta/Portifolio\n18. Q-A_ChatBot_with_Implementation --->\nhttps://github.com/appsbotta/Q-A_ChatBot_with_Implementation\n19. Real-Time-chatting ---> https://github.com/appsbotta/Real-Time-chatting\n20. SentimentAnalysis ---> https://github.com/appsbotta/SentimentAnalysis\n21. SimpleDVC ---> https://github.com/appsbotta/SimpleDVC\n22. StockPrediction ---> https://github.com/a

In [118]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


In [119]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

llm = ChatOpenAI(model='gpt-3.5-turbo')

In [120]:
prompt = ChatPromptTemplate.from_template(
"""
Answer the following question based only on the provided contex.
Think step by step before providing a detailed answer.
I will tip you $999 if the user finds the answer helpful.
<context>
{context}
</context>
question:{input}
""")


In [121]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(
    llm,
    prompt,
)


In [130]:
retriever = db.as_retriever(search_kwargs={"k": 30})
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000199F33C16D0>, search_kwargs={'k': 30})

In [131]:
from langchain.chains import create_retrieval_chain
retriever_chain = create_retrieval_chain(retriever,document_chain)
retriever_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000199F33C16D0>, search_kwargs={'k': 30}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nAnswer the following question based only on the provided contex.\nThink step by step before providing a detailed answer.\n

In [133]:
res = retriever_chain.invoke({"input":"provide all the project that were made in 2025 with created dates"})
print(res['answer'])

[2025-03-10 23:50:50,837: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]
[2025-03-10 23:50:53,294: INFO: _client: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"]
Based on the provided context, the projects that were made in 2025 with their created dates are:

1. AgenticAi - Created on 2025-01-13
2. AI_Assistant - Created on 2025-03-09
3. AWS_BEDROCK - Created on 2025-03-06
4. LangChain - Created on 2025-01-13
5. Portifolio - Created on 2025-01-18
6. Q-A_ChatBot_with_Implementation - Created on 2025-01-07
7. Kidney_diease_end_to_end - Created on 2025-03-05
8. SimpleDVC - Created on 2025-02-21

These are the projects created in 2025 as per the provided context.


In [None]:
import requests

TOKEN = os.getenv("TOKEN")
url = "https://api.github.com/user/repos?per_page=100"

headers = {"Authorization": f"token {TOKEN}"}
repos = requests.get(url, headers=headers).json()

# for repo in repos:
#     print(repo)
repos[1]

In [None]:
import requests

# TOKEN = os.getenv("TOKEN")
url = 'https://api.github.com/repos/appsbotta/ImageCaption/languages'

headers = {"Authorization": f"token {TOKEN}"}
lang = requests.get(url).json()

# for repo in repos:
#     print(repo)
top_three_keys = sorted(lang, key=lang.get, reverse=True)[:3]
top_three_keys

In [None]:
readme = f"https://api.github.com/repos/appsbotta/LangChain/readme"
response = requests.get(readme)
response.json()