In [25]:
import os

In [26]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant\\research'

In [27]:
os.chdir('../')

In [28]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

# Data Ingestion

In [117]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage."""

    # Working directory of the stage; ConfigurationManager passes it to
    # create_directories() before building this object.
    root_dir: Path
    # URL that DataIngestion.get_repos() requests to obtain the repository list.
    github_url: str
    # Directory in which DataIngestion.get_repos() writes 'repo.txt'.
    save_dir: Path

In [118]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [119]:
type(CONFIG_PATH)

pathlib._local.WindowsPath

In [120]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_path=CONFIG_PATH, params_path=PARAMS_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_path)
        self.param = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the ``data_ingestion`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the config object is returned.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            github_url=section.github_url,
            save_dir=section.save_dir,
        )

In [121]:
import os
import requests
from bot import logger
from dotenv import load_dotenv
load_dotenv()
import base64
import markdown2

In [122]:
class DataIngestion:
    """Fetch GitHub repositories and their READMEs and dump them to 'repo.txt'.

    The GitHub token is read from the ``TOKEN`` environment variable.
    """

    # Seconds to wait on each GitHub call; requests waits indefinitely when no
    # timeout is given, which would hang the notebook on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self,config:DataIngestionConfig):
        """Store the ingestion settings (github_url and save_dir are read later)."""
        self.config = config

    def _auth_headers(self):
        """Return the Authorization header built from the TOKEN env variable."""
        # Read the token into a local first: nesting the same quote character
        # inside an f-string is a syntax error before Python 3.12.
        token = os.getenv("TOKEN")
        return {"Authorization": f"token {token}"}

    def _fetch_readme(self, owner, repo_name, headers):
        """Return the README of ``owner/repo_name`` rendered through markdown2.

        Returns the placeholder text "No Readme FIle" when GitHub does not
        answer with HTTP 200 for the README endpoint.
        """
        readme_url = f"https://api.github.com/repos/{owner}/{repo_name}/readme"
        response = requests.get(readme_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return "No Readme FIle"

        payload = response.json()
        # One README with invalid UTF-8 bytes should not abort the whole export,
        # so undecodable bytes are replaced instead of raising.
        markdown_text = base64.b64decode(payload["content"]).decode("utf-8", errors="replace")
        return markdown2.markdown(markdown_text, extras=["strip"])

    def get_repos(self):
        """Write every repository's name, URL and README to ``save_dir/repo.txt``.

        Raises:
            requests.HTTPError: if the repository-list request fails (for
                example a missing or invalid token), instead of failing later
                while iterating over an error payload.
        """
        # The header does not change between calls, so build it once.
        headers = self._auth_headers()

        listing = requests.get(self.config.github_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        listing.raise_for_status()
        repos = listing.json()

        save_dir = self.config.save_dir
        if save_dir:
            # Only root_dir is created by the configuration manager; make sure
            # the output directory exists as well.
            os.makedirs(save_dir, exist_ok=True)

        separator = "<=================================================================================================>"
        # Write as UTF-8 explicitly: the transformation stage reads this file
        # as UTF-8, and the platform default encoding on Windows is not UTF-8.
        with open(os.path.join(save_dir,'repo.txt'),'w', encoding="utf-8") as f:
            logger.info("Starting writing data from git hub repos")
            for repo in repos:
                readme_content = self._fetch_readme(repo['owner']['login'], repo['name'], headers)

                f.write(str(repo["name"]) + " -> " + str(repo["html_url"]) + "\n")
                f.write("Description \n")
                f.write(readme_content)
                f.write("\n\n")
                f.write(separator)
                f.write("\n\n")
            logger.info("Completed writing data from git hub repos")

In [123]:
# Run the data-ingestion stage end to end: load the configuration, build the
# stage object, and export the repositories.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.get_repos()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise the
    # same exception with a bare `raise` so the original traceback is kept.
    logger.exception("Data ingestion stage failed")
    raise

[2025-03-10 10:39:24,412: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-10 10:39:24,417: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-10 10:39:24,420: INFO: common: created directory at: artifacts]
[2025-03-10 10:39:24,423: INFO: common: created directory at: artifacts/data_ingestion]
[2025-03-10 10:39:25,104: INFO: 4043467873: Starting writing data from git hub repos]
[2025-03-10 10:39:36,116: INFO: 4043467873: Completed writing data from git hub repos]


In [None]:
import requests
import os
# Scratch cell: request the repository list directly.
# TOKEN is None when the environment variable is not set, in which case the
# header below becomes the literal string "token None".
TOKEN = os.getenv("TOKEN")
url = "https://api.github.com/user/repos?per_page=100"

headers = {"Authorization": f"token {TOKEN}"}
# No status check here: whatever JSON the server returns is bound to `repos`.
repos = requests.get(url, headers=headers).json()

# Display the first entry as the cell output (assumes `repos` is a non-empty list).
repos[0]

In [None]:
import requests
import base64

# Scratch cell: fetch the README of the first repository.
# Depends on `repos` from the previous cell; `os` is not imported in this cell,
# so it must already be in the kernel namespace.
TOKEN = os.getenv("TOKEN")
OWNER = repos[0]['owner']['login']
# The trailing comment holds a sample repository name.
REPO = repos[0]['name']#"DL-End_to_End"

readme = f"https://api.github.com/repos/{OWNER}/{REPO}/readme"
headers = {"Authorization": f"token {TOKEN}"}

response = requests.get(readme, headers=headers)

# Placeholder that is kept whenever the README request does not return HTTP 200.
readme_content = "No Readme FIle"
if response.status_code == 200:
    data = response.json()
    # The "content" field is base64-decoded, then decoded to text with the
    # default codec of bytes.decode() (UTF-8).
    readme_content = base64.b64decode(data["content"]).decode()
print(readme_content)  # Print or save to a file

No Readme FIle


# Data Transformation

In [14]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant\\research'

In [15]:
import os

In [16]:
os.chdir('../')

In [17]:
%pwd

'c:\\Users\\Appsb\\Desktop\\AI_Assistant'

In [19]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTrasformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Working directory of the stage; ConfigurationManager passes it to
    # create_directories() before building this object.
    root_dir: Path
    # Directory from which DataTransformation reads 'repo.txt'.
    file_dir: Path
    # Directory in which DataTransformation writes 'repo.pdf'.
    save_dir: Path

In [20]:
from bot.constants import *
from bot.utils.common import read_yaml,create_directories

In [None]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_path=CONFIG_PATH, params_path=PARAMS_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTrasformationConfig:
        """Build the DataTrasformationConfig from the ``data_transformation`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the config object is returned.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTrasformationConfig(
            root_dir=section.root_dir,
            file_dir=section.file_dir,
            save_dir=section.save_dir,
        )
    
    

In [26]:
import os
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Preformatted
from bot import logger

In [27]:
class DataTransformation:
    """Turn the ingested 'repo.txt' into a PDF document ('repo.pdf')."""

    def __init__(self,config:DataTrasformationConfig):
        """Store the transformation settings (file_dir and save_dir are read later)."""
        self.config = config

    def get_pdf_from_text(self):
        """Render every line of ``file_dir/repo.txt`` into ``save_dir/repo.pdf``.

        Each stripped line becomes one flowable followed by a small spacer.
        The separator line is emitted as a ``Preformatted`` flowable, every
        other line as a ``Paragraph``.
        """
        separator = "<=================================================================================================>"

        pdf_path = os.path.join(self.config.save_dir,'repo.pdf')
        text_path = os.path.join(Path(self.config.file_dir),'repo.txt')

        pdf_doc = SimpleDocTemplate(pdf_path, pagesize=letter)
        base_styles = getSampleStyleSheet()
        mono_style = ParagraphStyle(
            'Custom',
            parent=base_styles['Normal'],
            fontName='Courier',
            fontSize=10,
            leading=14,  # Line height
            spaceAfter=5,
        )

        flowables = []
        with open(text_path, "r", encoding="utf-8") as source:
            for raw_line in source.readlines():
                text = raw_line.strip()
                # The separator is kept as Preformatted; all other lines go
                # through Paragraph so the text is wrapped automatically.
                flowable_cls = Preformatted if text == separator else Paragraph
                flowables.append(flowable_cls(text, mono_style))
                flowables.append(Spacer(1, 5))  # Preserve spacing

        pdf_doc.build(flowables)
    

# txt_to_pdf('repo.txt', "output.pdf")
    
        

In [28]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage object, and produce the PDF.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.get_pdf_from_text()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise the
    # same exception with a bare `raise` so the original traceback is kept.
    logger.exception("Data transformation stage failed")
    raise

[2025-03-10 11:48:35,745: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-10 11:48:35,751: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-10 11:48:35,755: INFO: common: created directory at: artifacts]
[2025-03-10 11:48:35,758: INFO: common: created directory at: artifacts/data_transformation]
