# Preliminaries

In [None]:
import json
import logging
import os
import re
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from dotenv import load_dotenv
from rich.logging import RichHandler

from src.github_agent import GithubAgent
from osa_tool.models.models import ModelHandlerFactory
from src.settings import ConfigLoader, GitSettings
from src.deepeval_checker import CustomLLM
from osa_tool.utils import osa_project_root
from struct_to_json import build_tree, tree_to_dict

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# Deliberately no placeholder default: a truthy dummy such as "<YOUR_TOKEN>"
# passes the `if GITHUB_TOKEN:` guard used when request headers are built and
# would be sent to GitHub as a bogus credential. An empty string means
# "send unauthenticated requests".
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")


# logging.basicConfig() does nothing when the root logger already has
# handlers, so remove any that were installed earlier in this session.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="[%X]",
    handlers=[RichHandler()],
)

logger = logging.getLogger("rich")

In [None]:
def parse_github_url(repo_url: str) -> Optional["re.Match[str]"]:
    """Match a GitHub repository URL and capture its owner and repository name.

    Args:
        repo_url: URL beginning with ``https://github.com/<owner>/<repo>``.
            Anything after the repository segment (e.g. ``/tree/main``) is
            ignored, and a trailing ``.git`` is not included in the captured
            repository name.

    Returns:
        The match object, whose ``groups()`` are ``(owner, repo)``, or ``None``
        (after logging an error) when the URL does not have the expected form.
    """
    # The lazy repo group plus the optional ``.git`` keeps names such as
    # ``user.github.io`` intact while stripping only a real ``.git`` suffix.
    pattern = r"https://github\.com/([^/]+)/([^/]+?)(?:\.git)?(?:/|$)"
    match = re.match(pattern, repo_url)
    if match is None:
        logger.error(f"URL {repo_url} does not match expected format.")
        return None
    return match

# Preprocess benchmark

In [None]:

# --- Model that generates READMEs ---------------------------------------
# Alternatives noted during experiments:
#   api:   "vsegpt"
#   url:   "https://api.openai.com/v1"
#   model: "gpt-3.5-turbo", "openai/gpt-3.5-turbo", "google/gemma-3-27b-it",
#          "google/gemini-2.5-flash", "deepseek/deepseek-chat-v3-0324",
#          "anthropic/claude-sonnet-4", "anthropic/claude-3.7-sonnet"
readme_generator_api = "openai"
readme_generator_url = "https://openrouter.ai/api/v1"
readme_generator_model_name = "gpt-4.1"

# --- Model that assesses README quality ---------------------------------
readme_assess_model_name = "gpt-4.1"
readme_assess_api = "openai"
readme_assess_url = "https://openrouter.ai/api/v1"

# --- Benchmark inputs and output location -------------------------------
repo_name_to_url = {}
df = pd.read_csv("repos_upd.csv")
# The directory is named after the model without its provider prefix.
dataset_dir = Path(
    "readme_datasets_unlimited_" + readme_generator_model_name.rsplit("/", 1)[-1]
)
dataset_dir.mkdir(parents=True, exist_ok=True)
git_repo = list()

In [None]:

# Path components whose contents are left out of the repository structure.
STRUCT_STOP_WORDS = ("assets", "results", "sources", "packages", "images", "data")
# Upper bound (seconds) for a single GitHub API call; without a timeout
# `requests` waits indefinitely on a stalled connection.
GITHUB_REQUEST_TIMEOUT = 30

# For every benchmark row: fetch the original README (by cloning the repo) and
# build a filtered JSON description of the repository tree at the given commit.
# Both artefacts are skipped when they already exist in `dataset_dir`.
for i, row in df.iterrows():
    repo_url, repo_name_, commit, _ = row
    repo_name = repo_name_.split('/')[1]
    print(f'{repo_url} {repo_name} {commit}')
    params = parse_github_url(repo_url)
    if not params:
        continue
    user, repo = params.groups()
    full_url = f"https://api.github.com/repos/{user}/{repo}/git/trees/{commit}?recursive=1"
    repository_url = repo_url

    repo_name_to_url[repo_name] = repository_url
    original_readme_path = Path(dataset_dir, f"{repo_name}_original_README.md")
    struct_path = Path(dataset_dir, f"{repo_name}_struct.json")
    try:
        if not original_readme_path.exists():
            # Original README: clone the repository and move its README out.
            github_agent = GithubAgent(repository_url)
            github_agent.clone_repository()
            if Path(repo_name, "readme.md").exists():
                orig_readme = Path(repo_name, "readme.md")
            else:
                orig_readme = Path(repo_name, "README.md")
            shutil.move(orig_readme, original_readme_path)

        # Repository structure JSON, built from the GitHub git-trees API.
        if not struct_path.exists():
            headers = {"Accept": "application/vnd.github.v3+json"}
            if GITHUB_TOKEN:
                headers["Authorization"] = f"token {GITHUB_TOKEN}"
            try:
                response = requests.get(
                    full_url, headers=headers, timeout=GITHUB_REQUEST_TIMEOUT
                )
                if response.status_code == 200:
                    # Parse the payload in memory instead of writing it to
                    # disk, reading it back and deleting it again.
                    tree_payload = response.json()
                    # Wrapping the path in slashes makes the check match a
                    # stop word at any depth, including the first component.
                    paths = [
                        entry["path"]
                        for entry in tree_payload.get("tree", [])
                        if not any(
                            f"/{stop}/" in f"/{entry['path']}/"
                            for stop in STRUCT_STOP_WORDS
                        )
                    ]
                    struct = tree_to_dict(build_tree(paths))
                    with open(struct_path, "w", encoding="utf-8") as struct_file:
                        json.dump(struct, struct_file, indent=4, ensure_ascii=False)
                    logger.info(
                        f"{dataset_dir}/{repo_name}_struct.json saved successfully"
                    )
                else:
                    logger.error(f"[{response.status_code}] Error for {full_url}")
            except Exception as e:
                # Best effort: one failing repository must not stop the run.
                logger.error(f"Request failed for {full_url}: {e}")

    except Exception as e:
        logger.error(
            f"Error {e} occured during readme generation for {repo_name}"
        )


# Evaluation

### Prompts

In [None]:
# Credentials come from the environment (filled by `load_dotenv()` or the
# shell). Hard-coding empty strings here would overwrite a key that is already
# present in the environment with "". Each value falls back to "" when unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Key exposed under the OPENAI_API_KEY variable. Assign OPENROUTER_API_KEY
# instead to use the OpenRouter key.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
from src.settings import ArticleConfigLoader


def load_configuration(
    repo_url: str, api: str, model_name: str, url: str, article: Optional[str]
) -> ConfigLoader:
    """
    Build an osa_tool configuration for one repository and one LLM endpoint.

    Args:
        repo_url (str): URL of the GitHub repository.
        api (str): LLM API service provider.
        model_name (str): Specific LLM model to use.
        url (str): Base URL of the LLM API endpoint.
        article (Optional[str]): Link to the pdf file of the article. When it is
            None the "standart" config directory is used, otherwise "with_article".

    Returns:
        config_loader: The loader whose ``config`` has the git and llm settings
        replaced with the values given here.
    """
    # Choose the loader class together with the matching config sub-directory.
    if article is None:
        loader_cls, profile = ConfigLoader, "standart"
    else:
        loader_cls, profile = ArticleConfigLoader, "with_article"

    config_dir = os.path.join(osa_project_root(), "osa_tool", "config", profile)
    config_loader = loader_cls(config_dir=config_dir)

    llm_overrides = {"api": api, "model": model_name, "url": url}
    config_loader.config.git = GitSettings(repository=repo_url)
    config_loader.config.llm = config_loader.config.llm.model_copy(
        update=llm_overrides
    )
    logger.info("Config successfully updated and loaded")
    return config_loader


In [None]:

# Criteria text handed to the judge model.
prompt = '''
Determine whether the AI-generated Readme file (ACTUAL_OUTPUT)
is better than the original one (EXPECTED_OUTPUT).
ACTUAL_OUTPUT contains two fields: ’readme’, which contains the generated README itself,
and ’repo_structure’ which is json with repository’s structure.
Generated README’s content must be consistent with the provided repository structure.
The ACTUAL_OUTPUT does not necessary have to be the same as EXPECTED_OUTPUT,
Your goal is to determine which text is better, using the provided Evaluations steps.
Readme structure does not matter much as long as it passes the evaluation steps.
'''

# Baseline generator: the standard osa_tool config with the LLM settings
# replaced by the generator model chosen for this run.
standard_config_dir = os.path.join(
    osa_project_root(), "osa_tool", "config", "standart"
)
config_loader = ConfigLoader(config_dir=standard_config_dir)
config_loader.config.llm = config_loader.config.llm.model_copy(
    update={
        "api": readme_generator_api,
        "model": readme_generator_model_name,
        "url": readme_generator_url,
    }
)
baseline_readme_generator = ModelHandlerFactory.build(config_loader.config)

# Judge model and the options forwarded to the metric.
readme_assess_model = CustomLLM(
    readme_assess_api, readme_assess_model_name, readme_assess_url
)
metrics_init_params = dict(
    model=readme_assess_model,
    verbose_mode=False,
    async_mode=False,
)

# G-Eval metric that compares the generated README with the original one.
evaluation_steps = [
    "Step 1: Does the provided structure of the repository address README content?",
    "Step 2: Does the README provide a clear and accurate overview of the repository’s purpose?",
    "Step 3: Are installation and setup instructions included and easy to follow?",
    "Step 4: Are usage examples provided and do they clearly demonstrate functionality?",
    "Step 5: Are dependencies or requirements listed appropriately?",
    "Step 6: Is the README easy to read, well-structured, and free of confusing language?",
]
readme_correctness_metric = GEval(
    name="Readme quality",
    criteria=prompt,
    evaluation_steps=evaluation_steps,
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    **metrics_init_params,
)


In [None]:
# Generator models whose README artefacts are checked below.
MODELS_FOR_GENERATION = [
    "gpt-4.1",
    "anthropic/claude-sonnet-4",
    "google/gemma-3-27b-it",
]


In [None]:
# Record, for every (model, repository) pair, whether the readmeready and
# larch README files already exist on disk.
res = defaultdict(list)

for model in MODELS_FOR_GENERATION:
    # Directory and file names use the model name without its provider prefix.
    model_nm = model.split('/')[1] if '/' in model else model
    dataset_dir = f"readme_datasets_unlimited_{model_nm}"

    for i, row in df.iterrows():
        repo_url, repo_name_, commit, _ = row
        repo_path = repo_name_.split('/')[1]
        print(f'{repo_url} {repo_path} {commit}')
        res['model'].append(model)
        res['repo_path'].append(repo_path)

        output_readmeready = f"{dataset_dir}/{repo_path}__readmeready-{model_nm}.md"
        output_larch = f"{dataset_dir}/{repo_path}__larch-{model_nm}.md"

        # Same check for both tools, readmeready first.
        for tool, output_path in (
            ('readmeready', output_readmeready),
            ('larch', output_larch),
        ):
            exists = os.path.exists(output_path)
            if exists:
                print(f"Skipping.. {repo_path}")
            else:
                print(f"Not done.. {repo_path}")
            res[tool].append(exists)

In [None]:
# One row per (model, repository) with the two existence flags as columns.
r_df = pd.DataFrame(res)
# Cell output: the distinct model identifiers present in the table.
r_df['model'].unique()

In [None]:
# Column-wise sums over the rows that belong to the gemma model.
gemma_rows = r_df['model'] == 'google/gemma-3-27b-it'
r_df.loc[gemma_rows].sum()

In [None]:
# GitHub repository URLs; this cell prints them as a LaTeX list further down
# via generate_latex_list(repo_urls).
repo_urls = [
    "https://github.com/AntonOsika/gpt-engineer",
    "https://github.com/THUDM/ChatGLM-6B",
    "https://github.com/OpenEthan/SMSBoom",
    "https://github.com/lra/mackup",
    "https://github.com/chenfei-wu/TaskMatrix",
    "https://github.com/hacksider/Deep-Live-Cam",
    "https://github.com/google/python-fire",
    "https://github.com/stitionai/devika",
    "https://github.com/Pythagora-io/gpt-pilot",
    "https://github.com/CorentinJ/Real-Time-Voice-Cloning",
    "https://github.com/encode/httpx",
    "https://github.com/lss233/kirara-ai",
    "https://github.com/assafelovic/gpt-researcher",
    "https://github.com/mkdocs/mkdocs",
    "https://github.com/ageitgey/face_recognition",
    "https://github.com/donnemartin/system-design-primer",
    "https://github.com/chatanywhere/GPT_API_free",
    "https://github.com/Asabeneh/30-Days-Of-Python",
    "https://github.com/kaixindelele/ChatPaper",
    "https://github.com/twintproject/twint",
    "https://github.com/pallets/flask",
    "https://github.com/charlax/professional-programming",
    "https://github.com/ethereum/EIPs",
    "https://github.com/xai-org/grok-1",
    "https://github.com/eriklindernoren/PyTorch-GAN",
    "https://github.com/public-apis/public-apis",
    "https://github.com/Z4nzu/hackingtool",
    "https://github.com/gto76/python-cheatsheet",
    "https://github.com/danielgatis/rembg",
    "https://github.com/bregman-arie/devops-exercises",
    "https://github.com/Vision-CAIR/MiniGPT-4",
    "https://github.com/Jack-Cherish/python-spider",
    "https://github.com/openai/chatgpt-retrieval-plugin",
    "https://github.com/black-forest-labs/flux",
    "https://github.com/sb-ai-lab/EmotiEffLib",
    "https://github.com/sb-ai-lab/Ride",
    "https://github.com/hse-cs/delPezzo",
    "https://github.com/deeppavlov/chatsky",
    "https://github.com/deeppavlov/dialog2graph",
    "https://github.com/corl-team/verl-loras",
    "https://github.com/sb-ai-lab/Eco2AI",
    "https://github.com/AIRI-Institute/GENA_LM",
    "https://github.com/AIRI-Institute/eco4cast",
    "https://github.com/Vishnu-tppr/Camouflage-AI",
    "https://github.com/tbhvishal/Python-Weather-Info-App",
    "https://github.com/readytensor/rt-repo-assessment",
    "https://github.com/DadaNanjesha/TagGenerator", # renamed
    "https://github.com/stephenombuya/Code-Contribution-Analyzer",
]


def generate_latex_list(items, ordered=False, nested_level=0):
    """
    Render ``items`` as a LaTeX ``itemize`` (or ``enumerate``) environment.

    Each element is either a value placed after ``\\item`` or a list, which is
    rendered recursively as a nested environment indented one level deeper
    (two spaces per level). The returned text ends with a newline.
    """
    environment = "enumerate" if ordered else "itemize"
    pad = "  " * nested_level

    def render(entry):
        # A list becomes a nested environment; anything else becomes one \item.
        if isinstance(entry, list):
            return generate_latex_list(entry, ordered, nested_level + 1)
        return f"{pad}  \\item {entry}\n"

    body = "".join(render(entry) for entry in items)
    return f"{pad}\\begin{{{environment}}}\n{body}{pad}\\end{{{environment}}}\n"

# Emit the repository URLs as an unordered LaTeX list (ordered defaults to False).
print(generate_latex_list(repo_urls))

In [None]:
# Rows whose readmeready output is missing, summed per (repo_path, model).
missing_readmeready = r_df[~r_df['readmeready']]
missing_readmeready.groupby(['repo_path', 'model']).sum()

# README Generation

In [None]:
# Generator models for which READMEs are produced in this section.
MODELS_FOR_GENERATION = [
    "gpt-4.1",
    "anthropic/claude-sonnet-4",
    "google/gemma-3-27b-it",
]


In [None]:
from src.readme_generator import ReadmeGenerator
from huggingface_hub import login
import asyncio

results = defaultdict(list)

# Publish the API credentials and the OpenRouter base URL through environment
# variables. OPENAI_API_KEY may be pointed at OPENROUTER_API_KEY instead.
os.environ.update(
    {
        "OPENAI_API_KEY": OPENAI_API_KEY,
        "HF_TOKEN": HF_TOKEN,
        "OPENAI_API_BASE": 'https://openrouter.ai/api/v1',
    }
)

# Authenticate with the Hugging Face Hub without storing the token as a git credential.
login(token=HF_TOKEN, add_to_git_credential=False)
# model = 'gpt-4.1'

async def _generate_with_tool(
    tool, model, output_path, repo_path, timeout=600.0, **generator_kwargs
):
    """Produce one README with ``tool`` unless ``output_path`` already exists.

    Args:
        tool: Generator name passed to ``ReadmeGenerator`` ("readmeready" or "larch").
        model: Model identifier passed to ``ReadmeGenerator``.
        output_path: File the generated README is written to.
        repo_path: Repository name; passed to the generator and used in messages.
        timeout: Seconds allowed for an awaitable ``generate()`` result.
        **generator_kwargs: Extra keyword arguments for ``ReadmeGenerator``
            (e.g. ``repo_url``).

    Returns:
        True when the output already existed or generation finished without an
        exception, False when generation failed (the error is printed).
    """
    import inspect

    if os.path.exists(output_path):
        print(f"Skipping.. {repo_path}")
        return True
    print(f"Not done.. {repo_path}")

    try:
        generator = ReadmeGenerator(
            tool=tool,
            model=model,
            output_path=output_path,
            repo_path=repo_path,
            api_key=OPENAI_API_KEY,
            **generator_kwargs,
        )
        outcome = generator.generate()
        # generate() is awaited for readmeready, so a bare call would only
        # create a coroutine that never runs. Await whenever the result is
        # awaitable; a synchronous implementation has already finished here.
        if inspect.isawaitable(outcome):
            await asyncio.wait_for(outcome, timeout)
    except Exception as e:
        # Best effort: report the failure and carry on with the next job.
        print(f"{tool} failed for {repo_path} ({model}): {e}")
        return False
    return True


async def generate_readmes():
    """Generate readmeready and larch READMEs for every model and benchmark repo.

    Iterates over ``MODELS_FOR_GENERATION`` and the rows of ``df``. Outputs that
    already exist are skipped. The two tools are independent: an existing or
    failed readmeready output no longer prevents the larch attempt for the same
    repository. A summary of completed outputs is printed at the end.
    """
    rr_done = 0
    l_done = 0
    for model in MODELS_FOR_GENERATION:
        # Directory and file names use the model name without its provider prefix.
        model_nm = model.split('/')[1] if '/' in model else model
        dataset_dir = f"readme_datasets_unlimited_{model_nm}"

        for i, row in df.iterrows():
            repo_url, repo_name_, commit, _ = row
            repo_path = repo_name_.split('/')[1]
            print(f'{repo_url} {repo_path} {commit}')

            output_readmeready = f"{dataset_dir}/{repo_path}__readmeready-{model_nm}.md"
            output_larch = f"{dataset_dir}/{repo_path}__larch-{model_nm}.md"

            if await _generate_with_tool(
                "readmeready",
                model,
                output_readmeready,
                repo_path,
                repo_url=repo_url,
            ):
                rr_done += 1

            # LARCH
            if await _generate_with_tool("larch", model, output_larch, repo_path):
                l_done += 1

    print(f"readmeready done: {rr_done}; larch done: {l_done}")



# Top-level await: needs an environment that supports it (e.g. a notebook
# cell); in a plain script this would have to go through asyncio.run().
await generate_readmes()

### Assessment

In [None]:
from datetime import datetime

In [None]:
# Previously saved evaluation rows.
# NOTE(review): `model_nm` is whatever value an earlier cell's loop left
# behind; confirm it names the checkpoint file that is meant to be resumed.
r_in = pd.read_csv(f'final_eval_res_2025-11-22_{model_nm}.csv')
# The checkpoint is written with DataFrame.to_csv's default index=True, so each
# save/load round trip adds another "Unnamed: N" column. Drop those so they do
# not pile up in the merged results.
r_in = r_in.loc[:, ~r_in.columns.str.startswith('Unnamed')]

# (model, proj, repo) triples that already have a score. A set, because the
# assessment loop only tests membership.
done = set(r_in[['model', 'proj', 'repo']].itertuples(index=False, name=None))

In [None]:
# Readme assess
from datetime import datetime
from collections import defaultdict
dt = datetime.now()

final = None
results = defaultdict(list)
# File-name suffix that marks the human-written README of a repository.
ORIGINAL_SUFFIX = "_original_README.md"

# Score every generated README against the original one, model by model.
# Triples already present in `done` are skipped so the run can be resumed.
for model_nm in ['gpt-4.1', 'gemma-3-27b-it', 'claude-sonnet-4']:
    dataset_dir = Path(f"readme_datasets_unlimited_{model_nm}")
    md_count = len(list(dataset_dir.glob("*.md")))
    print(f'{model_nm} len: {md_count}')

    # Only the original READMEs drive the loop; generated files are looked up
    # by name. Sorting makes the processing order reproducible.
    for repo_readme in sorted(dataset_dir.glob(f"*{ORIGINAL_SUFFIX}")):
        # Derive the repo name from the file name, not from a "/"-split path,
        # so it does not depend on the platform's path separator.
        repo_name = repo_readme.name[: -len(ORIGINAL_SUFFIX)]
        print(repo_name)

        proj = 'readmeready'

        if (model_nm, proj, repo_name) in done:
            print(f'{model_nm} {proj} {repo_name} Already done, skipping')
            continue
        print(f'{model_nm} {proj} {repo_name} processing..')

        # Read all three inputs up front. read_text closes each file, and a
        # missing or unreadable file skips this repository instead of aborting
        # the whole run.
        try:
            original_readme = repo_readme.read_text(encoding="utf8")
            repo_struct = (dataset_dir / f"{repo_name}_struct.json").read_text(
                encoding="utf8"
            )
            generated_readme = (
                dataset_dir / f'{repo_name}__{proj}-{model_nm}.md'
            ).read_text(encoding="utf8")
        except (OSError, UnicodeDecodeError) as e:
            print(e)
            continue

        # Named `payload` so the module-level `prompt` (the metric criteria
        # text) is not overwritten.
        payload = {"readme": generated_readme, "repo_structure": repo_struct}

        print(f"\n\n{'=' * 12} {proj.upper()} METRICS{'=' * 12}")
        try:
            test_case = LLMTestCase(
                input=(
                    "Evaluate the AI-generated README file by comparing it to the human-written one"
                ),
                actual_output=json.dumps(payload),
                expected_output=original_readme,
            )
            score = readme_correctness_metric.measure(test_case)
        except Exception as e:
            # Record nothing for a failed evaluation: appending here would
            # store the previous repository's score (or raise NameError on the
            # first iteration).
            print(e)
            continue
        print(score)

        results["model"].append(model_nm)
        results["proj"].append(proj)
        results["score"].append(score)
        results["repo"].append(repo_name)

    # Checkpoint after each model; `results` accumulates across models, so each
    # file contains every row scored so far in this session.
    eval_res = pd.DataFrame.from_dict(results)
    final = eval_res
    eval_res.to_csv(f'eval_res_2025-11-22_{model_nm}_{dt}.csv')


In [None]:
# Stack the rows loaded from the checkpoint (r_in) on top of the rows scored in
# this session (final).
final_ = pd.concat([r_in, final])
# NOTE(review): `model_nm` is the value left by the last loop that ran before
# this cell - confirm it names the intended checkpoint file.
final_.to_csv(f'final_eval_res_2025-11-22_{model_nm}.csv')
# Bare expression: shows the merged frame as the cell output.
final_

In [None]:
# Short alias for the merged results, used by the analysis cells below.
r = final_

In [None]:
# eval_res = pd.DataFrame.from_dict(results)
# eval_res.to_csv(f'eval_res_2025-11-22_{readme_generator_model_name}.csv')

In [None]:
# Repository names labelled "Common" in the analysis; every other repository
# is labelled "Scientific".
COMMON_REPOS = [
    "labelme",
    "gpt-engineer",
    "ChatGLM-6B",
    "SMSBoom",
    "mackup",
    "TaskMatrix",
    "Deep-Live-Cam",
    "python-fire",
    "devika",
    "gpt-pilot",
    "Real-Time-Voice-Cloning",
    "httpx",
    "kirara-ai",
    "gpt-researcher",
    "mkdocs",
    "face_recognition",
    "system-design-primer",
    "GPT_API_free",
    "30-Days-Of-Python",
    "ChatPaper",
    "twint",
    "flask",
    "professional-programming",
    "EIPs",
    "grok-1",
    "PyTorch-GAN",
    "public-apis",
    "hackingtool",
    "python-cheatsheet",
    "rembg",
    "devops-exercises",
    "MiniGPT-4",
    "python-spider",
    "chatgpt-retrieval-plugin",
    "flux",
]

In [None]:
# Label each row by whether its repository is in COMMON_REPOS.
is_common = r['repo'].isin(COMMON_REPOS)
r['repo_type'] = is_common.map({True: 'Common', False: 'Scientific'})

In [None]:
# Mean score and number of rows per (model, proj, repo_type) group, with the
# result columns renamed to mean_score / count_repos.
# NOTE(review): 'proj' is both a grouping key and an aggregated column -
# confirm the installed pandas version accepts this.
r.groupby(['model', 'proj', 'repo_type']).agg(
    {
        'score': 'mean',
        'proj': 'count',
    }
).rename({
    'score': 'mean_score', 'proj': 'count_repos'
}, axis='columns')

In [None]:
# Non-null value counts per repository, ordered by the 'score' count ascending
# so repositories with the fewest scores come first.
r.groupby('repo').count().sort_values(by='score')