# Practical - Open Source Article Validation

In this notebook, we want to determine whether an article from a popular journal adheres to software engineering best practices. We will loosely apply some of the criteria around the *(TODO: this sentence is cut off — name the criteria being applied)*.

In [1]:
import os

from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.github import GithubRepositoryReader, GithubClient

In [4]:
import nest_asyncio

# Patch asyncio before any other cell runs.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the GitHub reader drives asyncio internally — confirm.
nest_asyncio.apply()

In [51]:
import os

import dotenv

# Load variables from a local .env file (if one exists) into the process
# environment, then read the GitHub credential from it. The token is None
# when GITHUB_TOKEN is not set.
dotenv.load_dotenv()
github_token = os.getenv("GITHUB_TOKEN")

# Coordinates of the repository under review.
owner, repo, branch = "rkdan", "small_world_propensity", "main"

In [52]:
# Client used by the repository reader for all GitHub API calls; verbose=True
# is passed so the client reports what it is doing.
# (The OpenAIEmbedding import that used to sit in this cell now lives in the
# import cell at the top of the notebook, alongside the other imports.)
github_client = GithubClient(github_token=github_token, verbose=True)

In [87]:
# File extensions to leave out when loading the repository.
# NOTE(review): presumably lock files and matrix data that carry no useful
# text for the index — confirm.
excluded_extensions = [".lock", ".mat", ".mtx"]

repo_reader = GithubRepositoryReader(
    github_client=github_client,
    owner=owner,
    repo=repo,
    use_parser=True,
    verbose=False,
    filter_file_extensions=(
        excluded_extensions,
        GithubRepositoryReader.FilterType.EXCLUDE,
    ),
)

# Fetch every remaining file on the configured branch as a Document.
documents = repo_reader.load_data(branch=branch)

In [116]:
# Echo the loaded documents for inspection.
# NOTE(review): this prints the full repr of every Document, file text
# included, which makes for a very large output — consider showing only
# len(documents) or the 'file_path' metadata instead.
documents

[Document(id_='34ae710a53440d7a200be4cc65b252d73afa54e9', embedding=None, metadata={'file_path': '.github/scripts/update_version_and_changelog.sh', 'file_name': 'update_version_and_changelog.sh', 'url': 'https://github.com/rkdan/small_world_propensity/blob/main/.github/scripts/update_version_and_changelog.sh'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='#!/bin/bash\n\n# Increment version in pyproject.toml and __init__.py\n# This is a simplistic approach; you may need a more sophisticated versioning logic.\npoetry version patch\n\n# Extract the new version number\nNEW_VERSION=$(poetry version -s)\n\n# Update __init__.py with the new version\nsed -i "s/__version__ = .*/__version__ = \'$NEW_VERSION\'/" small_world_propensity/__init__.py\n\n# Update CHANGELOG.md\n# This is a placeholder. Consider using a tool or script to generate meaningful changelog entries.\necho -e "## $NEW_VERSION\\n* Your changes here\\n\\n$(cat CHANGELOG.md)" > CHANGELOG.

In [89]:
# Embedding model used to vectorise the repository documents; the key is read
# from the environment and is None when OPENAI_API_KEY is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")
embedding = OpenAIEmbedding(model="text-embedding-3-small", api_key=openai_api_key)

# Build the vector index over all loaded documents with that embedding model.
index = VectorStoreIndex.from_documents(documents, embed_model=embedding)

In [111]:
# Retrieve the 20 most similar nodes for the question, without running them
# through an LLM, so the raw retrieval result can be inspected directly.
# (A stray, incomplete `def` that made this cell a SyntaxError was removed.)
retriever = index.as_retriever(similarity_top_k=20)
retrieved_nodes = retriever.retrieve("Does this code contain any tests?")

In [115]:
# Count how many nodes the retriever returned.
len(retrieved_nodes)

20

In [105]:
# Render the current `response` in bold.
# NOTE(review): `response` is not assigned in any cell above this one (its
# first visible assignment is in a later cell), so this raises NameError on a
# fresh top-to-bottom run. The query that produced the output shown below is
# not in the notebook — restore it or drop this cell.
display(Markdown(f"<b>{response}</b>"))

<b>The provided context information describes a GitHub repository named "small_world_propensity" which contains code related to calculating the small-world propensity of a weighted, undirected network. The repository includes files such as Python scripts, MATLAB scripts, and configuration files for tools like Poetry. It also contains information on how to install the package, generate regular and random matrices, compare network properties, and cite the work.</b>

In [108]:
# Ask the index whether the repository ships tests, using the 20 most similar
# nodes as context, and render the answer in bold.
tests_question = "Does this code contain any tests?"

query_engine = index.as_query_engine(similarity_top_k=20)
response = query_engine.query(tests_question)

display(Markdown(f"<b>{response}</b>"))

<b>No</b>

In [113]:
# Ask the index about the repository's licence, this time using the 10 most
# similar nodes as context, and render the answer in bold.
licence_question = "Does this repo have a licence? If so, which kind?"

query_engine = index.as_query_engine(similarity_top_k=10)
response = query_engine.query(licence_question)

display(Markdown(f"<b>{response}</b>"))

<b>This repo has a license. The license used in this repository is the GNU Affero General Public License Version 3.</b>