# Best-of Update Utility <a class="tocSkip">

**In this notebook:**

- Collect and update the projects in the best-of list

_Please use a Python 3 kernel for the notebook_

## Dependencies

### Install Dependencies

In [None]:
!pip install --upgrade best-of

In [None]:
# Clone all best-of lists here

### Import Dependencies

In [None]:
%load_ext autoreload
%autoreload 2

# System libraries
import logging, os, sys
import re, shutil
import subprocess
import time
import collections
import yaml
import glob, os
import json
from tqdm import tqdm
from collections import Counter, OrderedDict
from datetime import datetime
from typing import Tuple, Optional, List

# Enable logging
# Records are written to stdout (instead of the default stderr) at INFO level and
# formatted as "[LEVEL] message".
logging.basicConfig(
    format="[%(levelname)s] %(message)s", level=logging.INFO, stream=sys.stdout
)

# Handle to the root logger.
log = logging.getLogger()

import numpy as np
import pandas as pd
import yaml
from addict import Dict
import qgrid

from best_of import utils, yaml_generation
from best_of.integrations import libio_integration, github_integration
from best_of.generator import parse_projects_yaml

# CHANGE: Replace the placeholders with your API keys, or export the variables before
# starting the kernel and leave the placeholders untouched.
API_KEY_SETTINGS = {
    "LIBRARIES_API_KEY": "<LIBRARIES_API_KEY>",
    "GITHUB_API_KEY": "<GITHUB_API_KEY>",
}

for env_name, key_value in API_KEY_SETTINGS.items():
    if key_value == f"<{env_name}>":
        # Placeholder was not edited: keep a key that is already exported instead of
        # overwriting it with the placeholder text.
        os.environ.setdefault(env_name, key_value)
    else:
        # An explicitly entered key always wins.
        os.environ[env_name] = key_value

## Load Projects

In [None]:
# Accumulator for the optional extraction cells below: each of them passes the current
# list as `existing_projects` and rebinds `extracted_projects` to the returned result.
extracted_projects = []

### Load best-of history files for exclusion

Select all history files from which the projects will be excluded for extraction below.

In [None]:
# CHANGE: Add all relevant history files here:
BEST_OF_LISTS = []

# Merge all history files into one mapping keyed by the simplified github_id.
# The first occurrence of an id wins; rows without a github_id are ignored.
github_projects = {}
for project_csv_path in BEST_OF_LISTS:
    df_projects = pd.read_csv(project_csv_path, index_col=0, keep_default_na=False)
    for _, row in df_projects.iterrows():
        if "github_id" not in row or not row["github_id"]:
            continue
        github_projects.setdefault(utils.simplify_str(row["github_id"]), row.to_dict())

df_existing_projects = pd.DataFrame(github_projects.values())

# IDs of already known projects; stays empty when nothing was loaded.
excluded_github_ids = (
    df_existing_projects["github_id"].to_list()
    if "github_id" in df_existing_projects
    else []
)

In [None]:
# Show loaded projects
# reindex() is used instead of plain column selection: with no history files configured
# (or a history file lacking one of these columns) `df[[...]]` raises a KeyError.
# Missing columns are shown as empty columns instead.
qgrid.show_grid(
    df_existing_projects.reindex(
        columns=["github_id", "github_url", "projectrank", "license", "star_count"]
    )
)

### Optional: Load Projects from Github Org

In [None]:
# CHANGE: Map one or more Github Orgs to a group ID
# Keys are group IDs, values are lists of Github organization names. An empty mapping
# makes the org-related loops below no-ops.
GITHUB_ORG_MAPPING = {
    # "group_id": ["github_org_1", "github_org_2"]
}

#### Find duplicated usage of github orgs

In [None]:
# Report every organization that is listed more than once (compared case-insensitively).
UNIQUE_ORGS = set()
for group, organizations in GITHUB_ORG_MAPPING.items():
    for org in (org_name.lower() for org_name in organizations):
        if org not in UNIQUE_ORGS:
            UNIQUE_ORGS.add(org)
            continue
        print(f"Org {org} is duplicated with {group}")

#### Collect projects from orgs

In [None]:
for group, organizations in GITHUB_ORG_MAPPING.items():
    for org in organizations:
        # Fetch the organization's projects (min_stars=15 is presumably a lower bound
        # on the star count; confirm in best_of.yaml_generation).
        projects = yaml_generation.get_projects_from_org(org, min_stars=15)

        # Feed them into the running result, tagged with the lower-cased group ID.
        extracted_projects = yaml_generation.collect_github_projects(
            projects,
            excluded_github_ids,
            group=group.lower(),
            existing_projects=extracted_projects,
        )

#### Show extracted projects

In [None]:
# reindex() tolerates an empty result and missing columns (e.g. no `group_id`),
# where plain `df[[...]]` selection would raise a KeyError.
qgrid.show_grid(
    pd.DataFrame(extracted_projects).reindex(
        columns=["github_id", "projectrank", "license", "star_count", "group_id"]
    )
)

### Optional: Extract projects via Github URL matching

In [None]:
# CHANGE: Paste the raw text to scan between the triple quotes. It is passed to the
# Github project extraction in the next cell.
TEXT_DATA = """
"""

In [None]:
# CHANGE: Select a file via URL, file-path, or provide the string content here:
EXTRACT_FROM_DATA = [
    TEXT_DATA,
    # add additional URLS or files
]

# Run github link extraction on the provided file
# `extracted_projects` is rebound to the returned list; the current list is handed in
# as `existing_projects` (presumably so earlier results are kept; verify in best_of).
extracted_projects = yaml_generation.extract_github_projects(
    EXTRACT_FROM_DATA,
    excluded_github_ids=excluded_github_ids,
    existing_projects=extracted_projects,
)

In [None]:
# Show extracted projects
# reindex() tolerates an empty result and missing columns, where plain `df[[...]]`
# selection would raise a KeyError.
qgrid.show_grid(
    pd.DataFrame(extracted_projects).reindex(
        columns=["github_id", "projectrank", "license", "star_count"]
    )
)

### Optional: Extract pypi projects via URL matching 

In [None]:
# CHANGE: Paste the raw text to scan between the triple quotes. It is passed to the
# pypi project extraction in the next cell.
TEXT_DATA = """

"""

In [None]:
# LIBRARIES_API_KEY should be set!
# os.environ["LIBRARIES_API_KEY"] = "<LIBRARIES-TOKEN>"

# CHANGE: Select a file via URL, file-path, or provide the string content here:
EXTRACT_FROM_DATA = [
    TEXT_DATA,
    # add additional URLS or files
]

# Extract projects
# NOTE(review): `excluded_github_ids` is passed positionally here, while the Github
# extraction cell passes it by keyword. Confirm the parameter order of
# yaml_generation.extract_pypi_projects.
extracted_projects = yaml_generation.extract_pypi_projects(
    EXTRACT_FROM_DATA,
    excluded_github_ids,
    existing_projects=extracted_projects,
)

In [None]:
# Show extracted projects
# reindex() tolerates an empty result and missing columns (e.g. no `pypi_id`),
# where plain `df[[...]]` selection would raise a KeyError.
qgrid.show_grid(
    pd.DataFrame(extracted_projects).reindex(
        columns=["github_id", "pypi_id", "projectrank", "license", "star_count"]
    )
)

### Optional: Extract projects from requirements.txt

In [None]:
# CHANGE: Paste the requirements.txt content between the triple quotes. It is passed to
# the requirements-based extraction in the next cell.
TEXT_DATA = """
"""

In [None]:
# LIBRARIES_API_KEY should be set!
# os.environ["LIBRARIES_API_KEY"] = "<LIBRARIES-TOKEN>"

# CHANGE: Select a file via URL, file-path, or provide the string content here:
EXTRACT_FROM_DATA = [
    TEXT_DATA,
    # add additional URLS or files
]

# pypi IDs of the already known projects. They are taken from the combined
# `df_existing_projects` frame: `df_projects` is only the loop variable of the
# history-loading cell, i.e. undefined when BEST_OF_LISTS is empty and otherwise
# just the last file that was read.
excluded_pypi_ids = []
if "pypi_id" in df_existing_projects:
    excluded_pypi_ids = [
        pypi_id
        for pypi_id in df_existing_projects["pypi_id"].to_list()
        # Skip empty strings and NaN placeholders of projects without a pypi package.
        if isinstance(pypi_id, str) and pypi_id
    ]

# Extract projects
extracted_projects = yaml_generation.extract_pypi_projects_from_requirements(
    EXTRACT_FROM_DATA,
    excluded_pypi_ids,
    existing_projects=extracted_projects,
)

In [None]:
# Show extracted projects
# reindex() tolerates an empty result and missing columns (e.g. no `pypi_id`),
# where plain `df[[...]]` selection would raise a KeyError.
qgrid.show_grid(
    pd.DataFrame(extracted_projects).reindex(
        columns=["github_id", "pypi_id", "projectrank", "license", "star_count"]
    )
)

## Update project metadata

In [None]:
# Starting point for the metadata update. This is the same list object as
# `extracted_projects` (no copy is made).
updated_projects = extracted_projects

### Optional: Check & remove outdated packages (TODO)

In [None]:
# TODO

### Optional: Auto discover project packages via libraries.io

In [None]:
# Rebinds `updated_projects` to the list returned by best_of.
# NOTE(review): selected_package_manager=None presumably means "no restriction to a
# single package manager". Confirm in best_of.yaml_generation.
updated_projects = yaml_generation.auto_extend_via_libio(
    updated_projects, selected_package_manager=None
)

### Optional: Auto discover project packages via name match

In [None]:
# CHANGE: Activate the package managers that are relevant for the given best-of-list
# As committed, only npm is switched on; pypi and conda are switched off.
updated_projects = yaml_generation.auto_extend_package_manager(
    updated_projects, pypi=False, conda=False, npm=True
)

## Finalize projects.yaml

### Sort out and prepare projects

In [None]:
# CHANGE values
# Both thresholds are applied with a strict ">" comparison in the next cell, so a project
# with exactly MIN_STAR_COUNT stars or an `updated_at` equal to MIN_UPDATE_DATE is dropped.
MIN_STAR_COUNT = 30
MIN_UPDATE_DATE = "2020-01-01"

In [None]:
# Turn the collected project records into a DataFrame
df_updated_projects = pd.DataFrame(updated_projects)

# Keep only projects updated after the selected date ...
recently_updated = df_updated_projects["updated_at"] > MIN_UPDATE_DATE
df_updated_projects = df_updated_projects.loc[recently_updated]

# ... and, of those, only the ones with more stars than the threshold
enough_stars = df_updated_projects["star_count"] > MIN_STAR_COUNT
df_updated_projects = df_updated_projects.loc[enough_stars]

### Export selected columns

In [None]:
# Select columns to show
selected_column = [
    "name",
    "description",
    "github_id",
    "projectrank",
    "license",
    "star_count",
    "monthly_downloads",
    "updated_at",
    "group_id"
]

# Add all available package managers
from best_of.integrations import AVAILABLE_PACKAGE_MANAGER

# Names of all "<package manager>_id" columns that best_of knows about
package_id_columns = {
    package_manager.name + "_id" for package_manager in AVAILABLE_PACKAGE_MANAGER
}

# Keep the DataFrame's own column order. A plain set intersection yields an arbitrary
# order that can differ between kernel sessions (string hashing is randomized), which
# made the grid and export column order unstable.
package_columns = [
    column for column in df_updated_projects.columns if column in package_id_columns
]

selected_column.extend(package_columns)

# Make sure the optional columns exist so that the column selection below does not fail
for optional_column in ("group_id", "monthly_downloads"):
    if optional_column not in df_updated_projects:
        df_updated_projects[optional_column] = None

### Optional: Fix duplicated names

In [None]:
# Lower-cased names that were already seen
unique_names = set()
projects = df_updated_projects.to_dict("records")
for project in projects:
    name = project["name"]
    github_id = project.get("github_id")

    if name.lower() not in unique_names:
        unique_names.add(name.lower())
        continue

    if not isinstance(github_id, str) or not github_id:
        # Records built from a DataFrame carry NaN/None for missing values; replacing
        # the name with such a value would destroy it, so the duplicate is kept.
        print(f"Duplicated name {name} -> no github_id available, keeping the name.")
        continue

    print(f"Duplicated name {name} -> replacing with {github_id}.")
    project["name"] = github_id

df_updated_projects = pd.DataFrame(projects)

### Show and manually filter data

In [None]:
# Show updated projects
# The widget handle is kept because the next cell reads the (possibly edited or filtered)
# rows back via get_changed_df().
qgrid_updated_projects = qgrid.show_grid(
    df_updated_projects[selected_column], show_toolbar=True
)
qgrid_updated_projects

### Set categories for selected projects

In [None]:
# Read the current state of the table back from the qgrid widget
df_selected_projects = qgrid_updated_projects.get_changed_df()

# Swap null/NaN cells for None
df_selected_projects = df_selected_projects.where(
    df_selected_projects.notnull(), None
)

# Empty columns that are filled in manually in the next grid
for annotation_column in ("category", "labels", "note"):
    df_selected_projects[annotation_column] = None

len(df_selected_projects)

In [None]:
# Columns offered in the grid: fixed metadata columns followed by the package ID columns
selected_columns = [
    "name",
    "description",
    "github_id",
    "category",
    "labels",
    "note",
    "star_count",
    "projectrank",
    "group_id",
] + list(package_columns)

# List the categories used by the already existing projects as a reference
if "category" in df_existing_projects:
    print(df_existing_projects["category"].unique())

# Editable grid; the handle is read back in the export cell
qgrid_finalized_projects = qgrid.show_grid(
    df_selected_projects[selected_columns], show_toolbar=True
)
qgrid_finalized_projects

### Export selected projects as YAML

Export all selected projects (chosen interactively in the qgrid table above) to YAML. The output can easily be added to a best-of `projects.yaml`.

In [None]:
# Get changed df from qgrid widget
df_selected_projects = qgrid_finalized_projects.get_changed_df()

# Filter all without category
# TODO? df_selected_projects = df_selected_projects[df_selected_projects["category"] == None]

# projectrank is not part of the exported YAML
if 'projectrank' in df_selected_projects:
    df_selected_projects = df_selected_projects.drop(['projectrank'], axis=1)

# Replace empty value with nan
df_selected_projects.replace("", np.nan, inplace=True)
# Remove all null and nan values
df_selected_projects = df_selected_projects.where(
    pd.notnull(df_selected_projects), None
)

# NOTE(review): AVAILABLE_PACKAGE_MANAGER is imported here but not used in this cell.
from best_of.integrations import AVAILABLE_PACKAGE_MANAGER

# `package_columns` comes from the "Export selected columns" cell above.
export_columns = ["name", "github_id", "category", "labels", "note", "group_id"]
export_columns.extend(package_columns)

selected_projects = df_selected_projects[export_columns].to_dict("records")

cleaned_projects = []
for project in selected_projects:
    if "labels" in project and project["labels"]:
        labels = project["labels"]
        # A comma-separated string typed into the grid is split into single labels
        if isinstance(project["labels"], str):
            labels = project["labels"].split(",")
            
        # The normalized label list is stored as its string representation; the
        # replace() chain applied to the YAML dump below relies on this.
        project["labels"] = str(
            [label.strip().lower() for label in labels]
        )

    # Drop all properties without a value
    cleaned_projects.append({k: v for k, v in project.items() if v is not None})

# Sort by category
# Projects without a category use the sort key "zzz".
cleaned_projects = sorted(
    cleaned_projects, key=lambda k: str(k["category"]) if "category" in k else "zzz"
)

# Print groups (only if a Github org mapping was configured)
# NOTE(review): `create_group_projects` is neither defined nor imported anywhere in the
# visible notebook, so this branch raises a NameError as soon as GITHUB_ORG_MAPPING is
# non-empty. Confirm where this helper is supposed to come from.
if GITHUB_ORG_MAPPING:
    print("# Groups")
    print(
        yaml.dump(create_group_projects(GITHUB_ORG_MAPPING, cleaned_projects), default_flow_style=False, sort_keys=False)
        .replace("'[", "[")
        .replace("]'", "]")
        .replace("''", '"')
    )

# To yaml format
# The replace() chain presumably turns the quoted, stringified label lists in the dump
# back into inline lists such as ["a", "b"]. It is applied to the whole dump, so any
# other text containing these character sequences is rewritten as well.
print("# Projects")
print(
    yaml.dump(cleaned_projects, default_flow_style=False, sort_keys=False)
    .replace("'[", "[")
    .replace("]'", "]")
    .replace("''", '"')
)

# Update projects.yaml

Update the projects of an existing projects.yaml without adding new projects.

## Load projects.yaml

In [None]:
# CHANGE: Add path to projects.yaml file
BEST_OF_LIST_PROJECTS_PATH = "../projects.yaml"

from best_of import generator

# The call result is unpacked as a 4-tuple; only its second element (the projects) is
# used in this notebook, the other three are discarded.
_, existing_projects, _, _ = generator.parse_projects_yaml(BEST_OF_LIST_PROJECTS_PATH)

# Show loaded projects
qgrid.show_grid(
    pd.DataFrame(existing_projects)
)

## Optional: Update Github IDs

In [None]:
def update_github_ids(projects: list) -> list:
    """Replace outdated `github_id` values of the given projects in place.

    For every project that has a `github_id` key, an addict `Dict` built from the
    project is passed to `github_integration.update_via_github`. If that object has a
    truthy `updated_github_id` afterwards (presumably set by best_of when the repository
    was renamed or moved; confirm in best_of), the value is written to the project's
    `github_id`.

    Args:
        projects: List of project dictionaries; modified in place.

    Returns:
        The same list object that was passed in, so the call can be used in an
        assignment. (The function was annotated `-> list` before, but returned None.)
    """
    for project in tqdm(projects):
        if "github_id" not in project:
            continue

        # The update works on a separate Dict object, so metadata fetched from Github
        # is not added to the project entry itself.
        updated_project = Dict(project)
        github_integration.update_via_github(updated_project)

        if updated_project.updated_github_id:
            # Apply updated github id:
            old_github_id = project["github_id"]
            print(f"Update Github ID: {old_github_id} -> {updated_project.updated_github_id}")
            project["github_id"] = updated_project.updated_github_id

    return projects


# Assigning the result keeps the cell from echoing the whole project list as output.
existing_projects = update_github_ids(existing_projects)

## Optional: Check if all homepage links are working

In [None]:
import requests


def website_works(
    url: str, max_retries: int = 3, retry_delay: float = 100, timeout: float = 10
) -> bool:
    """Check whether a URL answers a HEAD request with status code 200.

    The previous version returned the raw status code right after the request, which
    made all of the logic below unreachable and reported failing links (404, 500, ...)
    as working because any such status code is truthy.

    Args:
        url: The URL to check.
        max_retries: How often the request is repeated after a 429 (too many requests)
            response before giving up.
        retry_delay: Seconds to wait before repeating the request after a 429.
        timeout: Seconds to wait for the server before the request counts as failed.

    Returns:
        True if the final response has status code 200, otherwise False. Failures are
        printed, never raised.
    """
    for attempt in range(max_retries + 1):
        try:
            # requests.head() does not follow redirects by default; follow them so that
            # e.g. an http -> https redirect is not reported as a broken link.
            response = requests.head(url, allow_redirects=True, timeout=timeout)
        except Exception as ex:
            # Best effort: any failure (invalid URL, connection error, timeout, ...)
            # means that the website does not work.
            print(ex)
            return False

        if response.status_code == 200:
            return True

        if response.status_code == 429 and attempt < max_retries:
            # Rate limited: wait and try again (bounded, instead of unlimited recursion)
            time.sleep(retry_delay)
            continue

        print(f"Status code: {response.status_code}")
        return False

    return False


# https://stackoverflow.com/questions/16778435/python-check-if-website-exists
# Check every link property of every project and report the ones that do not work.
for project in tqdm(existing_projects):
    project = Dict(project)
    for link_property in ("homepage", "docs_url", "demo_url"):
        link = project.get(link_property)
        # Projects without the property (or with an empty value) are skipped
        if link and not website_works(link):
            print(f"The {link_property} of project {project.name} does not work: {link}")

## Optional: Auto discover project packages via libraries.io

In [None]:
# Rebinds `existing_projects` to the list returned by best_of.
# NOTE(review): selected_package_manager=None presumably means "no restriction to a
# single package manager". Confirm in best_of.yaml_generation.
existing_projects = yaml_generation.auto_extend_via_libio(
    existing_projects, selected_package_manager=None
)

## Fix duplicated names

In [None]:
df_updated_projects = pd.DataFrame(existing_projects)

# Lower-cased names that were already seen
unique_names = set()
projects = df_updated_projects.to_dict("records")
for project in projects:
    name = project["name"]
    github_id = project.get("github_id")
    if not isinstance(github_id, str) or not github_id:
        # Does not have a github_id -> cannot be fixed.
        # Records built from a DataFrame always contain every column, a missing value
        # shows up as NaN/None. A plain `"github_id" not in project` check therefore
        # never matched, and the name was replaced by NaN.
        continue
    if name.lower() in unique_names:
        print(f"Duplicated name {name} -> replacing with {github_id}.")
        project["name"] = github_id
    else:
        unique_names.add(name.lower())

df_updated_projects = pd.DataFrame(projects)

## Show and manually modify data

In [None]:
# Editable grid with all projects. The widget handle is kept because the export cell
# below reads the rows back via get_changed_df().
qgrid_select_projects = qgrid.show_grid(
    df_updated_projects, show_toolbar=True
)
qgrid_select_projects

## Export selected projects as YAML

In [None]:
# Read the current state of the table back from the qgrid widget
df_selected_projects = qgrid_select_projects.get_changed_df()

# projectrank is not exported
if "projectrank" in df_selected_projects:
    df_selected_projects = df_selected_projects.drop(columns=["projectrank"])

# Empty strings become NaN first ...
df_selected_projects.replace("", np.nan, inplace=True)
# ... then every null/NaN cell is swapped for None
df_selected_projects = df_selected_projects.where(
    df_selected_projects.notnull(), None
)

selected_projects = df_selected_projects.to_dict("records")

cleaned_projects = []
for project in selected_projects:
    raw_labels = project.get("labels")
    if raw_labels:
        # A comma-separated string is split into single labels; anything else is
        # iterated as it is
        label_items = raw_labels.split(",") if isinstance(raw_labels, str) else raw_labels
        # Stored as the string representation of the list; the replacements applied
        # to the YAML dump below rely on this
        project["labels"] = str([item.strip().lower() for item in label_items])

    # Drop all properties without a value
    cleaned_projects.append(
        {key: value for key, value in project.items() if value is not None}
    )

# Sort by category (TODO: do not sort?)
# cleaned_projects = sorted(
#     cleaned_projects, key=lambda k: str(k["category"]) if "category" in k else "zzz"
# )

yaml_output = yaml.dump(cleaned_projects, default_flow_style=False, sort_keys=False)
# Same replacements, in the same order, as the original chained replace() calls
for quoted, plain in (("'[", "["), ("]'", "]"), ("''", '"')):
    yaml_output = yaml_output.replace(quoted, plain)

print(yaml_output)