# Best-of Utility Notebook <a class="tocSkip">

**In this notebook:**

- Extract GitHub projects from a file.
- Check & improve an existing best-of list.

_Please use a Python 3 kernel for the notebook_

## Dependencies

### Install Dependencies

In [None]:
# Nothing to install

### Import Dependencies

In [None]:
%load_ext autoreload
%autoreload 2

# System libraries
import logging, os, sys
import re, shutil
import subprocess
import time
import yaml
import glob, os
import json
from collections import Counter, OrderedDict
from datetime import datetime
from typing import Tuple

# Enable logging: messages of level INFO and above are written to stdout
LOG_FORMAT = "[%(levelname)s] %(message)s"
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=LOG_FORMAT)

# For more verbose output, pass level=logging.DEBUG to basicConfig above instead
log = logging.getLogger()

import pandas as pd
import yaml
from addict import Dict
import qgrid

from best_of import utils, yaml_generation
from best_of.integrations import libio_integration
from best_of.generator import parse_projects_yaml

# CHANGE: Insert your API keys here, or leave the placeholders untouched and
# export the variables in your shell before starting the notebook.
# Untouched "<...>" placeholders are skipped, so keys that are already set in
# the environment are not overwritten with a meaningless placeholder value.
# NOTE: Do not commit real keys together with the notebook.
api_keys = {
    "LIBRARIES_API_KEY": "<LIBIO_API_KEY>",
    "GITHUB_API_KEY": "<GITHUB_API_KEY>",
}
for env_name, api_key in api_keys.items():
    is_placeholder = api_key.startswith("<") and api_key.endswith(">")
    if not is_placeholder:
        os.environ[env_name] = api_key

## Extract projects from file

### Load best-of history files for exclusion

Select all history files whose projects should be excluded from the extraction below.

In [None]:
# CHANGE: Add all relevant history files here:
BEST_OF_LISTS = ["<HISTORY-FILE-PATH>"]

# Merge all best-of lists into one collection keyed by the simplified github_id.
# The first occurrence of a project wins; later duplicates are ignored.
github_projects = {}
for project_csv_path in BEST_OF_LISTS:
    df_history = pd.read_csv(project_csv_path, keep_default_na=False, index_col=0)
    for _, project_row in df_history.iterrows():
        raw_github_id = project_row.get("github_id")
        if not raw_github_id:
            # Row has no (or an empty) github_id -> nothing to register
            continue
        github_projects.setdefault(
            utils.simplify_str(raw_github_id), project_row.to_dict()
        )

df_projects = pd.DataFrame(github_projects.values())

# Show loaded projects
overview_columns = ["github_id", "github_url", "projectrank", "license", "star_count"]
qgrid.show_grid(df_projects[overview_columns])

### Option 1: Extract projects via Github URL matching

In [None]:
# CHANGE: Select a file via URL, file-path, or provide the string content here:
EXTRACT_FILE = "<URL-or-TEXT-or-FILE>"

# Run the GitHub link extraction on the provided file. The github ids of the
# history projects loaded above are handed over as the second argument.
known_github_ids = df_projects["github_id"].to_list()
extracted_projects = yaml_generation.extract_github_projects(
    EXTRACT_FILE, known_github_ids
)

In [None]:
# Show extracted projects (restricted to the most relevant columns)
df_extracted = pd.DataFrame(extracted_projects)
extracted_overview_columns = ["github_id", "projectrank", "license", "star_count"]
qgrid.show_grid(df_extracted[extracted_overview_columns])

### Option 2: Extract projects from requirements.txt

In [None]:
# LIBRARIES_API_KEY should be set!
# os.environ["LIBRARIES_API_KEY"] = "<LIBRARIES-TOKEN>"

# CHANGE: Select a file via URL, file-path, or provide the string content here:
EXTRACT_FILE = "<URL-or-TEXT-or-FILE>"

# Extract projects from the requirements file. The pypi ids of the history
# projects loaded above are handed over as the second argument.
known_pypi_ids = df_projects["pypi_id"].to_list()
extracted_projects = yaml_generation.extract_pypi_projects_from_requirements(
    EXTRACT_FILE, known_pypi_ids
)

In [None]:
# Show extracted projects (restricted to the most relevant columns)
df_extracted = pd.DataFrame(extracted_projects)
extracted_overview_columns = [
    "github_id",
    "pypi_id",
    "projectrank",
    "license",
    "star_count",
]
qgrid.show_grid(df_extracted[extracted_overview_columns])

### Auto discover project packages

In [None]:
# CHANGE: Activate the package managers that are relevant for the given best-of-list
package_manager_flags = {"pypi": True, "conda": True}

# Run the package discovery for the extracted projects
updated_projects = yaml_generation.auto_extend_package_manager(
    extracted_projects, **package_manager_flags
)

In [None]:
# Add column to select projects (toggled interactively in the grid below)
df_updated_projects = pd.DataFrame(updated_projects)
df_updated_projects["selected"] = False

# Select columns to show
selected_colum = [
    "selected",
    "name",
    "description",
    "github_id",
    "projectrank",
    "license",
    "star_count",
    "monthly_downloads",
]

# Add all available package managers
from best_of.integrations import AVAILABLE_PACKAGE_MANAGER

# Keep only the "<package_manager>_id" columns that exist in the data.
# Iterating over the DataFrame columns (instead of intersecting two sets) keeps
# the column order stable between runs; set iteration order is arbitrary.
package_id_columns = {
    package_manager + "_id" for package_manager in AVAILABLE_PACKAGE_MANAGER
}
package_columns = [
    column for column in df_updated_projects.columns if column in package_id_columns
]
selected_colum.extend(package_columns)

# Show updated projects; keep the widget handle so that the edited selection
# can be read back in the export cell below.
qgrid_pdated_projects = qgrid.show_grid(df_updated_projects[selected_colum])
qgrid_pdated_projects

### Show selected projects as YAML

Export all selected projects (chosen interactively in the qgrid table above) to YAML. This output can easily be added to a best-of `projects.yaml`.

In [None]:
# Get changed df from qgrid widget
df_selected_projects = qgrid_pdated_projects.get_changed_df()
# Filter all selected
df_selected_projects = df_selected_projects[df_selected_projects["selected"] == True]
# Replace missing values with None so that they are dumped as YAML null.
# The cast to object is required: on numeric columns recent pandas versions
# convert the None replacement back to NaN, which would be dumped as ".nan".
df_selected_projects = df_selected_projects.astype(object).where(
    df_selected_projects.notna(), None
)

from best_of.integrations import AVAILABLE_PACKAGE_MANAGER

# Only the identifying columns are exported; package_columns is defined in the
# selection cell above.
export_columns = [
    "name",
    "github_id",
]
export_columns.extend(package_columns)
selected_projects = df_selected_projects[export_columns].to_dict("records")

# To yaml format
print(yaml.dump(selected_projects, default_flow_style=False, sort_keys=False))

## Improve existing list

### Load best-of history csv

In [None]:
# CHANGE: Select the history file of the best-of list that should be improved:
BEST_OF_LIST = "<HISTORY-FILE-PATH>"

date_columns = [
    "last_commit_pushed_at",
    "created_at",
    "updated_at",
    "latest_stable_release_published_at",
    "pypi_latest_release_published_at",
    "conda_latest_release_published_at",
    "dockerhub_latest_release_published_at",
    "npm_latest_release_published_at",
    "maven_latest_release_published_at",
]

# Read only the header first: read_csv raises if parse_dates names a column
# that does not exist in the file, so restrict the list to available columns.
available_columns = pd.read_csv(BEST_OF_LIST, index_col=0, nrows=0).columns

# infer_datetime_format is intentionally not passed: it is deprecated and has
# no effect in pandas 2.x.
df_projects = pd.read_csv(
    BEST_OF_LIST,
    index_col=0,
    parse_dates=[column for column in date_columns if column in available_columns],
)

df_projects = df_projects.where(pd.notnull(df_projects), None)

selected_colum = [
    "name",
    "github_id",
]
# Add all available package managers
from best_of.integrations import AVAILABLE_PACKAGE_MANAGER

# Derive the package columns from the history loaded in this cell, so this
# section does not depend on variables of the extraction section above.
# Iterating over the DataFrame columns keeps the column order deterministic.
package_id_columns = {
    package_manager + "_id" for package_manager in AVAILABLE_PACKAGE_MANAGER
}
package_columns = [
    column for column in df_projects.columns if column in package_id_columns
]
selected_colum.extend(package_columns)

# Convert to records with None for missing values. The cast to object is
# required because recent pandas versions keep NaN in numeric columns when
# None is used as the replacement value.
df_export = df_projects[selected_colum]
projects = df_export.astype(object).where(df_export.notna(), None).to_dict("records")

### Run auto extend functionality

At the moment, the detected packages need to be added manually to the `projects.yaml` file.

In [None]:
# Package managers to check for the projects of the loaded best-of list
package_manager_flags = {"pypi": True, "conda": True}

# Run the package discovery for the existing projects
updated_projects = yaml_generation.auto_extend_package_manager(
    projects, **package_manager_flags
)