In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import boto3
import aws_util
import cfg
import yaml
import ray
from pathlib import Path

import regroup
import ray_util
import util
import process_commit_pairs
import render

## Ray cluster management 

In [None]:
# Obtain the cluster handle from the project-local ray_server module; `server`
# is what later cells use to scale the cluster up and back down.
import ray_server
server = ray_server.get_ray_server()

In [None]:
# Scale the cluster up before any work is submitted.
# NOTE(review): 60 is presumably the target number of workers/nodes — confirm
# against ray_server's scale_cluster implementation.
server.scale_cluster(60)

## Load secrets

In [None]:
# Credentials are kept outside the notebook in secrets.yaml. Later cells read the
# keys 'aws_access_key_id', 'aws_secret_access_key' and 'hf_api_key' from this mapping.
secrets = yaml.safe_load(Path('secrets.yaml').read_text())

## Get permissive licenses and convert them to SQLite

In [None]:
# Fetch the repo-licenses file from S3 into cfg.repo_licenses_path.
# NOTE(review): download_s3 is invoked with `.remote(...)` further down, so it
# appears to be a Ray remote function; `._function` presumably calls the
# undecorated function in this process instead of on the cluster — confirm.
aws_util.download_s3._function(
    cfg.repo_licenses_s3['bucket'],
    [cfg.repo_licenses_s3['path']],
    cfg.repo_licenses_path,
    secrets['aws_access_key_id'],
    secrets['aws_secret_access_key']
)

In [None]:
# Path of the license file inside cfg.repo_licenses_path, built from the base
# name of its S3 key.
licenses_file_name = Path(cfg.repo_licenses_s3['path']).name
licenses_file = cfg.repo_licenses_path / licenses_file_name

# Convert it into the SQLite file that the filtering stage receives later on.
util.repo_names_licenses_convert_to_sqlite(licenses_file, cfg.repo_licenses_sqlite_file)

## Download commit pairs

In [None]:
# List the commit-pair parquet files under the S3 bucket/path configured in cfg.
commit_pairs_s3 = cfg.commit_paris_files_s3
file_list = aws_util.list_parquet_files_s3(
    commit_pairs_s3['bucket'],
    commit_pairs_s3['path'],
    secrets['aws_access_key_id'],
    secrets['aws_secret_access_key'],
)

In [None]:
# Partition the file list into groups; each group is handed to one download task below.
# NOTE(review): the meaning of the arguments 10 and False is defined by
# regroup.split_items and is not visible here — confirm.
file_groups = regroup.split_items(file_list, 10, False)

In [None]:
# Submit one Ray download task per file group and keep the object refs in `res`.
# Every group is submitted, including file_groups[0]: slicing with [1:] would
# leave the first group undownloaded on a fresh "Restart & Run All".
s3_bucket = cfg.commit_paris_files_s3['bucket']
res = [
    aws_util.download_s3.remote(
        s3_bucket,
        group,
        cfg.pr_commit_pairs_files_path,
        secrets['aws_access_key_id'],
        secrets['aws_secret_access_key'],
    )
    for group in file_groups
]

In [None]:
# Hand the pending download refs to the project's progress helper, then block
# until every task has finished. `res` is rebound from object refs to results.
ray_util.ray_tasks_progress(res)
res = ray.get(res)

## Get opt outs

In [None]:
# Fetch the four opt-out collections from the dataset named in cfg, using the
# 'hf_api_key' secret as the token. Only the first two are passed to the
# filtering tasks further down in this notebook.
(
    repos_opt_out,
    users_for_repo_opt_out,
    users_for_commits_opt_out,
    users_for_issues_opt_out,
) = util.get_opt_outs(src=cfg.opt_outs_dataset_name, token=secrets['hf_api_key'])

## Filter opt-outs and non-permissive licenses, get the range of changes for each file pair, remove new or deleted files

In [None]:
# Inputs: the parquet files found in cfg.pr_commit_pairs_files_path, which is
# the destination used by the download tasks above.
files = [*cfg.pr_commit_pairs_files_path.glob('*.parquet')]

# Output directory for the filtered commit pairs; created if it is missing.
dst = cfg.pr_commid_pairs_files_filtered_path
dst.mkdir(exist_ok=True, parents=True)

# Length limit passed to each filter task, taken from the default RenderParams.
max_changes_length = render.RenderParams().max_pr_length

In [None]:
# Put the opt-out collections into the Ray object store once and pass the refs.
# Ray resolves top-level ObjectRef arguments to their values before the task
# body runs, so each task sees the same data without it being serialized again
# for every submitted file.
repos_opt_out_ref = ray.put(repos_opt_out)
users_for_repo_opt_out_ref = ray.put(users_for_repo_opt_out)

# One filter task per input parquet file; `res` holds the resulting object refs.
filter_task = process_commit_pairs.filter_nonpermissive_opt_outs_and_prepare_commit_pairs
res = [
    filter_task.remote(
        path,
        dst,
        repos_opt_out_ref,
        users_for_repo_opt_out_ref,
        max_changes_length,
        cfg.repo_licenses_sqlite_file,
    )
    for path in files
]

In [None]:
# Hand the pending filter refs to the project's progress helper, then block
# until every task has finished. `res` is rebound from object refs to results.
ray_util.ray_tasks_progress(res)
res = ray.get(res)

## Filter files by content

In [None]:
# Inputs for the content filter: parquet files in the directory the previous
# filtering stage wrote to.
files = [*cfg.pr_commid_pairs_files_filtered_path.glob('*.parquet')]

# Destination for the cleaned files; created if it does not exist yet.
dst = cfg.pr_commid_pairs_files_filtered_cleaned_path
dst.mkdir(exist_ok=True, parents=True)

In [None]:
# Apply process_commit_pairs.clean_files_bucket to the filtered files, using a
# default-constructed FilterParams and writing into `dst`.
# NOTE(review): the next cell treats `res` as pending Ray tasks, so ray_map
# presumably returns object refs rather than finished results — confirm.
params = process_commit_pairs.FilterParams()
res = ray_util.ray_map(
    process_commit_pairs.clean_files_bucket,
    files,
    dst=dst,
    filter_params=params
)


In [None]:
# Hand the pending cleaning refs to the project's progress helper, then block
# until every task has finished. `res` is rebound from object refs to results.
ray_util.ray_tasks_progress(res)
res = ray.get(res)

In [None]:
# Launch a single Ray shuffle task that reads from the cleaned directory and
# writes to the "grouped" directory, passing 'pull_request.guid' as its key argument.
# NOTE(review): presumably this co-locates rows that share a pull-request guid;
# the meaning of the final argument 3 is not visible here — confirm against
# regroup.ray_shuffle.
res = regroup.ray_shuffle.remote(
    cfg.pr_commid_pairs_files_filtered_cleaned_path,
    cfg.pr_commid_pairs_files_filtered_cleaned_grouped_path,
    'pull_request.guid',
    3
)

In [None]:
# Block until the shuffle task has finished and fetch its result.
res = ray.get(res)

## Ray cluster management

In [None]:
# Disconnect this notebook from Ray first, then scale the cluster down.
# NOTE(review): scaling to 0 presumably releases all workers — confirm against
# ray_server's scale_cluster implementation.
ray.shutdown()
server.scale_cluster(0)