[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14obfKc7xpyAfngQywnSJUKVnmu1avdfr?usp=sharing)

## Colab Setup

In [None]:
# Probe for Google Colab: a successful import of `google.colab` is treated
# as the signal that the notebook is running there.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not Colab"; a bare `except` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    IN_COLAB = False

In [None]:
%%capture
# Colab-only environment setup; the whole body is skipped when IN_COLAB is False.
# `%%capture` suppresses the cell output (pip/rm/mount chatter).
if IN_COLAB:
    
    # Remove the unneeded Python 2.7 installations to free disk space.
    !rm -rf "/usr/local/lib/python2.7"
    !rm -rf "/usr/lib/python2.7"

    # Clone the repo.
    # !git clone ""

    # Change the working directory to the repo root.
    # %cd

    # Add the repo root to the Python path.
    # import sys, os
    # sys.path.append(os.getcwd())
    
    # Install the packages that are not available in Colab by default.
    # NOTE(review): `kaggle` and `pandas_profiling` are imported later in this
    # notebook but are not installed here -- presumably preinstalled; confirm.
    !pip install python-dotenv
    !pip install pycaret
    !pip install wandb
    !pip install shap
    !pip install prefect

    # Mount Google Drive so the env file stored there can be read.
    from google.colab import drive
    drive.mount('/content/gdrive')

    # Load the environment variables from the env file.
    # NOTE: Google Drive won't let you mount dotfiles, hence the non-dot file name.
    # NOTE(review): the drive is mounted at the absolute path /content/gdrive but
    # the file is loaded through the relative path ./gdrive/..., so this
    # presumably relies on the working directory being /content -- confirm.
    from dotenv import load_dotenv
    load_dotenv("./gdrive/MyDrive/ODSC 2022/my_env_file")

## Accessory function for WANDB

In [None]:
import wandb
from pandas_profiling import ProfileReport

In [None]:
def add_convert_for_wandb(artifact, path, profile=True, sample=2500):
    """Attach a data directory, plus per-CSV previews, to a W&B artifact.

    The directory at ``path`` is added to ``artifact`` under the name
    ``data``. Every file in it whose name ends with ``.csv`` is additionally
    added as a ``wandb.Table`` and, when ``profile`` is true, as an HTML
    profile report.

    Args:
        artifact: Artifact object to populate; it must provide ``add_dir``
            and ``add``.
        path: Directory whose files are logged.
        profile: When true, build a minimal ``ProfileReport`` for each CSV
            and add it as ``"<table name>_profile"``.
        sample: Maximum number of rows put in each table. Frames with fewer
            rows are logged whole; larger ones are randomly sampled.

    Returns:
        None. The artifact is modified in place.
    """
    csv_suffix = ".csv"

    artifact.add_dir(path, name="data")

    for file_name in os.listdir(path):
        if not file_name.endswith(csv_suffix):
            continue

        path_to_file = os.path.join(path, file_name)
        # Strip only the trailing extension: str.replace would also delete
        # any ".csv" that appears earlier in the file name.
        tab_name = file_name[:-len(csv_suffix)]
        print(f"adding {tab_name}")
        df = pd.read_csv(path_to_file)
        print(f"{tab_name}:{df.shape}")

        # Keep the logged table small; the profile below still uses the
        # full frame.
        sampled_df = df if df.shape[0] < sample else df.sample(sample)

        table = wandb.Table(dataframe=sampled_df)
        artifact.add(table, name=tab_name)

        if profile:
            # The profile report is rendered to an HTML file in the current
            # working directory and then logged to W&B under the artifact.
            data_profile = ProfileReport(df, dark_mode=True, title=tab_name, minimal=True)
            profile_path = f"{tab_name}.html"
            data_profile.to_file(profile_path)
            data_table_profile = wandb.Html(profile_path)
            artifact.add(data_table_profile, f"{tab_name}_profile")

    return None

## Import Prefect

In [None]:
import prefect
from prefect import task, Flow, Parameter
from prefect.run_configs import LocalRun
from prefect.storage import Local #TODO: make this work with local device

## 1. Pull Data

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile

In [None]:
@task(log_stdout=True)
def download_kaggle_data(competition: str = "tabular-playground-series-mar-2022", project_name: str = "kaggle-tps-mar-2022-odsc", **kwargs):
    """Download a Kaggle competition's files and log them as the ``raw`` W&B artifact.

    The competition archive is expected at ``./<competition>.zip`` after the
    download; it is extracted into ``./data/raw`` and then deleted. The
    extracted directory is logged through ``add_convert_for_wandb``.

    Args:
        competition: Kaggle competition slug; also used as the artifact type.
        project_name: W&B project that receives the run and the artifact.
        **kwargs: Ignored. Accepted so upstream task results can be passed
            in purely to order tasks in the flow.

    Returns:
        None.
    """
    print(f"starting new run for {project_name}")
    run = wandb.init(
        project=project_name, job_type="download", name=f"log-{competition}")

    api = KaggleApi()
    api.authenticate()
    api.competition_download_files(competition)
    zip_path = f"{competition}.zip"
    path_to_raw = os.path.join(".", "data", "raw")
    # Use a context manager so the archive handle is closed before removal.
    with ZipFile(zip_path) as archive:
        archive.extractall(path=path_to_raw)
    os.remove(zip_path)

    # Default to no description so competitions other than the hard-coded
    # one below no longer fail on an unbound variable.
    data_description = None

    # TODO: Remove hack to add data description
    if competition == "tabular-playground-series-mar-2022":
        data_description = """
            In this competition, you'll forecast twelve-hours of traffic flow in a major U.S. metropolitan area. Time, space, and directional features give you the chance to model interactions across a network of roadways.

            Files and Field Descriptions
            -------------------------------
            train.csv - the training set, comprising measurements of traffic congestion across 65 roadways from April through September of 1991.
            row_id - a unique identifier for this instance
            time - the 20-minute period in which each measurement was taken
            x - the east-west midpoint coordinate of the roadway
            y - the north-south midpoint coordinate of the roadway
            direction - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.
            congestion - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.
            test.csv - the test set; you will make hourly predictions for roadways identified by a coordinate location and a direction of travel on the day of 1991-09-30.
            sample_submission.csv - a sample submission file in the correct format
        """

    raw_data_artifact = wandb.Artifact(
        name="raw", type=competition, description=data_description)
    add_convert_for_wandb(raw_data_artifact, path_to_raw)

    run.log_artifact(raw_data_artifact)
    run.finish()

    return None

## 2. Feature Engineer

In [None]:
import pandas as pd

In [None]:
def feature_engineer(data):
    """Add calendar and string-interaction features to ``data`` in place.

    ``data`` must contain the columns ``time``, ``x``, ``y`` and
    ``direction``. ``time`` is converted with ``pd.to_datetime``; the new
    columns are appended in a fixed order. The same (mutated) frame is
    returned.
    """
    data['time'] = pd.to_datetime(data['time'])
    stamp = data['time'].dt

    # Calendar / clock features derived from the timestamp.
    data['month'] = stamp.month
    data['weekday'] = stamp.weekday
    data['hour'] = stamp.hour
    data['minute'] = stamp.minute
    data['is_month_start'] = stamp.is_month_start.astype('int')
    data['is_month_end'] = stamp.is_month_end.astype('int')
    data['hour+minute'] = stamp.hour * 60 + stamp.minute
    data['is_weekend'] = (stamp.dayofweek > 4).astype('int')
    data['is_afternoon'] = (stamp.hour > 12).astype('int')

    # Interaction features: plain string concatenation (no separator) of
    # the stringified source columns.
    x_txt = data['x'].astype('str')
    y_txt = data['y'].astype('str')
    xy_txt = x_txt + y_txt
    data['x+y'] = xy_txt

    dir_txt = data['direction'].astype('str')
    data['x+y+direction'] = xy_txt + dir_txt

    hour_txt = data['hour'].astype('str')
    hour_dir_txt = hour_txt + dir_txt
    data['hour+direction'] = hour_dir_txt
    data['hour+x+y'] = hour_txt + x_txt + y_txt
    data['hour+direction+x'] = hour_dir_txt + x_txt
    data['hour+direction+y'] = hour_dir_txt + y_txt
    data['hour+direction+x+y'] = hour_dir_txt + x_txt + y_txt
    data['hour+x'] = hour_txt + x_txt
    data['hour+y'] = hour_txt + y_txt

    return data

In [None]:
@task(log_stdout=True)
def feature_engineer_tps_2022(competition: str = "tabular-playground-series-mar-2022", project_name: str = "kaggle-tps-mar-2022-odsc", **kwargs):
    """Build feature-engineered train/test CSVs and log them to W&B.

    Downloads the latest ``raw`` artifact of ``project_name``, runs
    ``feature_engineer`` on its ``train.csv`` and ``test.csv``, writes the
    results to ``./data/fe`` and logs that directory as the
    ``feature_engineered`` artifact.

    Args:
        competition: Competition slug; used as the artifact type and in the
            run name.
        project_name: W&B project to read from and log to.
        **kwargs: Ignored. Accepted so upstream task results can be passed
            in purely to order tasks in the flow.

    Returns:
        None.
    """
    run = wandb.init(
        project=project_name, job_type="feature_engineer", name=f"feature_engineer-{competition}")
    comp_data_art = run.use_artifact(f"{project_name}/raw:latest", type=competition)
    comp_data_path = os.path.join(comp_data_art.download(), "data")

    train_path = os.path.join(comp_data_path, "train.csv")
    test_path = os.path.join(comp_data_path, "test.csv")

    # `time` is read as a plain string; feature_engineer() parses it itself.
    train_data = pd.read_csv(train_path, dtype={'time': str})
    test_data = pd.read_csv(test_path, dtype={'time': str})

    fe_train_data = feature_engineer(train_data)
    fe_test_data = feature_engineer(test_data)

    fe_path = os.path.join(".", "data", "fe")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(fe_path, exist_ok=True)

    fe_train_data.to_csv(os.path.join(fe_path, "fe_train.csv"), index=False)
    fe_test_data.to_csv(os.path.join(fe_path, "fe_test.csv"), index=False)

    fe_artifact = wandb.Artifact(
        name="feature_engineered", type=competition)
    add_convert_for_wandb(fe_artifact, fe_path)

    run.log_artifact(fe_artifact)
    run.finish()

    return None

## 3. Model

In [None]:
from pycaret.regression import *

In [None]:
# Helper that bundles the pycaret calls whose results are worth keeping as loggable artifacts for lineage
def perform_experiment(exp):
    """Compare model types and return the merged comparison table with the winner.

    ``exp`` is accepted but not used; the pycaret functions called here take
    no arguments, so the experiment must already have been set up.

    Returns:
        A ``(model_comparison_results, best_model)`` tuple: the leaderboard
        joined with the table of available model types, and the object
        returned by ``compare_models()``.
    """
    # Compare the candidate model types; the returned object is treated as
    # the best model.
    winner = compare_models()

    # Metrics computed for each model type that was tried.
    scores = get_leaderboard()
    # Catalogue of model types, which carries their internal IDs.
    catalogue = models()

    # Join the two tables on the human-readable model name.
    scores_flat = scores.reset_index()
    catalogue_flat = catalogue.reset_index()
    comparison = scores_flat.merge(catalogue_flat, left_on="Model Name", right_on="Name")

    return comparison, winner

In [None]:
# Reference: https://pycaret.readthedocs.io/en/latest/api/regression.html
# TODO: Hardcode the relationships between the features and the numeric vs categorical features
def setup_tps_2022_config(seed):
    """Return the keyword arguments passed to pycaret's ``setup`` for one run.

    ``seed`` is used both as the ``session_id`` and as the suffix of the
    experiment name.
    """
    # Options left disabled on purpose: "transform_target", "normalize",
    # "transformation" and "use_gpu" (each would be set to True).
    return dict(
        target="congestion",
        fold_strategy='timeseries',
        session_id=seed,
        ignore_features=["row_id"],
        log_experiment="wandb",
        experiment_name=f"tps_march_2022_{seed}",
        silent=True,
        ignore_low_variance=True,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
    )

In [None]:
import random

In [None]:
@task(log_stdout=True)
def train_tps_mar_2022_automl_model(competition: str = "tabular-playground-series-mar-2022",
                                    project_name: str = "kaggle-tps-mar-2022-odsc",
                                    n: int = 2,
                                    sample = None, **kwargs):
    """Run ``n`` pycaret experiments and log each best model to W&B.

    Every iteration starts its own W&B run, loads ``fe_train.csv`` from the
    latest ``feature_engineered`` artifact, trains through pycaret and logs
    the saved model plus the SHAP summary image as ``model_artifacts``.

    Args:
        competition: Competition slug; used as the artifact type and in
            run/model names.
        project_name: W&B project to read from and log to.
        n: Number of experiments (one W&B run each).
        sample: Number of training rows to sample per experiment. A falsy
            value (``None`` or ``0``) uses all rows; values larger than the
            data set are capped at its size.
        **kwargs: Ignored. Accepted so upstream task results can be passed
            in purely to order tasks in the flow.

    Returns:
        None.
    """
    print(n)
    print(sample)

    for i in range(n):
        # Use seed to create a unique configuration for the current pycaret experiment
        seed = i + 1 + random.randint(0, 1000)

        config = setup_tps_2022_config(seed)  # Your specific configs for pycaret data preparation

        # Initialize wandb run to begin logging for pycaret experiment
        run = wandb.init(project=project_name, reinit=True, config=config, job_type="train",
                         name=f"train-seed-{seed}-{competition}")
        print(f"Seed: {seed}")

        # Pull latest training data from wandb and load into df
        fe_data_art = run.use_artifact(f"{project_name}/feature_engineered:latest", type=competition)
        train_data_path = fe_data_art.get_path("data/fe_train.csv").download()

        all_train_data = pd.read_csv(train_data_path)

        # Hard coded because broken
        # TODO: run with the proper generated features
        # .copy() makes this an independent frame, so the `time` assignment
        # below does not write into a slice of all_train_data.
        train_data = all_train_data[["row_id", "time", "congestion", "x", "y", "direction"]].copy()
        if sample:
            # Forcing small sample to train fast; cap at the available rows
            # because DataFrame.sample raises when asked for more.
            train_data = train_data.sample(min(sample, len(train_data)), random_state=seed)
        train_data['time'] = pd.to_datetime(train_data['time'])

        print(train_data.shape)

        # setup and run experiment
        ts_exp = setup(data=train_data, **config)
        # The comparison table is not used here; only the best model is kept.
        _model_comparison_results, best_model = perform_experiment(ts_exp)

        # save model
        model_title = f"{competition}-{seed}"
        save_model(best_model, model_title)

        # NOTE(review): "SHAP summary.png" added below is presumably the file
        # produced by this call with save=True -- confirm.
        interpret_model(best_model, save=True)

        model_artifacts = wandb.Artifact("model_artifacts", type=competition)

        # add all objects to artifact
        model_artifacts.add_file(f"{model_title}.pkl", name="model.pkl")
        # TODO: log this to run as opposed to artifact
        model_artifacts.add_file("SHAP summary.png")

        run.log_artifact(model_artifacts)

        run.finish()

    return None

## 4. Promote Best Model

In [None]:
# by="R2"
# project_name = "kaggle-tps-mar-2022-odsc"
@task(log_stdout=True)
def select_best_model(competition: str = "tabular-playground-series-mar-2022", project_name: str = "kaggle-tps-mar-2022-odsc", by: str = "R2", **kwargs):
    """Tag the model artifact of the best-scoring training run as ``production``.

    For every W&B run with job type ``train``, the run's ``compare_models``
    table is loaded and its highest value in column ``by`` (rounded to three
    decimals) is that run's score. The ``model_artifacts`` artifact(s) logged
    by the highest-scoring run get the ``production`` alias.

    Args:
        competition: Unused here; kept for a signature parallel to the other
            tasks.
        project_name: W&B project to query and log to.
        by: Metric column used for ranking. The largest value always wins,
            so this only makes sense for higher-is-better metrics.
        **kwargs: Ignored. Accepted so upstream task results can be passed
            in purely to order tasks in the flow.

    Returns:
        None.

    Raises:
        ValueError: If no training run produced a usable score.
    """
    print(by)

    api = wandb.Api({"project": project_name})
    train_runs = api.runs(filters={"jobType": "train"})

    best_run = None
    # -inf instead of a magic constant so any real score beats the start value.
    best_score = float("-inf")

    run = wandb.init(project=project_name, job_type="promote",
                     name="promote_best_model")
    for train_run in train_runs:
        model_comparison_table = run.use_artifact(f"run-{train_run.id}-compare_models:latest")["compare_models"]
        model_comparison_df = pd.DataFrame(data=model_comparison_table.data, columns=model_comparison_table.columns)
        candidate_score = round(model_comparison_df.sort_values(by=by, ascending=False).iloc[0][by], 3)
        if candidate_score > best_score:
            best_score = candidate_score
            best_run = train_run

    if best_run is None:
        # Close the run before failing, and fail with a clear message rather
        # than an AttributeError on None.
        run.finish()
        raise ValueError(
            f"No training run with a usable '{by}' score found in project '{project_name}'")

    for artifact in best_run.logged_artifacts():
        if artifact.name.startswith("model_artifacts"):
            best_model = run.use_artifact(artifact.name)
            best_model.aliases.append("production")
            best_model.save()

    run.finish()

    return None

## 5. Evaluate

In [None]:
# project_name = "kaggle-tps-mar-2022-odsc"
# competition = "tabular-playground-series-mar-2022"
@task(log_stdout=True)
def predict_on_test_data(competition : str = "tabular-playground-series-mar-2022", project_name : str = "kaggle-tps-mar-2022-odsc", **kwargs):
    """Score the test set with the ``production`` model and log a submission.

    Loads ``fe_test.csv`` from the latest ``feature_engineered`` artifact and
    the model from ``model_artifacts:production``, predicts, checks the
    result's columns against ``sample_submission.csv`` from the ``raw``
    artifact, writes ``./data/sub/submission.csv`` and logs that directory as
    the ``submission`` artifact.

    Args:
        competition: Competition slug; used as the artifact type.
        project_name: W&B project to read from and log to.
        **kwargs: Ignored. Accepted so upstream task results can be passed
            in purely to order tasks in the flow.

    Returns:
        None.

    Raises:
        ValueError: If the submission's columns differ from those of the
            sample submission (names or order).
    """
    run = wandb.init(project=project_name, job_type="evaluate",
                     name="evaluate_prod_model")

    fe_data_art = run.use_artifact(f"{project_name}/feature_engineered:latest", type=competition)
    test_data_path = fe_data_art.get_path("data/fe_test.csv").download()
    test_data = pd.read_csv(test_data_path)

    model_arts = run.use_artifact("model_artifacts:production")
    best_model_path = model_arts.get_path("model.pkl").download()
    # load_model is given the path without its extension; splitext removes
    # only the trailing ".pkl", unlike str.replace.
    best_model = load_model(os.path.splitext(best_model_path)[0])

    comp_data_art = run.use_artifact(f"{project_name}/raw:latest", type=competition)
    sample_submission_path = comp_data_art.get_path("data/sample_submission.csv").download()
    validate_by = pd.read_csv(sample_submission_path)

    test_results = test_data.copy(deep=True)
    test_results['time'] = pd.to_datetime(test_results['time'])
    # Placeholder target so the frame has the same columns used for training.
    test_results["congestion"] = 0
    unseen_df = test_results[["row_id", "time", "congestion", "x", "y", "direction"]]
    test_results["prediction"] = predict_model(best_model, data = unseen_df).rename({"Label": "prediction"}, axis=1)["prediction"]

    submission = test_results[["row_id", "prediction"]].rename({"prediction": "congestion"}, axis=1)

    # Validate the submission layout against the sample submission. The
    # previous check compared column names but discarded the result, so a
    # mismatch was never reported.
    expected_cols = list(validate_by.columns)
    actual_cols = list(submission.columns)
    if actual_cols != expected_cols:
        raise ValueError(
            f"Submission columns {actual_cols} do not match sample submission columns {expected_cols}")

    submission_dir = os.path.join(".", "data", "sub")
    os.makedirs(submission_dir, exist_ok=True)

    submission.to_csv(os.path.join(submission_dir, "submission.csv"), index=False)

    submission_artifact = wandb.Artifact("submission", type=competition)
    add_convert_for_wandb(submission_artifact, submission_dir)

    run.log_artifact(submission_artifact)
    run.finish()

    return None

## Configure Pipeline

In [None]:
def configure_prefect_flow():
    """Build the five-step pipeline as a Prefect flow, register it and return it.

    The steps run in this order: download -> feature engineer -> train ->
    promote -> predict. Each task's result is passed to the next task as an
    extra keyword argument (absorbed by the task's ``**kwargs``) so that the
    steps are chained.

    Returns:
        The registered ``Flow`` object.

    Raises:
        KeyError: If ``KAGGLE_USERNAME``, ``KAGGLE_KEY`` or ``WANDB_API_KEY``
            is missing from the environment.
    """
    with Flow("odsc-2022-step-3-pycaret-WB-Prefect", storage=Local(add_default_labels=False)) as flow:
        competition = Parameter(
            "competition", default="tabular-playground-series-mar-2022")
        project_name = Parameter(
            "project_name", default="kaggle-tps-mar-2022-odsc")
        num_experiments = Parameter(
            "n", default=3)
        num_training_points = Parameter("sample", default=1000)
        model_selection_metric = Parameter("by", default="R2")

        download_success = download_kaggle_data(competition=competition, project_name=project_name)
        fe_success = feature_engineer_tps_2022(competition=competition, project_name=project_name, download_success=download_success)
        train_success = train_tps_mar_2022_automl_model(competition=competition, project_name=project_name, n=num_experiments, sample=num_training_points, fe_success=fe_success)
        promote_success = select_best_model(competition=competition, project_name=project_name, by=model_selection_metric, train_success=train_success)
        predict_success = predict_on_test_data(competition=competition, project_name=project_name, promote_success=promote_success)

    # Forward the Kaggle and W&B credentials from the current environment to
    # the flow's run configuration.
    flow.run_config = LocalRun(
        env={"KAGGLE_USERNAME": os.environ["KAGGLE_USERNAME"],
             "KAGGLE_KEY": os.environ["KAGGLE_KEY"], "WANDB_API_KEY": os.environ["WANDB_API_KEY"]})

    flow.register(project_name="odsc-east-2022")
    # flow.run()

    # Return the flow so callers that bind the result get the Flow object
    # instead of None.
    return flow

In [None]:
# Build and register the pipeline; whatever configure_prefect_flow() returns is kept in `flow`.
flow = configure_prefect_flow()

## Run Pipeline *started and orchestrated by Prefect*

In [None]:
# Start a local Prefect agent from the notebook's shell.
# NOTE(review): presumably this keeps the cell busy while the agent runs -- confirm.
!prefect agent local start