[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1IHfKKWBjqxuMLAFAP_QjcJ2lfEyYYI9V?usp=sharing)

## Colab Setup

In [1]:
# Detect whether this notebook is running on Google Colab: the `google.colab`
# package is only importable there. Only ImportError is caught so that
# unrelated failures (or a keyboard interrupt) are not silently swallowed.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

In [2]:
%%capture
if IN_COLAB:

    # Remove the unneeded Python 2.7 installations to free disk space.
    !rm -rf "/usr/local/lib/python2.7"
    !rm -rf "/usr/lib/python2.7"

    # Optional (currently disabled): clone the repo.
    # !git clone ""

    # Optional (currently disabled): change the working directory to the repo root.
    # %cd

    # Optional (currently disabled): add the repo root to the Python path.
    # import sys, os
    # sys.path.append(os.getcwd())

    # Install the packages that are not preinstalled on Colab.
    !pip install python-dotenv
    !pip install pycaret

    # Mount Google Drive so the environment file stored there can be read.
    from google.colab import drive
    drive.mount('/content/gdrive')

    # Load environment variables from the file on Drive.
    # NOTE: Google Drive won't let you mount dotfiles, so the file is named
    # `my_env_file` rather than `.env`.
    # NOTE(review): the relative path below only resolves to the mount point
    # above when the working directory is /content -- confirm.
    # NOTE(review): presumably this file holds the Kaggle API credentials used
    # later by KaggleApi.authenticate() -- confirm.
    from dotenv import load_dotenv
    load_dotenv("./gdrive/MyDrive/ODSC 2022/my_env_file")

## 1. Pull Data

In [3]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile

In [4]:
def download_kaggle_data(competition: str = "tabular-playground-series-mar-2022"):
    """Download a Kaggle competition archive and unpack it into ``./data/raw``.

    Args:
        competition: Kaggle competition slug to download.

    Returns:
        The relative path of the directory the archive was extracted into.
    """
    api = KaggleApi()
    api.authenticate()
    api.competition_download_files(competition)

    # The archive is expected in the current working directory, named after
    # the competition slug.
    zip_path = f"{competition}.zip"
    path_to_raw = os.path.join(".", "data", "raw")

    # Close the archive before deleting it: an open handle leaks a file
    # descriptor and makes os.remove fail on Windows.
    with ZipFile(zip_path) as archive:
        archive.extractall(path=path_to_raw)
    os.remove(zip_path)

    return path_to_raw

## 2. Feature Engineer

In [5]:
import pandas as pd

In [6]:
def feature_engineer(data):
    """Add calendar, clock and concatenated-key features to ``data`` in place.

    ``data`` must provide the columns ``time``, ``x``, ``y`` and ``direction``.
    The ``time`` column is converted to datetimes, the new columns are appended
    to the same frame, and that frame is returned.
    """
    data['time'] = pd.to_datetime(data['time'])
    when = data['time'].dt

    # Numeric features derived from the timestamp, in output column order.
    timestamp_features = (
        ('month', when.month),
        ('weekday', when.weekday),
        ('hour', when.hour),
        ('minute', when.minute),
        ('is_month_start', when.is_month_start.astype('int')),
        ('is_month_end', when.is_month_end.astype('int')),
        # Minutes elapsed since midnight.
        ('hour+minute', when.hour * 60 + when.minute),
        # dayofweek 5 and 6 are Saturday and Sunday.
        ('is_weekend', (when.dayofweek > 4).astype('int')),
        # Strictly after the 12 o'clock hour, i.e. 13:00 onwards.
        ('is_afternoon', (when.hour > 12).astype('int')),
    )
    for column, values in timestamp_features:
        data[column] = values

    # String renderings of the key columns, cast once and reused below.
    hour_s = data['hour'].astype('str')
    x_s = data['x'].astype('str')
    y_s = data['y'].astype('str')
    direction_s = data['direction'].astype('str')

    # Interaction keys built by plain string concatenation (no separator).
    combined_features = (
        ('x+y', x_s + y_s),
        ('x+y+direction', x_s + y_s + direction_s),
        ('hour+direction', hour_s + direction_s),
        ('hour+x+y', hour_s + x_s + y_s),
        ('hour+direction+x', hour_s + direction_s + x_s),
        ('hour+direction+y', hour_s + direction_s + y_s),
        ('hour+direction+x+y', hour_s + direction_s + x_s + y_s),
        ('hour+x', hour_s + x_s),
        ('hour+y', hour_s + y_s),
    )
    for column, values in combined_features:
        data[column] = values

    return data

In [7]:
def feature_engineer_tps_2022(comp_data_path: str):
    """Load the competition CSVs and feature-engineer the train and test sets.

    Reads ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from
    ``comp_data_path``. The ``time`` column of train and test is read as a
    string; parsing is left to ``feature_engineer``.

    Returns:
        A tuple ``(engineered_train, engineered_test, sample_submission)``.
    """
    def _csv_path(file_name):
        return os.path.join(comp_data_path, file_name)

    train_csv = _csv_path("train.csv")
    test_csv = _csv_path("test.csv")
    submission_csv = _csv_path("sample_submission.csv")

    raw_train = pd.read_csv(train_csv, dtype={'time': str})
    raw_test = pd.read_csv(test_csv, dtype={'time': str})
    sample = pd.read_csv(submission_csv)

    engineered_train = feature_engineer(raw_train)
    engineered_test = feature_engineer(raw_test)
    return engineered_train, engineered_test, sample

## 3. Model

In [8]:
from pycaret.regression import *

  defaults = yaml.load(f)


In [None]:
# Helper that runs the pycaret model comparison and packages its results as a single table for logging / lineage.
def perform_experiment(exp):
    """Compare model types and return the merged comparison table plus the winner.

    ``exp`` is accepted but not used by the body; the pycaret calls below take
    no arguments (NOTE(review): presumably they operate on the experiment
    created by the most recent ``setup`` call -- confirm).

    Returns:
        ``(model_comparison_results, best_model)``: the leaderboard joined to
        the table of available model types, and the model picked by
        ``compare_models``.
    """
    # Trains and scores the candidate model types and returns the best one.
    winner = compare_models()

    # Metrics computed for each of the tested model types.
    scores = get_leaderboard().reset_index()
    # Catalogue of model types, whose index carries the internal model IDs.
    catalogue = models().reset_index()

    # Join the metrics to the catalogue on the human-readable model name.
    combined = scores.merge(catalogue, left_on="Model Name", right_on="Name")
    return combined, winner

In [None]:
# Parameter reference: https://pycaret.readthedocs.io/en/latest/api/regression.html
# TODO: Hardcode the relationships between the features and the numeric vs categorical features
def setup_tps_2022_config(seed):
    """Build the keyword arguments passed to pycaret ``setup`` for one run.

    ``seed`` is used both as the pycaret ``session_id`` and as the suffix of
    the experiment name, so each seed yields a distinctly named experiment.
    """
    # Options tried but currently left disabled: transform_target, normalize,
    # transformation, use_gpu.
    return dict(
        target="congestion",
        fold_strategy='timeseries',
        session_id=seed,
        ignore_features=["row_id"],
        experiment_name=f"tps_march_2022_{seed}",
        silent=True,
        ignore_low_variance=True,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
    )

In [None]:
import random

In [None]:
def train_tps_mar_2022_automl_model(all_train_data: pd.DataFrame,
                                    n: int = 2,
                                    sample=None):
    """Run ``n`` randomly seeded pycaret experiments and collect their results.

    Args:
        all_train_data: Training frame; must contain the columns ``row_id``,
            ``time``, ``congestion``, ``x``, ``y`` and ``direction``.
        n: Number of experiments to run.
        sample: Optional number of rows to sample per run (keeps runs fast).
            A falsy value (``None`` or ``0``) trains on every row.

    Returns:
        A list with one dict per run, holding the ``seed``, the ``config``
        passed to ``setup``, and ``run_results`` (the model comparison table
        and the best model).
    """
    # Hard coded because training on the engineered features is currently
    # broken; only the raw columns are used.
    raw_columns = ["row_id", "time", "congestion", "x", "y", "direction"]

    model_runs = []
    for i in range(n):
        # Use the seed to create a unique configuration for this experiment.
        seed = i + 1 + random.randint(0, 1000)

        config = setup_tps_2022_config(seed)  # pycaret data-preparation settings

        print(f"Seed: {seed}")

        train_data = all_train_data[raw_columns]
        if sample:
            # Small sample so the experiment trains fast.
            train_data = train_data.sample(sample, random_state=seed)
        # `assign` returns a new frame, so the datetime conversion never writes
        # into a slice of the caller's data (avoids SettingWithCopyWarning).
        train_data = train_data.assign(time=pd.to_datetime(train_data['time']))

        print(train_data.shape)

        # Set up and run the experiment.
        # TODO: run with the proper generated features
        ts_exp = setup(data=train_data, **config)
        model_comparison_results, best_model = perform_experiment(ts_exp)

        model_runs.append(
            {"seed": seed,
             "config": config,
             "run_results": {
                 "model_comparison_results": model_comparison_results,
                 "best_model": best_model
                 }
            }
        )
    print(model_runs)

    return model_runs

## 4. Promote Best Model

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# by="R2"
def select_best_model(model_runs: list, by: str = "R2"):
    """Return the best model of the run whose comparison table scores highest.

    Each run is scored by the largest value in column ``by`` of its comparison
    table, rounded to three decimals. The run with the strictly highest score
    wins, so ties go to the earliest run.

    NOTE: the comparison assumes a higher ``by`` value is better, which holds
    for ``"R2"`` but not for error metrics.

    Args:
        model_runs: Run dicts, each with
            ``run["run_results"]["model_comparison_results"]`` (a DataFrame
            containing column ``by``) and ``run["run_results"]["best_model"]``.
        by: Name of the metric column to rank by.

    Returns:
        The ``best_model`` of the winning run, or ``None`` when ``model_runs``
        is empty or no run has a comparable score.
    """
    best_model = None
    # -inf rather than a magic number, so even very poor scores can be selected.
    best_score = float("-inf")
    for model_run in model_runs:
        run_results = model_run["run_results"]
        # The column maximum is the top entry of a descending sort, without
        # sorting the whole table.
        candidate_score = round(run_results["model_comparison_results"][by].max(), 3)
        if candidate_score > best_score:
            best_score = candidate_score
            best_model = run_results["best_model"]
    return best_model

## 5. Evaluate

In [None]:
# Typical call in this notebook: test_data=fe_test_data, validate_by=sample_submission
def predict_on_test_data(best_model, test_data: pd.DataFrame, validate_by = None):
    """Predict congestion for the test set and shape the result as a submission.

    Args:
        best_model: Trained model handed to pycaret's ``predict_model``.
        test_data: Test frame with the columns ``row_id``, ``time``, ``x``,
            ``y`` and ``direction``. It is copied, not modified.
        validate_by: Optional reference frame (e.g. the sample submission).
            When given, the submission's column names must match it exactly,
            in order.

    Returns:
        A DataFrame with the columns ``row_id`` and ``congestion``.

    Raises:
        ValueError: If ``validate_by`` is given and the submission's columns
            differ from it in number or in name.
    """
    test_results = test_data.copy(deep=True)
    test_results['time'] = pd.to_datetime(test_results['time'])
    # Placeholder target so the frame has the same column set that was
    # selected for training in this notebook.
    test_results["congestion"] = 0
    unseen_df = test_results[["row_id", "time", "congestion", "x", "y", "direction"]]

    predictions = predict_model(best_model, data=unseen_df).rename({"Label": "prediction"}, axis=1)
    test_results["prediction"] = predictions["prediction"]

    submission = test_results[["row_id", "prediction"]].rename({"prediction": "congestion"}, axis=1)

    # Validation is optional: skip it when no reference frame is supplied.
    if validate_by is not None:
        expected_cols = list(validate_by.columns)
        actual_cols = list(submission.columns)
        if len(expected_cols) != len(actual_cols):
            raise ValueError(
                f"Submission has {len(actual_cols)} columns but {len(expected_cols)} were expected"
            )
        if expected_cols != actual_cols:
            raise ValueError(
                f"Submission columns {actual_cols} do not match expected columns {expected_cols}"
            )
    return submission

## Run Pipeline

In [None]:
# End-to-end run of the pipeline. The wrapper function is left commented out
# so every intermediate result stays available as a notebook variable.
# def e2e_tps_2022():
# 1. Download and unpack the competition data.
path_to_raw = download_kaggle_data()
# 2. Load the CSVs and build the engineered train/test frames.
fe_train_data, fe_test_data, sample_submission = feature_engineer_tps_2022(path_to_raw)
# 3. Run three seeded AutoML experiments on a 1000-row sample each.
model_runs = train_tps_mar_2022_automl_model(fe_train_data, n=3, sample=1000)
# 4. Keep the best model of the run with the highest R2.
best_model = select_best_model(model_runs, by="R2")
# 5. Predict on the test set and check the columns against the sample submission.
submission = predict_on_test_data(best_model, fe_test_data, validate_by=sample_submission)

# return submission

In [None]:
# Display the submission frame as the cell output.
submission