![title](../assets/problem.png)

In [None]:
import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import plotly.express as px
from typing import Dict, List, Union, Any
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits (row count, column count, total line width and
# per-column text width) for the notebook output; options are given as
# (name, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 5000,
    'max_colwidth', 5000,
)

In [None]:
import os

# Root folder holding every data/model artefact used by the notebook.
# It defaults to the original hard-coded location; export MLOPS_DATA_PATH
# before starting the kernel to run the notebook from another machine
# without editing this cell.
BASE_PATH = os.environ.get(
    "MLOPS_DATA_PATH", "/Users/seanariel/Desktop/la-maniee/data/mlops"
)

# All artefact locations are derived from BASE_PATH.
PATH_TO_SYNTHETIC_DATA = f"{BASE_PATH}/synthetic_data_contract.csv"
PATH_TO_EXPLODED_FEATURES = f"{BASE_PATH}/exploded_features.csv"
PATH_TO_FEATURE_STORE = f"{BASE_PATH}/feature_store.csv"
PATH_TO_DEV_TRAINING_DATA = f"{BASE_PATH}/dev_training.csv"
PATH_TO_DEV_TESTING_DATA = f"{BASE_PATH}/dev_testing.csv"
PATH_TO_AUTOML_TRAINING_DATA = f"{BASE_PATH}/automl_training.csv"
PATH_TO_PRECISION_RECALL = f"{BASE_PATH}/precision_recall.csv"
PATH_TO_OPTIMAL_MODEL = f"{BASE_PATH}/optimal_model.pickle"
PATH_TO_PRODUCTION_MODEL = f"{BASE_PATH}/production_model.pickle"
PATH_TO_TRAINING_DATA = f"{BASE_PATH}/training.csv"
PATH_TO_EXPERIMENTATION_DATA = f"{BASE_PATH}/experimentation.csv"

# Table of Contents:
* [Overview](#first-bullet)
* [Feature Engineering](#second-bullet)
* [Model Development](#third-bullet)
* [Model Training](#fourth-bullet)
* [Model Serving](#fifth-bullet)
* [Model Experimentation](#sixth-bullet)

# Overview  <a class="anchor" id="first-bullet"></a>

### Load the data

In [None]:
# Load the synthetic game data. Later cells read these columns from it:
# raw_features, contract, reward, last_bidder, starter, p1_face_value, p2_face_value.
synthetic_game_data = pd.read_csv(PATH_TO_SYNTHETIC_DATA)

### Get a first feel for the features

In [None]:
synthetic_game_data.head(1)

In [None]:
synthetic_game_data.info()

### Define the statics

In [None]:
# Vocabulary used to name the expanded feature columns: four suits, eight card ranks.
SUITS = ["clubs", "diamonds", "hearts", "spades"]
CARDS = ["seven", "eight", "nine", "ten", "jack", "queen", "king", "ace"]

# Names of the feature families, in the order their columns appear in the feature store.
FEATURE_NAMES = [
    "feature_count_of_suit",
    "feature_count_of_cards",
    "feature_belote_rebelote_points",
    "feature_tierce_plus_points"
]

# Expanded column names of each family: one column per suit, or per card rank.
feature_count_of_suit = [f"has_x_cards_in_suit_{suit}" for suit in SUITS]
feature_count_of_cards = [f"has_x_{card}s" for card in CARDS]
feature_belote_rebelote_points = [f"has_BR_at_{suit}" for suit in SUITS]
feature_tierce_plus_points = [f"has_tierce_at_{suit}" for suit in SUITS]

# Family name -> its expanded column names (paired positionally with FEATURE_NAMES).
FEATURES_EXPLODED = dict(
    zip(
        FEATURE_NAMES,
        (
            feature_count_of_suit,
            feature_count_of_cards,
            feature_belote_rebelote_points,
            feature_tierce_plus_points,
        ),
    )
)

# Feature Engineering <a class="anchor" id="second-bullet"></a>

###  Set up the feature store

In [None]:
# Empty frame whose columns are the expanded feature names, family after family.
# The request cells copy it and concatenate the converted rows onto the copy.
base_features_df = pd.DataFrame(
    columns=[
        *feature_count_of_suit,
        *feature_count_of_cards,
        *feature_belote_rebelote_points,
        *feature_tierce_plus_points,
    ]
)

### Write the processing helpers

In [None]:
"""
We need to convert the raw features to a proper format,
so that it is ingestable by the ML Model
"""

from functools import wraps
import time


def timing(func):
    """
    Decorator printing how long each call to `func` took.

    The message contains the function name, its positional and keyword
    arguments and the elapsed wall-clock time in seconds (4 decimals).
    The wrapped function's return value is passed through unchanged.
    Nothing is printed when `func` raises.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        started_at = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - started_at
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result
    return timed_call



FEATURE_CONVERTER_ENDPOINT = "http://34.77.247.189:1337/cards_to_features"


def clean_feature(
    hand: str, return_as_scalar: bool = True, timeout: float = 30.0
) -> Dict[str, Any]:
    """
    Convert one raw hand into model features by calling the converter service.

    In:
    {
        "raw_hand": "8H.KC.QH.9D.QC.TC.7H.QD.AH.JD.TH.TS"
    }
    Out:
        has_3_cards_in_suit_clubs
        has_2_cards_in_suit_diamonds
        has_5_cards_in_suit_hearts
        has_1_cards_in_suit_spades
        [...]

    Args:
        hand: raw hand string, sent as "raw_hand".
        return_as_scalar: forwarded untouched as "return_as_scalar"; the shape
            of the returned values is decided by the service (presumably
            scalars when True, list-wrapped values otherwise — confirm
            against the service).
        timeout: seconds to wait for the service before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: the service did not answer in time.
        requests.exceptions.HTTPError: the service answered with a 4xx/5xx status.
    """
    response = requests.post(
        FEATURE_CONVERTER_ENDPOINT,
        json={"raw_hand": hand, "return_as_scalar": return_as_scalar},
        # Without a timeout a stalled service would block this call forever.
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error payload
    # to the caller as if it were a feature row.
    response.raise_for_status()
    features: Dict[str, Any] = response.json()
    return features

### Launch simple sequential processing   

In [None]:
# %%timeit

# Number of leading rows of `raw_features` sent to the converter service.
SAMPLE = 1000
# When True, sequential_requests prints a progress counter every 100 rows.
VERBOSE = True


@timing
def sequential_requests() -> pd.DataFrame:
    """
    Convert the first SAMPLE raw hands one request at a time.

    Each hand is sent with return_as_scalar=False and the response is turned
    into its own DataFrame. All frames are concatenated once at the end,
    after a copy of `base_features_df`, so the accumulated rows are not
    re-copied on every iteration.

    Returns:
        The empty base frame followed by one block of rows per converted hand.
    """
    raw_hands = synthetic_game_data["raw_features"].values[0:SAMPLE]
    frames = [base_features_df.copy()]
    for idx, raw_feature in enumerate(raw_hands, start=1):
        output = clean_feature(raw_feature, return_as_scalar=False)
        frames.append(pd.DataFrame(output))
        # Progress trace every 100 processed hands.
        if idx % 100 == 0 and VERBOSE:
            print(idx)
    return pd.concat(frames)

# Run the sequential baseline; the @timing decorator prints how long it took.
features_df_copy = sequential_requests()

In [None]:
print(features_df_copy.shape)
features_df_copy.head()

### (1/2) Parallelize the workload

In [None]:
"""
This should improve significantly the processing time
"""

import multiprocess as mp


# Re-declared in this cell (presumably so it can be run on its own).
# multiprocessing_requests reads SAMPLE only; VERBOSE is not used by it.
SAMPLE = 1000
VERBOSE = True


@timing
def multiprocessing_requests() -> pd.DataFrame:
    """
    Convert the first SAMPLE raw hands using a pool of 5 worker processes.

    Every hand is passed to `clean_feature` with its default arguments; the
    list of responses is turned into a single DataFrame and concatenated
    after a copy of `base_features_df`.
    """
    raw_hands = synthetic_game_data["raw_features"].values[0:SAMPLE]
    with mp.Pool(5) as pool:
        converted_rows = pool.map(clean_feature, raw_hands)
    return pd.concat([base_features_df.copy(), pd.DataFrame(converted_rows)])

# Run the process-pool variant; the result is bound to the notebook-level `features_df`.
features_df = multiprocessing_requests()

In [None]:
print(features_df.shape)
features_df.head()

### (2/2) Parallelize the workload

In [None]:
import asyncio
import aiohttp
from aiohttp import ClientSession, ClientConnectorError


# Re-declared in this cell (presumably so it can be run on its own).
# Only SAMPLE is read below; VERBOSE is not used in this cell.
SAMPLE = 1000
VERBOSE = True


async def make_request(
    hand: str, session: ClientSession, **kwargs
) -> Union[Dict[str, Any], tuple]:
    """
    POST one raw hand to the converter service and return its decoded JSON.

    Args:
        hand: raw hand string, sent as "raw_hand" with "return_as_scalar": True.
        session: the shared aiohttp session used to issue the request.
        **kwargs: forwarded untouched to `session.request`.

    Returns:
        The decoded JSON body on success, or the tuple `(hand, 404)` when the
        connection to the service could not be established. The 404 is a
        placeholder chosen by this function, not a status sent by the server.
    """
    try:
        # Using the request as an async context manager guarantees the
        # response is released, even if decoding the body raises.
        async with session.request(
            method="POST",
            url=FEATURE_CONVERTER_ENDPOINT,
            json={"raw_hand": hand, "return_as_scalar": True},
            **kwargs
        ) as resp:
            return await resp.json()
    except ClientConnectorError:
        return (hand, 404)


async def async_requests(
    hands: Union[List[str], np.ndarray], **kwargs
) -> List[Any]:
    """
    Convert all `hands` concurrently over a single aiohttp session.

    Args:
        hands: ordered collection of raw hand strings (any iterable works;
            the notebook passes a NumPy array).
        **kwargs: forwarded to every `make_request` call.

    Returns:
        One entry per hand, in the same order as `hands`: whatever
        `make_request` returned for that hand.
    """
    async with ClientSession() as session:
        tasks = [
            make_request(hand=hand, session=session, **kwargs)
            for hand in hands
        ]
        # gather() must complete while the session is still open.
        results = await asyncio.gather(*tasks)
    return results


# Manual timing of the async variant. The top-level `await` below is only
# valid in a notebook/IPython cell; the @timing wrapper calls its function
# without awaiting it, so it is not applied here.
start_time = time.perf_counter()
features_df = base_features_df.copy()
hands = synthetic_game_data["raw_features"].values[0:SAMPLE]
output = await async_requests(hands)
# NOTE(review): make_request returns a (hand, 404) tuple on a connection
# failure, so `output` may mix dicts and tuples — confirm how that should be handled.
features_df = pd.concat([features_df, pd.DataFrame(output)])
end_time = time.perf_counter()
total_time = end_time - start_time
# The message is formatted like the ones printed by the @timing decorator.
print(f'Function async_requests() () Took {total_time:.4f} seconds')

In [None]:
print(features_df.shape)
features_df.head()

### Build custom features

In [None]:
"""
The DS didn't think it through.
Let's add our own custom features for model enhancement
"""

def feature_total_BR_points(features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `total_BR_points`: the row-wise total of the four per-suit
    Belote Rebelote columns. The frame is modified in place and returned.

    In:
    has_BR_at_clubs | has_BR_at_diamonds | has_BR_at_hearts | has_BR_at_spades
    0                 20                   20                 0
    Out:
    total_BR_points
    40
    """
    first_column, *other_columns = (
        "has_BR_at_clubs",
        "has_BR_at_diamonds",
        "has_BR_at_hearts",
        "has_BR_at_spades",
    )
    running_total = features_df[first_column]
    for column in other_columns:
        # `+` builds a new Series each time, so the source columns are untouched.
        running_total = running_total + features_df[column]
    features_df["total_BR_points"] = running_total
    return features_df


def feature_total_tierce_points(features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `total_tierce_points`: the row-wise total of the four per-suit
    tierce columns. The frame is modified in place and returned.

    In:
    has_tierce_at_clubs | has_tierce_at_diamonds | has_tierce_at_hearts | has_tierce_at_spades
    20                    0                        0                      20
    Out:
    total_tierce_points
    40
    """
    first_column, *other_columns = (
        "has_tierce_at_clubs",
        "has_tierce_at_diamonds",
        "has_tierce_at_hearts",
        "has_tierce_at_spades",
    )
    running_total = features_df[first_column]
    for column in other_columns:
        # `+` builds a new Series each time, so the source columns are untouched.
        running_total = running_total + features_df[column]
    features_df["total_tierce_points"] = running_total
    return features_df

def feature_total_AnD_points(features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `total_AnD_points`: announced plus declared points, i.e.
    `total_BR_points` + `total_tierce_points`. Both columns must already
    exist. The frame is modified in place and returned.

    In:
    total_BR_points | total_tierce_points
    40                40
    Out:
    total_AnD_points
    80
    """
    br_points = features_df["total_BR_points"]
    tierce_points = features_df["total_tierce_points"]
    features_df["total_AnD_points"] = br_points.add(tierce_points)
    return features_df

def merge_synthetic_and_features(synthetic_game_data: pd.DataFrame, features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Put the game-level columns and the engineered features side by side.

    Both frames get a fresh 0..n-1 index first, so rows are paired by
    position. If the frames differ in length, the unmatched rows are kept
    and filled with missing values.

    In:
    contract | reward ... + has_x_cards_in_suit_clubs | has_x_cards_in_suit_diamonds ...

    Out:
    contract | reward | has_x_cards_in_suit_clubs | has_x_cards_in_suit_diamonds ...
    """
    game_columns = [
        "contract",
        "reward",
        "last_bidder",
        "starter",
        "p1_face_value",
        "p2_face_value",
    ]
    game_part = synthetic_game_data[game_columns].reset_index(drop=True)
    feature_part = features_df.reset_index(drop=True)
    return pd.concat([game_part, feature_part], axis=1)

def encode_contract(merged_df: pd.DataFrame) -> pd.DataFrame:
    """
    One-hot encode `contract` and append the indicator columns.

    A new frame is returned: the input columns (including `contract`)
    followed by one indicator column per contract value present in the data.

    In:
    contract | ...
    spades
    hearts
    clubs

    Out:
    ... | clubs | hearts | spades
          0       0        1
          0       1        0
          1       0        0
    """
    contract_indicators = pd.get_dummies(merged_df["contract"])
    return pd.concat([merged_df, contract_indicators], axis=1)

def categorize_reward(merged_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the binary target `p1_has_won`: 1 when `reward` is strictly
    positive, 0 otherwise. The frame is modified in place and returned.

    In:
    reward | ...
    -10
    0
    10

    Out:
    p1_has_won | ...
    0
    0
    1
    """
    def _p1_has_won(reward):
        # A reward of exactly 0 does not count as a win.
        if reward > 0:
            return 1
        return 0

    merged_df["p1_has_won"] = merged_df["reward"].apply(_p1_has_won)
    return merged_df

def build_custom_features(synthetic_game_data: pd.DataFrame, features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run every processing step, in order, and return the feature store.

    1. add the three point totals (BR, tierce, A&D) to `features_df`, in place
    2. put the game columns and the features side by side
    3. one-hot encode the contract
    4. add the binary `p1_has_won` target
    """
    point_total_steps = (
        feature_total_BR_points,
        feature_total_tierce_points,
        feature_total_AnD_points,  # needs the two totals computed before it
    )
    for add_total in point_total_steps:
        features_df = add_total(features_df)

    merged_df = merge_synthetic_and_features(synthetic_game_data, features_df)
    return categorize_reward(encode_contract(merged_df))

# Build the feature store from the notebook-level `features_df`, i.e. the one
# produced by whichever request cell ran last. The point-total columns are
# also added to `features_df` itself.
feature_store = build_custom_features(synthetic_game_data, features_df)

In [None]:
feature_store.head()

### Run a Unit Testing Suite

In [None]:
def features_df_inputs() -> pd.DataFrame:
    """Test input: the four per-suit BR columns, each holding 1, 2, 3."""
    suit_columns = (
        "has_BR_at_clubs",
        "has_BR_at_diamonds",
        "has_BR_at_hearts",
        "has_BR_at_spades",
    )
    return pd.DataFrame({column: [1, 2, 3] for column in suit_columns})


def features_df_outputs() -> pd.DataFrame:
    """Expected result: the four input columns plus their row-wise total."""
    suit_columns = (
        "has_BR_at_clubs",
        "has_BR_at_diamonds",
        "has_BR_at_hearts",
        "has_BR_at_spades",
    )
    expected = pd.DataFrame({column: [1, 2, 3] for column in suit_columns})
    expected["total_BR_points"] = [4, 8, 12]
    return expected


def test_custom_feature_total_br_points_works_correctly(features_df_inputs, features_df_outputs):
    """
    Check that feature_total_BR_points turns the input frame into the expected one.

    Both arguments are DataFrames (the values built by the helpers of the
    same name), not the helper functions themselves.
    """
    actual = feature_total_BR_points(features_df_inputs)
    pd.testing.assert_frame_equal(left=features_df_outputs, right=actual)
    

# The input/expected frames are built by hand and passed in directly.
test_custom_feature_total_br_points_works_correctly(features_df_inputs(), features_df_outputs())

# Reached only when the check above did not raise.
print("All the tests in the suite have run properly")

In [None]:
# Persist the feature store as CSV, without writing the index as a column.
feature_store.to_csv(PATH_TO_FEATURE_STORE, index=False)

### Push the codebase to GH Repository

For the next class (model development) we will need the codebase to be available on GH. 

Follow this <a> GitHub Lab </a> to set this up.

### [Optional] Assignment 1 - Google Big Query Lab

Let's take this party to <a>Google Big Query</a> for the Analytics heavy lifting!

Follow the lab and generate the following analysis:
- Average reward by initial A&D points
- Average reward by initial cards in suit
- Average reward by starting position
- Average reward by bidding position
- Reward vs total points
- Average spades A&D points by number of cards in spades

### [Optional] Assignment 2 - Best Practices Lab

Let's adopt good coding practices early on. This ensures the readability of the scripts we write, which translates into more efficiency when working in a team.

Let's head over to <a>Best Practices Tools</a> for a more detailed review of the subject.

#### Credit

Note:
This content has been developed by Sean Ariel. It is a practical training that cannot be copied, reproduced, or distributed without the explicit consent of the author. © Sean Ariel