# DataOps with ClearML Pt.I 
## (ClearSHOW S02E05)
Simple feature engineering with ClearML as a data store. 


In [3]:
# normal imports
from typing import Optional
import pandas as pd
import numpy as np


## First things first, init a task on the project.

In [6]:
from clearml import Task, Dataset

# Register this notebook run as a ClearML task inside the demo project.
task = Task.init(
    project_name="titanic_demo",
    task_name="demo_dataset_access",
)


ClearML Task: overwriting (reusing) task id=b10b4ee8936e4034a8d15ebee5bd71c6
ClearML results page: https://app.community.clear.ml/projects/955d83fee7564b88a595180d098d03d4/experiments/b10b4ee8936e4034a8d15ebee5bd71c6/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


#### That's our famous original 2-LOC integration :) 
## Now let's get the dataset from our datastore
Refer to S02E04 to see how we downloaded it from Kaggle and stored it there (a.k.a. clearml-data rul3z!).

In [7]:
# Look up the stored Titanic dataset, then obtain a local copy of its files.
tdata = Dataset.get(
    dataset_project="titanic_demo/dataset",
    dataset_name="titanic",
)
# Path of the folder holding the local copy (used to build the CSV path later on).
tdata_folder = tdata.get_local_copy()


### The dataset has been downloaded to the local machine and can now be loaded.
Don't forget to use pandas to read the CSV ;)

# Real stuff starts here 
Now we want to do some cleaning and feature engineering, as suggested by the top Kaggle kernels for this data
(links below).

In [32]:
def extract_title(in_name_series: pd.Series) -> pd.Series:
    """Extract the title (e.g. "Mr", "Miss") from each passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. "Braund, Mr. Owen Harris" -> "Mr".

    :param in_name_series: Series of name strings.
    :return: Series of extracted titles; entries with no match are NaN.
    """
    # Raw string: "\." in a regular literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return in_name_series.str.extract(r" ([A-Za-z]+)\.", expand=False)

def make_fixed_title_series(
    in_title_series: pd.Series, override_rare_list: Optional[list] = None
) -> pd.Series:
    """Normalise raw titles into a small, consistent set.

    Titles in the rare list become "Rare"; afterwards "Mlle" and "Ms" become
    "Miss", and "Mme" becomes "Mrs". All other values are left untouched.

    :param in_title_series: Series of raw title strings.
    :param override_rare_list: Optional list of titles to treat as rare,
        used instead of the built-in list when given.
    :return: A new Series with the replacements applied.
    """
    if override_rare_list is None:
        rare_titles = [
            "Lady",
            "Countess",
            "Capt",
            "Col",
            "Don",
            "Dr",
            "Major",
            "Rev",
            "Sir",
            "Jonkheer",
            "Dona",
        ]
    else:
        rare_titles = override_rare_list

    # Order matters: rare titles are collapsed first, then spelling variants.
    return (
        in_title_series.replace(rare_titles, "Rare")
        .replace(["Mlle", "Ms"], "Miss")
        .replace("Mme", "Mrs")
    )

def map_title(title_series: pd.Series, mapping: dict) -> pd.Series:
    """Translate title strings into their category codes.

    :param title_series: Series of title strings.
    :param mapping: Dict of code -> title; it is inverted here to look up
        the code for each title.
    :return: Series of codes. Titles missing from ``mapping`` (including
        missing values) get code 0.
    """
    code_by_title = {title: code for code, title in mapping.items()}
    # Unknown titles shouldn't happen, but fall back to 0 if they do.
    # Defaulting inside the lookup (rather than map + fillna) avoids the
    # intermediate NaN that would turn integer codes into floats.
    return title_series.map(lambda title: code_by_title.get(title, 0))

In [14]:
# Load the training CSV from the local dataset folder and preview the first rows.
train_csv_path = tdata_folder+'/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
# Category code -> title label used to encode the "Title" feature.
num_to_title = {
    1: "Mr",
    2: "Miss",
    3: "Mrs",
    4: "Master",
    5: "Rare",
}
# Publish the mapping as a task artifact so it can be viewed and retrieved later on.
task.upload_artifact("key_category_to_title", num_to_title)


def create_categorical_title(
    name_series: pd.Series, num_to_title_mapping: dict
) -> pd.Series:
    """Build the categorical "title" feature from raw passenger names.

    Extracts the title from each name, normalises it, maps it to its code
    using ``num_to_title_mapping`` (code -> title), and casts the result to
    the pandas ``category`` dtype.

    :param name_series: Series of passenger name strings.
    :param num_to_title_mapping: Dict of code -> title.
    :return: Categorical Series of title codes.
    """
    normalized_titles = make_fixed_title_series(extract_title(name_series))
    title_codes = map_title(normalized_titles, num_to_title_mapping)
    return title_codes.astype("category")


train_df["Title"] = create_categorical_title(train_df["Name"], num_to_title)
# Sanity check: mean survival rate per title category.
# "Title" is categorical, so `observed` is passed explicitly: relying on the
# default emits a FutureWarning in recent pandas because the default is changing.
# observed=False keeps the behaviour this cell has always had.
train_df[["Title", "Survived"]].groupby(
    ["Title"], as_index=False, observed=False
).mean()


Unnamed: 0,Title,Survived
0,1,0.156673
1,2,0.702703
2,3,0.793651
3,4,0.575
4,5,0.347826


# Now let's create a binary diff over the original dataset which contains the new feature
# step 1 - create the feature set


In [41]:
# Create a new dataset version whose parent is the original Titanic dataset.
# NOTE(review): this project is 'titanic_example/...' while the source dataset
# was fetched from 'titanic_demo/...' — confirm the different project is intended.
feature_set = Dataset.create(
    'titanic_with_category_title',
    dataset_project='titanic_example/FeatureStore',
    parent_datasets=[tdata.id],
)

In [43]:
# get_mutable_local_copy() has a required `target_folder` parameter, so calling
# it with no arguments raises TypeError. Use a dedicated folder for the writable
# copy; overwrite=True clears that folder first so the cell can be re-run.
FEATURE_SET_LOCAL_DIR = "./titanic_with_category_title_local"
new_dataset = feature_set.get_mutable_local_copy(
    target_folder=FEATURE_SET_LOCAL_DIR,
    overwrite=True,
)


Signature:
feature_set.get_mutable_local_copy(
    target_folder,
    overwrite=False,
    raise_on_error=True,
)
Docstring:
return a base folder with a writable (mutable) local copy of the entire dataset
    download and copy / soft-link, files from all the parent dataset versions

:param target_folder: Target folder for the writable copy
:param overwrite: If True, recursively delete the target folder before creating a copy.
    If False (default) and target folder contains files, raise exception or return None
:param raise_on_error: If True raise exception if dataset merging failed on any file
:return: A the target folder containing the entire dataset
[0;31mFile:[0m      ~/work/events/venv_demos/lib/p