# DataOps with ClearML Pt.I 
## (ClearSHOW S02E05)
Simple feature engineering with ClearML as a data store. 


In [None]:
# normal imports
from typing import Optional
import pandas as pd

## First things first, init a task on the project.
#### That's our famous original 2-LOC integration, now with subprojects :)

In [None]:
from clearml import Task, Dataset

# Register this notebook run as a ClearML task named "feature_set_2".
# The "/" in project_name puts it in the "FeatureStore" subproject of
# "titanic_demo".
task = Task.init(project_name="titanic_demo/FeatureStore", task_name="feature_set_2")


## Now let's get the dataset from our datastore
Refer to S02E04 to see how we downloaded it from Kaggle and put it there. AKA clearml-data rul3z!!1

In [None]:
# Look up the dataset by project only (no dataset name or id is given).
tdata = Dataset.get(dataset_project="titanic_demo/dataset")
# Fetch a local copy and keep its folder path; train.csv is read from it below.
tdata_folder = tdata.get_local_copy()


### The dataset has been downloaded to the local machine and can now be loaded.
#### Don't forget to use pandas to read the csv ;) 

# Real stuff starts here 
Now we want to do some cleaning and feature engineering, [as suggested by](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy) the top
[Kaggle kernels](https://www.kaggle.com/startupsci/titanic-data-science-solutions)
for this data.

## The following defs are meant to look like you copy-pasted them from someone else :) 


In [None]:
def extract_title(in_name_series: pd.Series) -> pd.Series:
    """Extract the honorific title from each passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen Harris"``
    gives ``"Mr"``. Names that contain no such pattern yield a missing value.

    Args:
        in_name_series: Series of passenger name strings.

    Returns:
        A Series (same index as the input) holding the extracted title.
    """
    # Raw string on purpose: "\." inside a plain string literal is an invalid
    # escape sequence, which newer Python versions warn about.
    return in_name_series.str.extract(r" ([A-Za-z]+)\.", expand=False)

def make_fixed_title_series(
    in_title_series: pd.Series, override_rare_list: Optional[list] = None
) -> pd.Series:
    """Collapse raw passenger titles into a small canonical set.

    Every title found in the rare list becomes ``"Rare"``; afterwards
    ``"Mlle"`` and ``"Ms"`` become ``"Miss"`` and ``"Mme"`` becomes ``"Mrs"``.
    All other values are passed through unchanged.

    Args:
        in_title_series: Series of raw title strings.
        override_rare_list: Titles to treat as rare instead of the built-in
            list. ``None`` selects the built-in list.

    Returns:
        A new Series with the replacements applied.
    """
    if override_rare_list is not None:
        rare_titles = override_rare_list
    else:
        rare_titles = [
            "Lady",
            "Countess",
            "Capt",
            "Col",
            "Don",
            "Dr",
            "Major",
            "Rev",
            "Sir",
            "Jonkheer",
            "Dona",
        ]

    # The three replacements run in this order: rare titles first, then the
    # French/alternate spellings.
    return (
        in_title_series.replace(rare_titles, "Rare")
        .replace(["Mlle", "Ms"], "Miss")
        .replace("Mme", "Mrs")
    )

def map_title(title_series: pd.Series, mapping: dict) -> pd.Series:
    """Encode title strings as the keys of ``mapping``.

    ``mapping`` is supplied as key -> title; it is inverted here so that each
    title in the series is replaced by its key.

    Args:
        title_series: Series of title strings.
        mapping: Dict whose values are the titles and whose keys are the
            codes to emit.

    Returns:
        A Series of codes; titles not present in ``mapping`` become ``0``.
    """
    title_to_key = dict(zip(mapping.values(), mapping.keys()))
    # Unmapped titles are not expected, but they come back as missing values
    # from the lookup, so they are filled with 0.
    return title_series.map(title_to_key).fillna(0)

# Create a categorical column for the passenger titles and document your work

In [None]:
# Load train.csv from the local dataset copy into a DataFrame.
train_df = pd.read_csv(tdata_folder+'/train.csv')
# Last expression of the cell: displays the first rows as a quick visual check.
train_df.head()

In [None]:
# Integer code -> canonical title. It is passed to create_categorical_title
# below, which encodes each passenger title as one of these codes.
num_to_title = {1: "Mr", 2: "Miss", 3: "Mrs", 4: "Master", 5: "Rare"}
# Module-level switch read inside create_categorical_title: when True the
# encoded column is converted to the pandas "category" dtype.
Use_actual_cat = False

def create_categorical_title(
    name_series: pd.Series, num_to_title_mapping: dict
) -> pd.Series:
    """Turn passenger names into an encoded title column.

    Runs the three helpers in sequence: extract the title from each name,
    normalise it, then encode it with ``num_to_title_mapping``.

    Args:
        name_series: Series of passenger name strings.
        num_to_title_mapping: Dict of code -> title used for the encoding.

    Returns:
        The encoded titles; converted to the ``"category"`` dtype when the
        module-level ``Use_actual_cat`` flag is truthy.
    """
    encoded = map_title(
        make_fixed_title_series(extract_title(name_series)),
        num_to_title_mapping,
    )
    # Use_actual_cat is looked up at call time, not at definition time.
    return encoded.astype("category") if Use_actual_cat else encoded

# Add the encoded title as a new "Title" column, derived from the "Name" column.
train_df["Title"] = create_categorical_title(train_df["Name"], num_to_title)




## Make sure you log some of your process in the task itself:

In [None]:
# expose the mapping so that it can be viewed and grabbed later on
task.upload_artifact("key_category_to_title", num_to_title)
# 
sanity_check = train_df[["Title", "Survived"]].groupby(["Title"], as_index=False).mean()
sanity_check = sanity_check.set_index('Title', drop=True)
task.logger.report_table('survival','categorical',table_plot=sanity_check)

# Now let's create a binary diff over the original dataset which contains the new feature
# step 1 - create the feature set


In [None]:
# Create a new dataset whose parent is the original dataset (tdata), attached
# to the current task via use_current_task=True.
# NOTE(review): dataset_project here is 'titanic_example/FeatureStore', while
# Task.init and Dataset.get earlier in this notebook use 'titanic_demo/...'.
# Confirm whether the different project name is intentional and what effect it
# has when use_current_task=True.
with_feature = Dataset.create('name does not matter - the task is the feature',
                              dataset_project='titanic_example/FeatureStore',
                              parent_datasets=[tdata.id],
                              use_current_task=True)   # This boolean is the main point actually!!!

In [None]:
from tempfile import mkdtemp
# Get a writable copy of the dataset inside a freshly created temp directory.
# NOTE(review): nothing in this cell removes the directory made by mkdtemp();
# clean it up afterwards if that matters.
new_folder = with_feature.get_mutable_local_copy(mkdtemp())
print(f'new_folder is:{new_folder}')
# Overwrite train.csv with the new train df (with the added "Title" column).
train_df.to_csv(new_folder+'/train.csv', index=False)
# Sync the dataset with the modified folder, then upload and finalize it.
with_feature.sync_folder(new_folder)
with_feature.upload()
with_feature.finalize()



# Wait, is that it?!
