In [580]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [581]:
from dukto.pipe import Pipe
from dukto.processor import ColProcessor, MultiColProcessor, Transformer
import pandas as pd
import numpy as np

In [582]:
# Load the raw UFC fight table; the first CSV column is used as the row index.
data  = pd.read_csv('data/ufc.csv', index_col=0)

## ColProcessor
### Applies one or more functions to one or more columns

In [583]:
def convert_foot_to_cm(r):
    """Convert a feet-and-inches height string to centimetres.

    Parameters
    ----------
    r : object
        A cell value such as ``6'2"`` or ``5' 11"``. A bare feet value with no
        inches part (``6'``) is read as zero inches.

    Returns
    -------
    float
        The height in centimetres, or ``np.nan`` when ``r`` is not a string
        containing a feet marker (``'``).

    Raises
    ------
    ValueError
        If the feet or inches part is present but is not an integer, or the
        string contains more than one ``'``.
    """
    if not (isinstance(r, str) and "'" in r):
        return np.nan
    feet_part, inches_part = r.split("'")
    inches_part = inches_part.replace('"', '').strip()
    # "6'" has nothing after the feet marker; int('') would raise, so use 0.
    extra_inches = int(inches_part) if inches_part else 0
    total_inches = int(feet_part) * 12 + extra_inches
    return total_inches * 2.54

def convert_inch_to_cm(r):
    """Convert an inches string such as ``70"`` to centimetres.

    Returns ``np.nan`` for anything that is not a string containing the
    inch marker (``"``). The numeric part must be an integer.
    """
    if not isinstance(r, str) or '"' not in r:
        return np.nan
    whole_inches = int(r.replace('"', ''))
    return whole_inches * 2.54

def num_of_num_to_perc(r):
    """Turn an ``"X of Y"`` string into the ratio ``X / Y``.

    Despite the name, the result is a fraction (``'50 of 100'`` -> ``0.5``),
    not a percentage. Returns ``np.nan`` when ``r`` is not a string containing
    ``of`` or when ``Y`` is not positive.
    """
    if not isinstance(r, str) or 'of' not in r:
        return np.nan
    numerator, denominator = (int(part) for part in r.split('of'))
    if denominator <= 0:
        # Nothing attempted: the ratio is undefined.
        return np.nan
    return numerator / denominator

def pounds_to_kg(r):
    """Convert a weight string such as ``155 lbs.`` to kilograms.

    Parameters
    ----------
    r : object
        A cell value. Only strings containing ``lbs`` are converted; the number
        is the text before the first space and may be fractional.

    Returns
    -------
    float or object
        The weight in kilograms, or ``r`` itself, unchanged, when it is not a
        string containing ``lbs``.

    Raises
    ------
    ValueError
        If the text before the first space is not a number.
    """
    if isinstance(r, str) and 'lbs' in r:
        # 1 lb is defined as exactly 0.45359237 kg.
        return float(r.split(' ')[0]) * 0.45359237
    return r

In [584]:
# Column-wise cleaning steps: each ColProcessor applies `funcs` to the listed
# columns. Judging by the result table, outputs land in columns named with
# `suffix` appended; `funcs_test` maps a sample input to its expected output.
single_pipe = [
    # Heights like 6'2" -> centimetres.
    ColProcessor(
        name=['agg_height_first', 'agg_height_second'],
        funcs=[convert_foot_to_cm],
        funcs_test={"6'2\"": 187.96},
        suffix='_new',
    ),
    # Reaches like 70" -> centimetres.
    ColProcessor(
        name=['agg_reach_first', 'agg_reach_second'],
        funcs=[convert_inch_to_cm],
        funcs_test={'70"': 177.80},
        suffix='_new',
    ),
    # "X of Y" strike counts -> ratio X / Y.
    ColProcessor(
        name=['second_total_str', 'first_total_str'],
        funcs=[num_of_num_to_perc],
        suffix='_%%_new',
        funcs_test={'50 of 100': 0.5},
    ),
    # Parse date strings; no suffix is given for this step.
    ColProcessor(
        name=['agg_dob_first', 'agg_dob_second', 'date_card'],
        funcs=[pd.to_datetime],
    ),
    # Weight in lbs -> kg, published as `weight_class` + suffix.
    # NOTE(review): drop=True presumably removes the source column - confirm.
    ColProcessor(
        name='agg_weight_first',
        new_name={"agg_weight_first": 'weight_class'},
        funcs=[pounds_to_kg],
        suffix='_new',
        drop=True,
    ),
]

## MultiColProcessor

## Applies one or more functions that each take and return a DataFrame.
## Use this class to add columns derived from one or more other columns.

In [585]:
def add_ages(df):
    """Add each fighter's age on fight night as a timedelta column.

    Writes ``first_fighter_age_new`` and ``second_fighter_age_new`` into ``df``
    (in place) as ``date_card`` minus the matching date-of-birth column, and
    returns the same frame.
    """
    age_sources = (
        ('first_fighter_age_new', 'agg_dob_first'),
        ('second_fighter_age_new', 'agg_dob_second'),
    )
    for age_col, dob_col in age_sources:
        df[age_col] = df['date_card'] - df[dob_col]
    return df

def ages_in_years(df):
    """Convert the two fighter-age timedelta columns to fractional years.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain timedelta columns ``first_fighter_age_new`` and
        ``second_fighter_age_new`` (as produced by ``add_ages``).

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with both columns as floats.
    """
    # One average Gregorian year: 365.2425 days = 31,556,952 seconds. This is
    # the length the previous np.timedelta64(1, 'Y') divisor stood for; newer
    # pandas rejects the ambiguous 'Y' unit, so the duration is spelled out.
    one_year = np.timedelta64(31556952, 's')
    # Whole-column division replaces the element-wise applymap, which is
    # deprecated in recent pandas.
    for age_col in ('first_fighter_age_new', 'second_fighter_age_new'):
        df[age_col] = df[age_col] / one_year
    return df

In [586]:
# Frame-level steps: derive the age columns, then express them in years.
# The functions run in the order listed.
multi_pipe = [
    MultiColProcessor(
        name=['first_fighter_age_new', 'second_fighter_age_new'],
        funcs=[add_ages, ages_in_years],
    ),
]

## Transformer

### Applies a feature_engine-style transformer to one or more columns

In [608]:
# res[[i for i in res.columns if 'new' in i]]

In [None]:
# NOTE(review): leftover scratch cell that was never executed. assign() with no
# arguments only returns an unmodified copy of `data`; consider deleting it.
data.assign()

In [621]:
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
# add school name so you can encode it

def new_cols_func(x):
    """Pick the labels in ``x`` that contain 'new' but not 'weight'."""
    return [label for label in x if 'new' in label and 'weight' not in label]

# Transformer steps for the engineered ('new') columns.
trans_pipe = [
    # Impute every 'new' column except the weight one (chosen by new_cols_func).
    Transformer(
        name_from_func=new_cols_func,
        transformers=[MeanMedianImputer],
    ),
    # Cast the weight class to str before the categorical transformers below.
    # NOTE(review): astype(str) turns missing values into the literal 'nan',
    # so CategoricalImputer may have nothing left to impute - confirm intended.
    MultiColProcessor(
        funcs=[
            lambda frame: frame.assign(
                weight_class_new=frame.weight_class_new.astype(str)
            )
        ],
    ),
    Transformer(
        name=['weight_class_new'],
        transformers=[CategoricalImputer, CountFrequencyEncoder],
    ),
]

In [622]:
# Full pipeline: column cleaners, then derived columns, then transformers.
pipeline = [*single_pipe, *multi_pipe, *trans_pipe]

In [623]:
trans_pipe

[Transformer(), MultiColProcessor(), Transformer()]

In [624]:
# Bind the raw data to the assembled steps. run_test_cases=True presumably makes
# run() check each processor's funcs_test (see the PASSED / NOT FOUND lines in
# the run output) - confirm against the dukto docs.
pipe = Pipe(data=data, pipeline=pipeline, run_test_cases=True)

In [625]:
# Execute the pipeline; `res` is the processed table inspected in the next cell.
res = pipe.run()

ColProcessor (agg_height_first, agg_height_second) test cases PASSED! 😎
ColProcessor (agg_reach_first, agg_reach_second) test cases PASSED! 😎
ColProcessor (second_total_str, first_total_str) test cases PASSED! 😎
ColProcessor (agg_dob_first, agg_dob_second, date_card) test cases NOT FOUND.
ColProcessor (agg_weight_first)             test cases NOT FOUND.
not implemented yet
not implemented yet


In [630]:
# Show only the engineered columns (those whose label contains 'new').
res.loc[:, [label for label in res.columns if 'new' in label]]

Unnamed: 0,agg_height_first_new,agg_height_second_new,agg_reach_first_new,agg_reach_second_new,second_total_str_%%_new,first_total_str_%%_new,weight_class_new,first_fighter_age_new,second_fighter_age_new
0,193.04,193.04,213.36,195.58,0.452471,0.629412,454,32.559190,30.119715
1,165.10,175.26,167.64,172.72,0.397059,0.695122,382,31.923996,31.113575
2,195.58,182.88,203.20,187.96,0.666667,0.636364,96,28.063547,26.155226
3,172.72,170.18,177.80,180.34,0.547009,0.391892,614,28.978008,28.509826
4,190.50,177.80,200.66,185.42,0.805195,0.465517,31,35.001403,36.534631
...,...,...,...,...,...,...,...,...,...
5918,154.94,160.02,157.48,162.56,0.469880,0.541667,184,27.858204,33.054751
5919,182.88,185.42,195.58,190.50,0.465753,0.398601,1012,27.223009,28.570060
5920,193.04,177.80,200.66,182.88,0.792880,0.537634,96,31.976016,35.294359
5921,172.72,182.88,175.26,177.80,0.636364,0.500000,897,32.367537,32.186835


In [561]:
def avg_grade(df):
    """Add ``avg_grade``: the row-wise mean of the chem, phy and bio grades.

    The column is written into ``df`` in place and the same frame is returned.
    """
    grade_columns = ['chem_grade', 'phy_grade', 'bio_grade']
    row_means = df[grade_columns].mean(axis=1)
    df['avg_grade'] = row_means
    return df

# NOTE(review): this rebinds `multi_pipe` from the age example earlier in the
# notebook, and the cell carries an older execution count - likely a leftover
# from a different (grades) dataset; confirm before relying on it.
multi_pipe = [
    MultiColProcessor(
        funcs=[avg_grade],
        funcs_test={},
    ),
]

In [562]:
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
# add school name so you can encode it

def new_cols_func(x):
    """Pick the labels in ``x`` that contain 'new' but not 'weight'."""
    return [label for label in x if 'new' in label and 'weight' not in label]

# Transformer steps: impute the non-weight 'new' columns, then impute and
# count-frequency encode the weight class.
transformers = [
    Transformer(
        name_from_func=new_cols_func,
        transformers=[MeanMedianImputer],
    ),
    Transformer(
        name=['weight_class_new'],
        transformers=[CategoricalImputer, CountFrequencyEncoder],
    ),
]

In [563]:
# Concatenate the three step lists into one ordered pipeline.
all_pipes = [*single_pipe, *multi_pipe, *transformers]