In [649]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [650]:
from dukto.pipe import Pipe
from dukto.processor import ColProcessor, MultiColProcessor, Transformer
import pandas as pd
import numpy as np
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer

In [651]:
# Load the UFC dataset; the first CSV column is used as the row index.
data = pd.read_csv('data/ufc.csv', index_col=0)

# ColProcessor
### Applies one or more functions to one or more columns

In [652]:
def convert_foot_to_cm(r):
    """Convert a feet-and-inches height string to centimetres.

    The input is expected to look like 6'2 followed by a double quote.
    Any value that is not a string containing an apostrophe yields ``np.nan``.
    """
    if not (isinstance(r, str) and "'" in r):
        return np.nan
    feet_part, inch_part = r.split("'")
    # Fold feet into inches first, then apply the inch -> cm factor once.
    total_inches = int(feet_part) * 12 + int(inch_part.replace('"', ''))
    return total_inches * 2.54

def convert_inch_to_cm(r):
    """Convert an inch measurement string (digits followed by a double quote) to centimetres.

    Any value that is not a string containing a double quote yields ``np.nan``.
    """
    has_inch_mark = isinstance(r, str) and '"' in r
    if not has_inch_mark:
        return np.nan
    inch_count = int(r.replace('"', ''))
    return inch_count * 2.54

def num_of_num_to_perc(r):
    """Turn an ``"X of Y"`` string into the ratio ``X / Y``.

    The result is a fraction (``'50 of 100'`` gives ``0.5``), not a value
    scaled to 0-100. ``np.nan`` is returned when the input is not a string
    containing ``'of'`` or when ``Y`` is not positive.
    """
    if not isinstance(r, str) or 'of' not in r:
        return np.nan
    numerator, denominator = (int(part) for part in r.split('of'))
    # Guard against division by zero (e.g. "0 of 0").
    return numerator / denominator if denominator > 0 else np.nan

def pounds_to_kg(r):
    """Convert a weight string such as ``"155 lbs."`` to kilograms.

    Parameters
    ----------
    r : object
        Cell value. Only strings containing ``'lbs'`` are converted; the
        leading whitespace-separated token must be an integer.

    Returns
    -------
    float or object
        The weight in kilograms for convertible strings. Any other value
        (including missing values) is returned unchanged.

    Raises
    ------
    ValueError
        If the string contains ``'lbs'`` but its first token is not an integer.
    """
    # Exact definition of the avoirdupois pound; the previous 0.4535 was a
    # truncated factor that under-reported every weight by ~0.02 %.
    kg_per_pound = 0.45359237
    if isinstance(r, str) and 'lbs' in r:
        # split() with no argument tolerates leading/repeated whitespace.
        return int(r.split()[0]) * kg_per_pound
    return r

In [653]:
# Column-wise cleaning steps. Where given, funcs_test pairs a sample input
# with the value the step's funcs are expected to produce for it.
single_pipe = [
    # Heights: feet/inches strings -> centimetres.
    ColProcessor(
        name=['agg_height_first', 'agg_height_second'],
        funcs=[convert_foot_to_cm],
        funcs_test={"6'2\"": 187.96},
        suffix='_new',
    ),

    # Reach: inch strings -> centimetres.
    ColProcessor(
        name=['agg_reach_first', 'agg_reach_second'],
        funcs=[convert_inch_to_cm],
        funcs_test={'70"': 177.80},
        suffix='_new',
    ),

    # Strike totals: "X of Y" strings -> ratio X / Y.
    ColProcessor(
        name=['second_total_str', 'first_total_str'],
        funcs=[num_of_num_to_perc],
        funcs_test={'50 of 100': 0.5},
        suffix='_%%_new',
    ),

    # Date columns are parsed with pandas; no suffix is given for this step.
    ColProcessor(
        name=['agg_dob_first', 'agg_dob_second', 'date_card'],
        funcs=[pd.to_datetime],
    ),

    # Weight: converted to kg and renamed; with the suffix the result shows up
    # as 'weight_class_new'. drop=True presumably removes the source column
    # (confirm against ColProcessor).
    ColProcessor(
        name='agg_weight_first',
        new_name={"agg_weight_first": 'weight_class'},
        funcs=[pounds_to_kg],
        suffix='_new',
        drop=True,
    ),
]

## MultiColProcessor

## Applies one or more functions that each take and return a pandas DataFrame.
## Use this class to add columns derived from other columns.

In [654]:
def add_ages(df):
    """Add each fighter's age at the card date as a timedelta column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_card``, ``agg_dob_first`` and ``agg_dob_second``
        in a dtype that supports subtraction (e.g. datetime64).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``first_fighter_age_new`` and
        ``second_fighter_age_new`` holding ``date_card - date of birth``.
        The input frame is left unmodified.
    """
    # assign() builds a new frame instead of writing into the caller's
    # DataFrame, so re-running the step never sees half-processed data.
    return df.assign(
        first_fighter_age_new=df['date_card'] - df['agg_dob_first'],
        second_fighter_age_new=df['date_card'] - df['agg_dob_second'],
    )

def ages_in_years(df):
    """Convert the two timedelta age columns to fractional years.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``first_fighter_age_new`` and ``second_fighter_age_new``
        as timedelta values.

    Returns
    -------
    pandas.DataFrame
        A new frame in which both age columns are floats expressed in years.
        Missing timedeltas become NaN. The input frame is left unmodified.
    """
    age_cols = ['first_fighter_age_new', 'second_fighter_age_new']
    # One mean Gregorian year (365.2425 days) spelled out in seconds. This
    # avoids the calendar-ambiguous np.timedelta64(1, 'Y') unit, which recent
    # pandas releases reject, while keeping the same year length.
    one_year = pd.Timedelta(seconds=31556952)
    # Whole-column division replaces the element-wise applymap call.
    return df.assign(**{col: df[col] / one_year for col in age_cols})

In [655]:
# Frame-level step: derive the two age columns, then express them in years.
multi_pipe = [
    MultiColProcessor(
        funcs=[add_ages, ages_in_years],
        name=['first_fighter_age_new', 'second_fighter_age_new'],
    ),
]

## Transformer

### Applies feature_engine-style transformers to one or more columns

In [656]:

def new_cols_func(x):
    """Return the names in ``x`` that contain 'new' but not 'weight'."""
    selected = []
    for column in x:
        if 'new' not in column or 'weight' in column:
            continue
        selected.append(column)
    return selected

# Transformer steps applied after the column-level and frame-level processing.
trans_pipe = [
    # Impute every column selected by new_cols_func.
    Transformer(
        transformers=[MeanMedianImputer],
        name_from_func=new_cols_func,
    ),

    # Cast weight_class_new to str before the categorical transformers below
    # (presumably so they treat it as a category - confirm).
    MultiColProcessor(
        funcs=[lambda frame: frame.assign(weight_class_new=frame.weight_class_new.astype(str))],
    ),

    Transformer(
        transformers=[CategoricalImputer, CountFrequencyEncoder],
        name=['weight_class_new'],
    ),
]

In [657]:
# Full pipeline: column-level steps, then frame-level steps, then transformers.
pipeline = [*single_pipe, *multi_pipe, *trans_pipe]

In [658]:
pipeline

[ColProcessor(agg_height_first, agg_height_second),
 ColProcessor(agg_reach_first, agg_reach_second),
 ColProcessor(second_total_str, first_total_str),
 ColProcessor(agg_dob_first, agg_dob_second, date_card),
 ColProcessor(agg_weight_first),
 MultiColProcessor(first_fighter_age_new, second_fighter_age_new),
 Transformer(),
 MultiColProcessor(),
 Transformer()]

In [659]:
# Bind the raw data to the step list; run_test_cases=True asks for the
# funcs_test examples to be checked when the pipeline runs.
pipe = Pipe(
    data=data,
    pipeline=pipeline,
    run_test_cases=True,
)

In [660]:
# Execute every step in order; the per-step test-case report is printed below.
# The result is indexed like a DataFrame in the cells that follow.
res = pipe.run()

ColProcessor (agg_height_first, agg_height_second) test cases PASSED! 😎
ColProcessor (agg_reach_first, agg_reach_second) test cases PASSED! 😎
ColProcessor (second_total_str, first_total_str) test cases PASSED! 😎
ColProcessor (agg_dob_first, agg_dob_second, date_card) test cases NOT FOUND.
ColProcessor (agg_weight_first)             test cases NOT FOUND.
Multi test not implemented yet
transformer test not implemented yet
Multi test not implemented yet
transformer test not implemented yet


# After

In [666]:
# Preview only the columns whose names contain 'new'.
res[
    [column for column in res.columns if 'new' in column]
].head(3)

Unnamed: 0,agg_height_first_new,agg_height_second_new,agg_reach_first_new,agg_reach_second_new,second_total_str_%%_new,first_total_str_%%_new,weight_class_new,first_fighter_age_new,second_fighter_age_new
0,193.04,193.04,213.36,195.58,0.452471,0.629412,454,32.55919,30.119715
1,165.1,175.26,167.64,172.72,0.397059,0.695122,382,31.923996,31.113575
2,195.58,182.88,203.2,187.96,0.666667,0.636364,96,28.063547,26.155226


# Before

In [665]:
data.head(3)

Unnamed: 0,agg_dob_first,agg_dob_second,agg_height_first,agg_height_second,agg_reach_first,agg_reach_second,agg_stand_first,agg_stand_second,agg_str_acc_first,agg_str_acc_second,...,date_card,first_fighter_res,first_sig_str_,first_sig_str_percentage,first_total_str,method,second_sig_str_percentage,second_total_str,time,type
0,19-Jul-87,26-Dec-89,"6' 4""","6' 4""","84""","77""",Orthodox,Southpaw,57%,50%,...,8-Feb-20,W,104 of 166,62%,107 of 170,Decision - Unanimous,44%,119 of 263,5:00,belt
1,7-Mar-88,28-Dec-88,"5' 5""","5' 9""","66""","68""",Southpaw,Orthodox,51%,35%,...,8-Feb-20,W,40 of 65,61%,57 of 82,KO/TKO,30%,27 of 68,1:03,belt
2,16-Jan-92,13-Dec-93,"6' 5""","6' 0""","80""","74""",Orthodox,Southpaw,55%,55%,...,8-Feb-20,L,7 of 11,63%,7 of 11,KO/TKO,66%,10 of 15,1:59,
