In [138]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [139]:
# Sample UFC fixture stored under tests/; used below for quick pipeline experiments.
sample_data = pd.read_csv('tests/data/test_ufc.csv')

In [154]:
# Scratch check: np.isnan returns False for an ordinary float.
np.isnan(24.354) 

False

In [160]:
# One-step pipeline: convert the weight column from "<n> lbs." strings to whole kilograms.
single_pipe = [
        ColProcessor(
            name="agg_weight_first",
            new_name={"agg_weight_first": "weight_class"},
            # pounds_to_kg runs first; the lambda then truncates to int and maps NaN to 0.
            # NOTE(review): np.isnan raises TypeError on non-numeric values — confirm
            # pounds_to_kg never passes a string through to this second step.
            funcs=[pounds_to_kg, lambda x:int(x) if not np.isnan(x) else 0],
            suffix="_new",
            drop=True,
        ),
    ]

In [161]:
# Run the one-step pipeline on the sample data.
# run_test_cases=False presumably skips the funcs_test checks — confirm against Pipe.
pipe = Pipe(data=sample_data, pipeline=single_pipe, run_test_cases=False)
res = pipe.run()

running <function pounds_to_kg at 0x00000189E85BF5E8> 
 0    205 lbs.
1         NaN
2    170 lbs.
3    155 lbs.
4    125 lbs.
Name: agg_weight_first, dtype: object
running <function <lambda> at 0x00000189E862CDC8> 
 0    92.9675
1        NaN
2    77.0950
3    70.2925
4    56.6875
Name: agg_weight_first, dtype: float64


In [162]:
# NOTE(review): this replaces the list of functions with a single callable (its first
# entry) on an already-built processor; confirm ColProcessor accepts a bare function,
# otherwise keep it a list (e.g. funcs[:1]).
single_pipe[0].funcs = single_pipe[0].funcs[0]

In [163]:
# Inspect the converted weight column (whole kilograms, missing values shown as 0).
res.weight_class_new

0     92
1      0
2     77
3     70
4     56
5     92
6     83
7     65
8     77
9     92
10    77
11    61
12    70
13    65
14    83
15    61
16    83
17    65
18     0
19    92
Name: weight_class_new, dtype: int64

In [71]:
from dukto.pipe import Pipe
from dukto.processor import ColProcessor, MultiColProcessor, Transformer
import pandas as pd
import numpy as np
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer

In [72]:
# Full UFC dataset; the first CSV column is used as the index.
data  = pd.read_csv('data/ufc.csv', index_col=0)

# ColProcessor
### Applies one or more functions to one or more columns

In [73]:
def convert_foot_to_cm(r):
    """Convert a height written as feet'inches" (e.g. ``6'2"``) to centimetres.

    Parameters
    ----------
    r : object
        Raw cell value. Only strings containing an apostrophe are parsed.

    Returns
    -------
    float
        Height in centimetres, or ``np.nan`` when ``r`` is not a string, has no
        apostrophe, or cannot be parsed as whole feet and whole inches.
    """
    if not (isinstance(r, str) and "'" in r):
        return np.nan
    feet_text, _, inches_text = r.partition("'")
    inches_text = inches_text.replace('"', '').strip()
    try:
        # A feet-only value such as "6'" is read as zero inches.
        total_inches = int(feet_text) * 12 + (int(inches_text) if inches_text else 0)
    except ValueError:
        # A malformed cell becomes a missing value instead of aborting the whole column.
        return np.nan
    return total_inches * 2.54

def convert_inch_to_cm(r):
    """Convert a length written in inches (e.g. ``70"``) to centimetres.

    Parameters
    ----------
    r : object
        Raw cell value. Only strings containing a double quote are parsed.

    Returns
    -------
    float
        Length in centimetres, or ``np.nan`` when ``r`` is not a string, has no
        double quote, or the text left after removing the quotes is not a number.
    """
    if not (isinstance(r, str) and '"' in r):
        return np.nan
    try:
        # float() accepts everything int() did and also fractional values like 70.5".
        inches = float(r.replace('"', ''))
    except ValueError:
        # A malformed cell becomes a missing value instead of aborting the whole column.
        return np.nan
    return inches * 2.54

def num_of_num_to_perc(r):
    """Turn a ``"<a> of <b>"`` string into the ratio ``a / b``.

    Despite the name, the result is a ratio (``'50 of 100'`` -> ``0.5``), not a
    value scaled to 100.

    Returns ``np.nan`` when ``r`` is not a string, does not contain ``'of'``,
    or when ``b`` is not positive.
    """
    if not isinstance(r, str) or 'of' not in r:
        return np.nan
    numerator, denominator = (int(part) for part in r.split('of'))
    # A non-positive denominator has no meaningful ratio.
    return numerator / denominator if denominator > 0 else np.nan

def pounds_to_kg(r):
    """Convert a weight string such as ``'205 lbs.'`` to kilograms.

    The number is taken from the text before the first space and multiplied
    by 0.4535. Any value that is not a string containing ``'lbs'`` is returned
    unchanged (so NaN and other non-matching cells pass straight through).
    """
    if not isinstance(r, str) or 'lbs' not in r:
        return r
    pounds = int(r.partition(' ')[0])
    return pounds * 0.4535

In [74]:
# Column-level cleaning steps. funcs_test presumably maps a sample input to the
# output expected from funcs — confirm against ColProcessor.
single_pipe = [
    # Heights written as feet'inches" -> centimetres.
    ColProcessor(name=['agg_height_first','agg_height_second'], 
                 funcs=[convert_foot_to_cm], funcs_test={"6'2\"":187.96}, suffix='_new'),
    
    # Reach written in inches -> centimetres.
    ColProcessor(name=['agg_reach_first','agg_reach_second'], 
                 funcs=[convert_inch_to_cm], funcs_test={'70"': 177.80}, suffix='_new'),
    
    # "<a> of <b>" strike counts -> ratio a / b.
    # NOTE(review): the suffix contains a doubled '%' — confirm that is intended.
    ColProcessor(name=['second_total_str', 'first_total_str'], 
                 funcs=[num_of_num_to_perc], suffix='_%%_new', funcs_test={'50 of 100':0.5}),
    
    # Parse the date columns with pd.to_datetime.
    ColProcessor(name=['agg_dob_first', 'agg_dob_second', 'date_card'], 
                 funcs=[pd.to_datetime]),
    
    # Weight in pounds -> kilograms, then int.
    # NOTE(review): int() raises on NaN, and the recorded output of this step shows
    # floats with NaN, so `int` does not appear to have been applied — confirm, and
    # consider a NaN-safe conversion here.
    ColProcessor(name='agg_weight_first', new_name={"agg_weight_first":'weight_class'}, 
                 funcs=[pounds_to_kg, int], suffix='_new', drop=True)
]

In [45]:
# Run the column-level pipeline on the sample data.
# run_test_cases=False presumably skips the funcs_test checks — confirm against Pipe.
pipe = Pipe(data=sample_data, pipeline=single_pipe, run_test_cases=False)
res = pipe.run()

In [46]:
# Inspect the converted weight column (kilograms; missing weights remain NaN).
res.weight_class_new

0     92.9675
1         NaN
2     77.0950
3     70.2925
4     56.6875
5     92.9675
6     83.8975
7     65.7575
8     77.0950
9     92.9675
10    77.0950
11    61.2225
12    70.2925
13    65.7575
14    83.8975
15    61.2225
16    83.8975
17    65.7575
18        NaN
19    92.9675
Name: weight_class_new, dtype: float64

## MultiColProcessor

## Applies a function that takes and returns a pandas DataFrame.
## Use this class to add columns derived from one or more existing columns.

In [20]:
def add_ages(df):
    """Return a copy of ``df`` with each fighter's age at the time of the card.

    Adds ``first_fighter_age_new`` and ``second_fighter_age_new``, computed as
    ``date_card`` minus the corresponding date-of-birth column. The input frame
    is left untouched, so re-running the pipeline always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_card``, ``agg_dob_first`` and ``agg_dob_second``.
        Assumes these were already parsed to datetimes upstream — TODO confirm.

    Returns
    -------
    pandas.DataFrame
        New frame with the two age columns appended (or overwritten if present).
    """
    return df.assign(
        first_fighter_age_new=df['date_card'] - df['agg_dob_first'],
        second_fighter_age_new=df['date_card'] - df['agg_dob_second'],
    )

def ages_in_years(df):
    """Return a copy of ``df`` with the two age columns expressed in years.

    ``first_fighter_age_new`` and ``second_fighter_age_new`` are expected to
    hold timedeltas; each is divided by one mean Gregorian year (365.2425 days).
    Missing timedeltas become NaN. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the two timedelta age columns.

    Returns
    -------
    pandas.DataFrame
        New frame with both age columns replaced by float years.
    """
    age_cols = ['first_fighter_age_new', 'second_fighter_age_new']
    # Explicit year length: 365.2425 days == 31_556_952 seconds. Spelling it out
    # avoids the ambiguous numpy 'Y' unit, which newer pandas versions reject.
    one_year = pd.Timedelta(seconds=31_556_952)
    # Column-wise division is vectorised; no per-cell applymap needed.
    return df.assign(**{col: df[col] / one_year for col in age_cols})

In [21]:
# Frame-level step: add both fighters' age columns, then convert them to years.
multi_pipe = [
    MultiColProcessor(
        name=['first_fighter_age_new', 'second_fighter_age_new'],
        funcs=[add_ages, ages_in_years],
    ),
]

## Transformer

### Applies a feature_engine-style transformer to one or more columns

In [22]:

def new_cols_func(x):
    """Select the derived columns that should be median-imputed.

    Parameters
    ----------
    x : iterable of str
        Candidate column names.

    Returns
    -------
    list of str
        Names containing ``'new'`` but not ``'weight'``, in their original order.
    """
    return [i for i in x if 'new' in i and 'weight' not in i]

# Imputation / encoding steps built from feature_engine transformers.
trans_pipe  = [
    # Median-impute every column selected by new_cols_func.
    Transformer(name_from_func=new_cols_func, 
                transformers=[MeanMedianImputer], imputation_method='median'),
    
    # Cast the weight column to str — presumably so the categorical transformers
    # below accept it; note that NaN becomes the literal string 'nan' after astype(str).
    MultiColProcessor(funcs=[lambda x:x.assign(weight_class_new=x.weight_class_new.astype(str))]),
    
    # Impute, then count/frequency-encode the weight column.
    Transformer(name=['weight_class_new'], 
                transformers=[CategoricalImputer,CountFrequencyEncoder]),
]

In [23]:
# Full pipeline: column-level cleaning, then derived ages, then imputation/encoding.
pipeline = [*single_pipe, *multi_pipe, *trans_pipe]

In [24]:
# Display the assembled steps in execution order.
pipeline

[ColProcessor(agg_height_first, agg_height_second),
 ColProcessor(agg_reach_first, agg_reach_second),
 ColProcessor(second_total_str, first_total_str),
 ColProcessor(agg_dob_first, agg_dob_second, date_card),
 ColProcessor(agg_weight_first),
 MultiColProcessor(first_fighter_age_new, second_fighter_age_new),
 Transformer(),
 MultiColProcessor(),
 Transformer()]

In [25]:
# Build the full pipeline over the complete dataset with test cases enabled.
pipe = Pipe(data=data, pipeline=pipeline, run_test_cases=True)

In [26]:
# Execute every step; the per-processor test-case report is printed as it runs.
res = pipe.run()

ColProcessor (agg_height_first, agg_height_second) test cases PASSED! 😎
ColProcessor (agg_reach_first, agg_reach_second) test cases PASSED! 😎
ColProcessor (second_total_str, first_total_str) test cases PASSED! 😎
ColProcessor (agg_dob_first, agg_dob_second, date_card) test cases NOT FOUND.
ColProcessor (agg_weight_first)             test cases NOT FOUND.
Multi test not implemented yet
transformer test not implemented yet
Multi test not implemented yet
transformer test not implemented yet


# After

In [28]:
# Weight column after the full pipeline (integer values from the encoding step).
res.weight_class_new

0        454
1        382
2         96
3        614
4         31
        ... 
5918     184
5919    1012
5920      96
5921     897
5922     382
Name: weight_class_new, Length: 5923, dtype: int64

# Before