In [327]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [328]:
from dukto.pipe import Pipe
from dukto.processor import ColProcessor, MultiColProcessor, Transformer
import pandas as pd
import numpy as np

In [329]:
data  = pd.read_csv('data/ufc.csv', index_col=0)

## ColProcessor
### applies function/s to a column/s  

In [330]:
def convert_foot_to_cm(r):
    if isinstance(r, str) and "'" in r:
        foot, inches = r.split("'")
        inches = int(foot)*12 + int(inches.replace('"', ''))
        return inches*2.54
    return np.nan

def convert_inch_to_cm(r):
    if isinstance(r,str) and '"' in r:
        return int(r.replace('"', '')) * 2.54
    return np.nan

def num_of_num_to_perc(r):
    if isinstance(r,str) and 'of' in r:
        thr, landed = map(int, r.split('of'))
        if landed > 0:
            return thr / landed 
    return np.nan

In [336]:
single_pipe = [
    ColProcessor(name=['agg_height_first','agg_height_second'], funcs=[convert_foot_to_cm], funcs_test={"6'2\"":187.96}, suffix='_new'),
    ColProcessor(name=['agg_reach_first','agg_reach_second'], funcs=[convert_inch_to_cm], funcs_test={'70"': 177.80}, suffix='_new'),
    ColProcessor(name=['second_total_str', 'first_total_str'], funcs=[num_of_num_to_perc], suffix='_%%_new', funcs_test={'50 of 100':0.5}),
    ColProcessor(name=['agg_dob_first', 'agg_dob_second', 'date_card'], funcs=[pd.to_datetime])
]

In [341]:
def add_ages(df):
    df['first_fighter_age_new'] = df['date_card'] - df['agg_dob_first']
    df['second_fighter_age_new'] = df['date_card'] - df['agg_dob_second']
    return df

def age_in_years(df):
    df['first_fighter_age_new'] = df['first_fighter_age_new'].dt.days/365
    df['second_fighter_age_new'] = df['first_fighter_age_new'].dt.days/365
    return df

In [338]:
multi_pipe = [MultiColProcessor(funcs=[add_ages, age_in_years], funcs_test={})]

In [339]:
pipe = Pipe(data=data, pipeline=single_pipe+multi_pipe, run_test_cases=True)

In [340]:
res = pipe.run()

ColProcessor (agg_height_first, agg_height_second) test cases PASSED! 😎
ColProcessor (agg_reach_first, agg_reach_second) test cases PASSED! 😎
ColProcessor (second_total_str, first_total_str) test cases PASSED! 😎
ColProcessor (agg_dob_first, agg_dob_second, date_card) test cases NOT FOUND.
  agg_dob_first agg_dob_second agg_height_first agg_height_second  \
0    1987-07-19     1989-12-26            6' 4"             6' 4"   
1    1988-03-07     1988-12-28            5' 5"             5' 9"   

  agg_reach_first agg_reach_second       agg_recodr_first agg_recodr_second  \
0             84"              77"  Record: 26-1-0 (1 NC)    Record: 12-3-0   
1             66"              68"         Record: 21-3-0    Record: 16-4-0   

  agg_stand_first agg_stand_second  ...  type       weight_class  \
0        Orthodox         Southpaw  ...  belt  Light Heavyweight   
1        Southpaw         Orthodox  ...  belt  Women's Flyweight   

  agg_height_first_new agg_height_second_new agg_reach_firs

AttributeError: Can only use .dt accessor with datetimelike values

In [317]:
res['first_fighter_age_new'].dt.days/365

0       32.580822
1       31.945205
2       28.082192
3       28.997260
4       35.024658
          ...    
5918    27.876712
5919    27.241096
5920    31.997260
5921    32.389041
5922    23.715068
Name: first_fighter_age_new, Length: 5923, dtype: float64

In [70]:

# ColProcessor 
pipeline = [
    ColProcessor('chem_grade',
                 funcs=[lambda x:(int(x.split('/')[0])/60)*100, int], 
                 funcs_test={'26/60':43},
                drop=True),
    ColProcessor(['phy_grade', 'bio_grade'], 
                 funcs=lambda x:int(x)),
    ColProcessor('age', 
                 funcs=lambda x:int(x[:-1])/12 if 'm' in x else int(x)),
    ColProcessor('height',
                 funcs_test={'156cm': 156, ''},
                 funcs=lambda x:float(x[:-2])*2.54),
    ColProcessor('grade',
                 funcs=[grade_prod_mapper, lambda x:int(x[:-2])], 
                 suffix='_new')
]

SyntaxError: invalid syntax (<ipython-input-70-3697cfdfb3d0>, line 11)

In [47]:
data

Unnamed: 0,name,chem_grade,phy_grade,bio_grade,age,height,grade,class
0,Jalyiah Darcey,26/60,77.0,17,17,156cm,Freshman,
1,Eunita Beahm,11/60,56.0,67,184m,164cm,Freshman,C
2,Guluzar Bernand,42/60,97.0,65,18,157cm,Freshman,A
3,Jonatham Mcnicoll,57/60,68.0,92,17,149cm,Freshman,B
4,Greison Hisrich,56/60,96.0,49,14,163cm,Freshman,C
...,...,...,...,...,...,...,...,...
281,Bremen Hewatt,44/60,67.0,46,14,134cm,Freshman,B
282,Venerino Billey,16/60,63.0,60,17,130cm,Freshman,C
283,Aylmar Berken,33/60,47.0,96,18,155cm,Freshman,C
284,Lhiam Roysum,35/60,31.0,76,14,167cm,Freshman,C


## MultiColProcessor

## applies a function that takes and returns a dataframe
## this class is used to add columns based on other column/s

In [16]:
def avg_grade(df):
    df['avg_grade'] = df[['chem_grade','phy_grade','bio_grade']].mean(axis=1)
    return df

multi_pipe = [
    MultiColProcessor(funcs=[avg_grade], funcs_test={})
] 

## Transformer

### applies a feature_engine style transformer to a column/s

In [40]:
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
# add school name so you can encode it

transformers = [
    Transformer(name=['phy_grade'], transformers=[MeanMedianImputer]),
    Transformer(name=['class'], transformers=[CategoricalImputer,CountFrequencyEncoder]),
#     Transformer(name=['class'], transformers=[CountFrequencyEncoder]),
]

In [41]:
# add all pipelines
all_pipes = transformers + pipeline + multi_pipe 

In [42]:
all_pipes

[Transformer(phy_grade),
 Transformer(class),
 ColProcessor(chem_grade),
 ColProcessor(phy_grade, bio_grade),
 ColProcessor(age),
 ColProcessor(height),
 ColProcessor(grade),
 MultiColProcessor()]

In [43]:
pipe = Pipe(data=data, pipeline=all_pipes, run_test_cases=True)

## After

In [44]:
pipe.run()

ColProcessor (chem_grade)                   test cases PASSED! 😎
ColProcessor (phy_grade, bio_grade)         test cases NOT FOUND.
ColProcessor (age)                          test cases NOT FOUND.
ColProcessor (height)                       test cases NOT FOUND.
ColProcessor (grade)                        test cases NOT FOUND.


Unnamed: 0,name,chem_grade,phy_grade,bio_grade,age,height,grade,class,grade_new,avg_grade
0,Jalyiah Darcey,43,77,17,17.000000,396.24,Freshman,17,9,45.666667
1,Eunita Beahm,18,56,67,15.333333,416.56,Freshman,131,9,47.000000
2,Guluzar Bernand,70,97,65,18.000000,398.78,Freshman,53,9,77.333333
3,Jonatham Mcnicoll,95,68,92,17.000000,378.46,Freshman,85,9,85.000000
4,Greison Hisrich,93,96,49,14.000000,414.02,Freshman,131,9,79.333333
...,...,...,...,...,...,...,...,...,...,...
281,Bremen Hewatt,73,67,46,14.000000,340.36,Freshman,85,9,62.000000
282,Venerino Billey,26,63,60,17.000000,330.20,Freshman,131,9,49.666667
283,Aylmar Berken,55,47,96,18.000000,393.70,Freshman,131,9,66.000000
284,Lhiam Roysum,58,31,76,14.000000,424.18,Freshman,131,9,55.000000


## Before

In [22]:
data.head()

Unnamed: 0,name,chem_grade,phy_grade,bio_grade,age,height,grade,class
0,Jalyiah Darcey,26/60,77.0,17,17,156cm,Freshman,
1,Eunita Beahm,11/60,56.0,67,184m,164cm,Freshman,C
2,Guluzar Bernand,42/60,97.0,65,18,157cm,Freshman,A
3,Jonatham Mcnicoll,57/60,68.0,92,17,149cm,Freshman,B
4,Greison Hisrich,56/60,96.0,49,14,163cm,Freshman,C


In [23]:
a = None

In [24]:
a == None

True

False