In [119]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
from dukto.pipe import Pipe
from dukto.processor import ColProcessor, MultiColProcessor, Transformer
import pandas as pd
import numpy as np

In [121]:
data  = pd.read_csv('data/students.csv', index_col=0)

## ColProcessor
### applys function/s to a column/s  

In [122]:
def grade_prod_mapper(g):
    return {'Freshman':"9th",'Sophomore':"10th",'Junior':'11th','Senior':"12th"}[g]

# ColProcessor 
pipeline = [
    ColProcessor('chem_grade',funcs=lambda x:(int(x.split('/')[0])/60)*100, ),
    ColProcessor(['phy_grade', 'bio_grade'], funcs=lambda x:int(x)),
    ColProcessor('age', funcs=lambda x:int(x[:-1])/12 if 'm' in x else int(x)),
    ColProcessor('height',funcs=lambda x:float(x[:-2])*2.54),
    ColProcessor('grade',funcs=[grade_prod_mapper, lambda x:int(x[:-2])], suffix='_new')
]

## MultiColProcessor

## applies a function that takes and returns a dataframe
## this class is used to add columns based on other column/s

In [123]:
def avg_grade(df):
    df['avg_grade'] = df[['chem_grade','phy_grade','bio_grade']].mean(axis=1)
    return df

multi_pipe = [
    MultiColProcessor(funcs=[avg_grade], funcs_test={})
] 

## Transformer

### applies a feature_engine style transformer to a column/s

In [124]:
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.imputation import MeanMedianImputer
# add school name so you can encode it

transformers = [
    Transformer(name=['class'], transformers=[CountFrequencyEncoder]),
    Transformer(name=['class', 'age'], transformers=[MeanMedianImputer])
    
    
]

In [130]:
# add all pipelines
all_pipes = pipeline + multi_pipe + transformers

In [131]:
pipe = Pipe(data=data, pipeline=all_pipes)

In [133]:
pipe.run().head()

Unnamed: 0,name,chem_grade,phy_grade,bio_grade,age,height,grade,class,grade_new,avg_grade
0,Jalyiah Darcey,43.333333,77,17,17.0,396.24,Freshman,59,9,45.777778
1,Eunita Beahm,18.333333,56,67,15.333333,416.56,Freshman,140,9,47.111111
2,Guluzar Bernand,70.0,97,65,18.0,398.78,Freshman,59,9,77.333333
3,Jonatham Mcnicoll,95.0,68,92,17.0,378.46,Freshman,87,9,85.0
4,Greison Hisrich,93.333333,96,49,14.0,414.02,Freshman,140,9,79.444444


In [135]:
data.head()

Unnamed: 0,name,chem_grade,phy_grade,bio_grade,age,height,grade,class
0,Jalyiah Darcey,26/60,77,17,17,156cm,Freshman,A
1,Eunita Beahm,11/60,56,67,184m,164cm,Freshman,C
2,Guluzar Bernand,42/60,97,65,18,157cm,Freshman,A
3,Jonatham Mcnicoll,57/60,68,92,17,149cm,Freshman,B
4,Greison Hisrich,56/60,96,49,14,163cm,Freshman,C
