## Adding categories to dataframe

In [1]:
import numpy as np
import pandas as pd

from collections import OrderedDict
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_object_dtype, is_categorical_dtype

from typing import Optional, List, Callable
from dataworks.df_utils import add_datefields

from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [2]:
DATAPATH = '../../dataworks/tests/testdata/'

In [3]:
df = pd.read_csv(f'{DATAPATH}testdata.csv', parse_dates=['date'], index_col=0)

In [4]:
df.head()

Unnamed: 0,date,level,setting,health,xp
0,2002-05-23,Critical,stun,12.0,755.0
1,2016-03-05,Critical,,,620.0
2,2004-01-11,Critical,proximity blast,61.0,470.0
3,2011-01-02,Medium,proximity blast,30.0,133.0
4,2008-08-08,Critical,expanding energy pulse,42.0,564.0


### Fill health with median

In [5]:
def replace_numeric_nulls(df: pd.DataFrame, columns: Optional[List[str]]=None, function: Callable=np.median, inplace:bool=False) -> pd.DataFrame:
    """ Replace nulls in numerical columns with the median (default) or with the
        result of another callable that works on NumPy arrays.

        Parameters
        ----------
        df : frame to process.
        columns : names of the columns to fill. When None, every column for which
            ``is_numeric_dtype`` is true is used.
        function : callable that receives the non-null values of one column as a
            NumPy array and returns the fill value for that column.
        inplace : when True ``df`` itself is modified and returned; otherwise a
            deep copy is filled and ``df`` is left untouched.

        Returns
        -------
        The filled frame (``df`` itself when ``inplace`` is True).

        Columns that contain no nulls, and columns that contain nothing but nulls,
        are left as they are: the former need no fill value and the latter offer
        no values to compute one from.
    """
    if columns is None:
        columns = [colname for colname, column in df.items() if is_numeric_dtype(column)]

    resdf = df if inplace else df.copy(deep=True)

    fillers = {}
    for column in columns:
        # Select the single column first; filtering the whole frame per column
        # would copy every other column just to throw it away.
        present = resdf[column].dropna()

        # Nothing to fill, or nothing to compute from. Reducing an empty array
        # warns for np.median and raises for reducers such as np.max / np.min.
        if present.empty or len(present) == len(resdf):
            continue

        fillers[column] = function(present.values)

    if fillers:
        resdf.fillna(value=fillers, inplace=True)
    return resdf

In [6]:
df_proc = replace_numeric_nulls(df)

In [7]:
df_proc.head()

Unnamed: 0,date,level,setting,health,xp
0,2002-05-23,Critical,stun,12.0,755.0
1,2016-03-05,Critical,,49.0,620.0
2,2004-01-11,Critical,proximity blast,61.0,470.0
3,2011-01-02,Medium,proximity blast,30.0,133.0
4,2008-08-08,Critical,expanding energy pulse,42.0,564.0


### Replace NaNs in string (object) columns with empty strings

In [8]:
def object_nan_to_empty(df: pd.DataFrame, inplace: bool=False) -> pd.DataFrame:
    """ Replace nulls in every object-dtype column with the empty string.

        Columns of any other dtype are not touched. When ``inplace`` is True the
        frame passed in is modified and returned; otherwise a deep copy is filled
        and returned, leaving ``df`` as it was.
    """
    target = df if inplace else df.copy(deep=True)

    # One "" filler per object column; fillna only touches the listed columns.
    empty_fill = {}
    for name, series in df.items():
        if is_object_dtype(series):
            empty_fill[name] = ""

    target.fillna(value=empty_fill, inplace=True)
    return target

In [9]:
df_proc = object_nan_to_empty(df_proc)

In [10]:
df_proc.head()

Unnamed: 0,date,level,setting,health,xp
0,2002-05-23,Critical,stun,12.0,755.0
1,2016-03-05,Critical,,49.0,620.0
2,2004-01-11,Critical,proximity blast,61.0,470.0
3,2011-01-02,Medium,proximity blast,30.0,133.0
4,2008-08-08,Critical,expanding energy pulse,42.0,564.0


In [11]:
#[colname for colname, column in df_proc.items() if is_object_dtype(column)]

In [12]:
#df_proc['setting'].isnull()

In [13]:
# https://github.com/fastai/fastai/blob/c655762c3dc835ea61ad9143d84f1c3b47fe60f4/old/fastai/structured.py#L128
# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
# https://chrisalbon.com/machine_learning/preprocessing_structured_data/convert_pandas_categorical_column_into_integers_for_scikit-learn/
#df['level'].astype('category').cat

### Categorical columns

In [14]:
def categorical_columns(df: pd.DataFrame, columns: Optional[List[str]]=None, inplace:bool=False) -> pd.DataFrame:
    """ Convert columns to the pandas 'category' dtype.

        ``columns`` names the columns to convert; when it is None every
        object-dtype column of ``df`` is converted. With ``inplace`` set the
        frame passed in is modified and returned, otherwise the conversion is
        applied to a deep copy and ``df`` keeps its original dtypes.
    """
    selected = columns
    if selected is None:
        selected = [name for name, series in df.items() if is_object_dtype(series)]

    target = df if inplace else df.copy(deep=True)

    # Build all converted columns first, then write them back one by one.
    converted = {name: df[name].astype('category') for name in selected}
    for name, series in converted.items():
        target[name] = series

    return target

In [15]:
df_cat = categorical_columns(df_proc)

In [16]:
df_cat.level.cat.codes

0     1
1     1
2     1
3     4
4     1
     ..
95    2
96    0
97    1
98    2
99    4
Length: 100, dtype: int8

In [17]:
def apply_categories(df: pd.DataFrame, columns: Optional[List[str]]=None, inplace: bool=False, drop: bool=False) -> pd.DataFrame:
    """ Add an integer-coded ``<column>_cat`` column for each categorical column.

        Parameters
        ----------
        df : frame to process.
        columns : names of the columns to encode. When None, every column whose
            dtype is a ``pd.CategoricalDtype`` is encoded. Explicitly listed
            columns that are not categorical yet are converted on the fly.
        inplace : when True ``df`` itself is modified and returned; otherwise a
            deep copy is modified and ``df`` is left untouched.
        drop : when True the source columns are removed after encoding, leaving
            only the ``*_cat`` columns.

        Returns
        -------
        The frame with the added code columns. Codes are the pandas category
        codes (``Series.cat.codes``), which use -1 for missing values.
    """
    if columns is None:
        # isinstance on the dtype is the supported replacement for the
        # deprecated pandas.api.types.is_categorical_dtype helper.
        columns = [colname for colname, column in df.items() if isinstance(column.dtype, pd.CategoricalDtype)]

    resdf = df if inplace else df.copy(deep=True)

    for column in columns:
        # astype('category') leaves categorical columns as they are and lets
        # explicitly requested non-categorical columns be encoded instead of
        # failing on the `.cat` accessor.
        resdf[f'{column}_cat'] = resdf[column].astype('category').cat.codes

    if drop:
        resdf.drop(columns=columns, inplace=True)

    return resdf

In [18]:
df_codes = apply_categories(df_cat, drop=True)

In [19]:
df_codes.head()

Unnamed: 0,date,health,xp,level_cat,setting_cat
0,2002-05-23,12.0,755.0,1,7
1,2016-03-05,49.0,620.0,1,0
2,2004-01-11,61.0,470.0,1,6
3,2011-01-02,30.0,133.0,4,6
4,2008-08-08,42.0,564.0,1,2


### Convert dates

In [20]:
# add_datefields comes from dataworks.df_utils. Judging by the output below it
# expands 'date' into date_* feature columns and, with drop_original=True,
# removes the 'date' column itself -- TODO confirm against the helper's docs.
# The membership check presumably keeps this cell re-runnable once 'date' is gone.
if 'date' in df_codes.columns:
    df_codes = add_datefields(df_codes, 'date', drop_original=True)

In [21]:
df_codes.head()

Unnamed: 0,health,xp,level_cat,setting_cat,date_dayofweek,...,date_is_month_start,date_is_quarter_end,date_is_quarter_start,date_quarter,date_week
0,12.0,755.0,1,7,3,...,False,False,False,2,21
1,49.0,620.0,1,0,5,...,False,False,False,1,9
2,61.0,470.0,1,6,6,...,False,False,False,1,2
3,30.0,133.0,4,6,6,...,False,False,False,1,52
4,42.0,564.0,1,2,4,...,False,False,False,3,32


### Fit model to check if it works

In [22]:
# Placeholder target: one increasing integer per row of df_codes. It is not a real
# label; it only gives the regressor something to fit against.
y = np.arange(len(df_codes))

In [23]:
# Deliberately tiny forest (2 trees, depth 3): the fit below is only a check that
# the processed frame is accepted. No random_state is passed, so repeated fits
# are not guaranteed to produce the same trees.
model = RandomForestRegressor(n_estimators=2, n_jobs=1, max_depth=3)

In [24]:
%time model.fit(df_codes, y)

Wall time: 6.7 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

### Test entire pipeline

In [25]:
df_org = pd.read_csv(f'{DATAPATH}testdata.csv', parse_dates=['date'], index_col=0)

In [26]:
# Full preprocessing chain on a freshly loaded frame; the order of steps matters.
# 1. fill nulls in numeric columns (median by default)
df_test = replace_numeric_nulls(df_org)
# 2. turn nulls in object columns into "" so they form their own category
df_test = object_nan_to_empty(df_test)
# 3. object columns -> 'category' dtype
df_test = categorical_columns(df_test)
# 4. add <column>_cat integer codes and drop the categorical source columns
df_test = apply_categories(df_test, drop=True)
# 5. expand 'date' via the dataworks helper and drop the original column
df_test = add_datefields(df_test, 'date', drop_original=True)
df_test.head()

Unnamed: 0,health,xp,level_cat,setting_cat,date_dayofweek,...,date_is_month_start,date_is_quarter_end,date_is_quarter_start,date_quarter,date_week
0,12.0,755.0,1,7,3,...,False,False,False,2,21
1,49.0,620.0,1,0,5,...,False,False,False,1,9
2,61.0,470.0,1,6,6,...,False,False,False,1,2
3,30.0,133.0,4,6,6,...,False,False,False,1,52
4,42.0,564.0,1,2,4,...,False,False,False,3,32


In [27]:
%time model.fit(df_test, y)

Wall time: 11.1 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)