## Adding categories to dataframe

In [1]:
import numpy as np
import pandas as pd

from collections import OrderedDict
from pandas.api.types import is_numeric_dtype, is_string_dtype

from typing import Optional, List

from sklearn.ensemble import RandomForestRegressor

# Keep rendered DataFrames compact: show at most 10 rows and 10 columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [2]:
# Relative directory that holds the test CSV. The trailing slash matters: the
# file name is appended to this value by plain string formatting when loading.
DATAPATH = '../../dataworks/tests/testdata/'

In [3]:
# Load the test data: the first CSV column becomes the index and 'date' is
# parsed into datetimes.
df = pd.read_csv(f'{DATAPATH}testdata.csv', parse_dates=['date'], index_col=0)

In [4]:
df.head()

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.0
1,2007-04-24,High,proximity blast,27.0
2,2001-03-21,Critical,expanding energy pulse,55.0
3,2019-08-03,,field burst,
4,2013-11-13,High,proximity blast,27.0


### Fill health with mean

In [5]:
# Series.mean() skips NaN by default, so no manual not-null mask is needed.
mean_health = np.round(df['health'].mean(), 2)
print(mean_health)

50.81


In [6]:
# Impute missing health values with the rounded mean. The result is rebound
# rather than written in place, so the frame is never mutated behind the
# reader's back and the call stays chainable.
df = df.fillna(value={'health': mean_health})
df.head()

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.0
1,2007-04-24,High,proximity blast,27.0
2,2001-03-21,Critical,expanding energy pulse,55.0
3,2019-08-03,,field burst,50.81
4,2013-11-13,High,proximity blast,27.0


### Replace NaNs in string columns with empty strings

In [7]:
def str_nan_to_empty(df: pd.DataFrame, inplace: bool=False) -> pd.DataFrame:
    """Replace missing values in string columns with the empty string.

    Args:
        df: Frame to clean.
        inplace: If True, modify ``df`` itself; otherwise work on a deep copy
            and leave ``df`` untouched.

    Returns:
        The cleaned frame: ``df`` itself when ``inplace`` is True, otherwise
        the modified copy. Columns that do not hold strings are not changed.
    """
    def _holds_strings(column: pd.Series) -> bool:
        # Newer pandas (2.0+) only reports an object column as string when
        # every element is a str, so a column mixing strings with NaN --
        # exactly what this function targets -- is rejected by
        # is_string_dtype(). infer_dtype(skipna=True) ignores the missing
        # entries when classifying the column.
        return bool(is_string_dtype(column)) or (
            pd.api.types.infer_dtype(column, skipna=True) == "string"
        )

    fillers = {colname: "" for colname, column in df.items() if _holds_strings(column)}

    resdf = df if inplace else df.copy(deep=True)

    # Nothing to fill when the frame has no string columns.
    if fillers:
        resdf.fillna(value=fillers, inplace=True)
    return resdf

In [8]:
df = str_nan_to_empty(df)
df.head()

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.0
1,2007-04-24,High,proximity blast,27.0
2,2001-03-21,Critical,expanding energy pulse,55.0
3,2019-08-03,,field burst,50.81
4,2013-11-13,High,proximity blast,27.0


In [9]:
# Background reading on pandas categoricals and feeding them to scikit-learn:
# https://github.com/fastai/fastai/blob/c655762c3dc835ea61ad9143d84f1c3b47fe60f4/old/fastai/structured.py#L128
# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
# https://chrisalbon.com/machine_learning/preprocessing_structured_data/convert_pandas_categorical_column_into_integers_for_scikit-learn/
# Peek at the `.cat` accessor obtained by casting a string column to category dtype.
df['level'].astype('category').cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x0000015A0A2B02B0>

In [10]:
# Report, column by column, whether pandas classifies it as a string dtype.
for name, series in df.items():
    looks_like_string = is_string_dtype(series)
    print('{}: is_string_dtype={}'.format(name, looks_like_string))

date: is_string_dtype=False
level: is_string_dtype=True
setting: is_string_dtype=True
health: is_string_dtype=False


In [11]:
[colname for colname, column in df.items() if is_string_dtype(column)]

['level', 'setting']

In [12]:
def columns_as_categories(df: pd.DataFrame, columns: Optional[List[str]]=None, inplace:bool=False,
                          as_codes: bool=False) -> pd.DataFrame:
    """Convert columns to pandas ``category`` dtype, optionally as integer codes.

    Args:
        df: Frame whose columns should be converted.
        columns: Names of the columns to convert. When None, every column
            that holds strings is converted.
        inplace: If True, modify ``df`` itself; otherwise work on a deep copy
            and leave ``df`` untouched.
        as_codes: If False (default), the converted columns keep their labels
            and get ``category`` dtype. If True, they are replaced by the
            integer category codes, which scikit-learn estimators accept;
            missing values are encoded as -1.

    Returns:
        The converted frame: ``df`` itself when ``inplace`` is True, otherwise
        the modified copy.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    if columns is None:
        # Newer pandas (2.0+) only reports an object column as string when
        # every element is a str, so string columns containing NaN would be
        # skipped by is_string_dtype() alone. infer_dtype(skipna=True)
        # ignores the missing entries when classifying the column.
        columns = [
            colname for colname, column in df.items()
            if is_string_dtype(column)
            or pd.api.types.infer_dtype(column, skipna=True) == "string"
        ]

    resdf = df if inplace else df.copy(deep=True)

    for column in columns:
        categorical = resdf[column].astype('category')
        # Category dtype keeps the labels (not accepted by scikit-learn);
        # the codes are plain integers.
        resdf[column] = categorical.cat.codes if as_codes else categorical

    return resdf

In [13]:
df = columns_as_categories(df)

In [14]:
df.head()

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.0
1,2007-04-24,High,proximity blast,27.0
2,2001-03-21,Critical,expanding energy pulse,55.0
3,2019-08-03,,field burst,50.81
4,2013-11-13,High,proximity blast,27.0


In [15]:
#df['level'].cat.codes

In [16]:
df[0:50]

Unnamed: 0,date,level,setting,health
0,2019-10-28,Medium,,53.00
1,2007-04-24,High,proximity blast,27.00
2,2001-03-21,Critical,expanding energy pulse,55.00
3,2019-08-03,,field burst,50.81
4,2013-11-13,High,proximity blast,27.00
...,...,...,...,...
45,2009-09-29,Low,,13.00
46,2012-01-20,Low,proximity blast,20.00
47,2003-10-26,Medium,stun,95.00
48,2007-04-20,High,disintegrate,84.00


In [17]:
# Features: the two categorical columns. Target: the health values.
X = df[['level', 'setting']]
y = df['health'].values

# Only the last expression of a cell is rendered, so the preview goes last.
X.head()

In [18]:
# Fix the seed so the fitted forest is reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42)

In [19]:
# scikit-learn estimators need numeric features: fitting on the label-valued
# columns raised "ValueError: could not convert string to float: 'Medium'".
# Encode each column as its integer category codes for the model input only,
# leaving X itself unchanged.
model.fit(X.apply(lambda col: col.astype('category').cat.codes), y)

In [20]:
X['setting'].cat.codes

0     0
1     6
2     2
3     3
4     6
     ..
95    6
96    6
97    7
98    7
99    5
Length: 100, dtype: int8