# Categorise data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Set pandas defaults
# Show max 10 rows: head(5) ... tail(5)
# The fully qualified option name is required: the bare 'max_rows' pattern is
# ambiguous (and raises OptionError) once pandas defines more than one
# '*max_rows' option.
pd.set_option('display.max_rows', 10)

## Import data

In [None]:
# Load the workbook into the working DataFrame.
df = pd.read_excel('data/categorical_test.xlsx')
# NOTE(review): the writer targets the same path that was just read, so saving
# through it overwrites the input workbook - confirm this is intended.
writer = pd.ExcelWriter('data/categorical_test.xlsx')

In [None]:
# Print the per-column dtype / non-null summary of the loaded frame.
df.info()

In [None]:
# Display the frame itself.
df

## Save data to Excel sheets

In [None]:
# Write the frame to a named sheet and flush the workbook to disk.
# `ExcelWriter.save()` was removed in pandas 2.0; `close()` saves the file and
# releases the handle. `sheet_name` is passed by keyword because positional
# use is deprecated.
df.to_excel(writer, sheet_name='sheet_name')
writer.close()

## Clean Data

### Clean missing Values

#### Get features with missing values

In [None]:
# Names of the columns that contain at least one null entry.
df.columns[df.isnull().any()]

#### Number of missing values

In [None]:
feature = 'loxen_b'
# Number of null entries in the feature. `.isnull().sum()` yields the count;
# `.isnull().any().sum()` would collapse it to a 0/1 flag.
df.loc[:, feature].isnull().sum()

#### Range of other missing values

In [None]:
# Placeholder strings that also denote a missing value.
list_missing = ['ND', 'nan', 'NaN', 'NA', '', '?']
# Number of entries holding one of the placeholders (a count, not a 0/1 flag).
df.loc[:, feature].isin(list_missing).sum()

In [None]:
feature = 'height'
# Placeholder strings that also denote a missing value.
list_missing = ['ND', 'nan', 'NaN', 'NA', '', '?']
# Number of entries holding one of the placeholders (a count, not a 0/1 flag).
df.loc[:, feature].isin(list_missing).sum()

#### Get sample(s) of missing values
- decide whether to drop the sample or impute a value

In [None]:
# Rows whose current feature holds one of the placeholder strings.
df.loc[df[feature].isin(list_missing)]

#### Impute manually

In [None]:
# Heights of the samples labelled 'F'; non-numeric entries are coerced to NaN
# and therefore ignored by the mean.
female_heights = df.loc[df['sex'] == 'F', 'height']
mean_height_female = pd.to_numeric(female_heights, errors='coerce').mean().round(0)
mean_height_female

In [None]:
# Overwrite the height of the sample at row position 159. The column is located
# by name so the write cannot silently hit another feature when the column
# order changes.
df.iloc[159, df.columns.get_loc('height')] = mean_height_female

#### Sanity check

In [None]:
# Show the height now stored for the sample at row position 159.
df.iloc[159].height

#### Impute by function (irrespective of factors)

In [None]:
def impute(df, feature, by):
    """Return a copy of ``df[feature]`` with missing entries filled in.

    An entry counts as missing when it is null (NaN/None) or equals one of the
    placeholder strings 'ND', 'nan', 'NaN', 'NA', '' or '?'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature; it is not modified.
    feature : column label
        Column to impute.
    by : 'mean', 'mode', a column label of ``df``, or any other value
        * 'mean' - fill with the mean of the observed values; the result is
          numeric (observed values are converted with ``pd.to_numeric``).
        * 'mode' - fill with the most frequent observed value (the first one
          when several values tie).
        * a column of ``df`` - fill each gap with that column's value in the
          same row.
        * anything else - used as a constant fill value.

    Returns
    -------
    pandas.Series
        Imputed values, indexed like ``df``.

    Raises
    ------
    ValueError
        With ``by='mean'`` when an observed value cannot be parsed as a number.
    """
    missing_tokens = ['ND', 'nan', 'NaN', 'NA', '', '?']
    column = df.loc[:, feature]
    # Build the mask directly instead of overwriting gaps with a numeric
    # sentinel, which would also swallow genuine values equal to the sentinel.
    missing = column.isnull() | column.isin(missing_tokens)

    if by == 'mean':
        numeric = pd.to_numeric(column.mask(missing))
        return numeric.fillna(numeric.mean())

    if by == 'mode':
        modes = column[~missing].mode()
        if modes.empty:
            # Nothing observed, so there is nothing to impute from.
            return column.mask(missing)
        # mode() returns a Series; take the scalar so the fill is broadcast
        # rather than index-aligned (which would leave NaN behind).
        fill = modes.iloc[0]
    elif by in df.columns:
        fill = df.loc[:, by]
    else:
        fill = by

    # `where` keeps observed values and upcasts if the fill has another dtype.
    return column.where(~missing, fill)

In [None]:
# Replace the current feature with its mean-imputed version.
df[feature] = impute(df, feature, 'mean')

#### Sanity check

In [None]:
# The feature must contain neither placeholder strings nor nulls any more.
has_placeholder = df[feature].isin(list_missing).any()
has_null = df[feature].isnull().any()
assert not has_placeholder and not has_null

In [None]:
feature = 'sex'
# NOTE(review): presumably run as a notebook cell, where only the last
# expression is displayed - this value_counts() result would then be discarded.
df[feature].value_counts()
# Pair of 0/1 flags: (any placeholder string present, any null present).
df[feature].isin(['NA', 'ND', '']).any().sum(), df[feature].isnull().any().sum()

#### Set masks of missing values to amend

In [None]:
# Boolean mask of the rows whose current feature is null.
missing = df[feature].isnull()

#### Stratified masking

In [None]:
# Male vs. female height & weight distributions
male_w = df['weight'] > 68
male_h = df['height'] > 168
male = male_w & male_h
missing_male = male & missing
missing_female = ~male & missing

#### Impute missing values

In [None]:
# Fill the missing labels according to the two stratified masks.
df.loc[missing_male, feature] = 'M'
df.loc[missing_female, feature] = 'F'

#### Sanity check

In [None]:
# No placeholder strings and no nulls may remain in the feature.
assert df[feature].isin(list_missing).any().sum() == 0 and df[feature].isnull().any().sum() == 0
# Every row must carry a label. Compare against the frame length rather than a
# hard-coded sample count, so the check stays valid for other data sets.
assert df[feature].value_counts().sum() == len(df)
df[feature].value_counts().sum()

### Clean garbage data

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# Collect features that do not contribute to inference and/or prediction
garbage_features = []

# Unused features
garbage_features.extend([])        # use for list of features
garbage_features.append('donor')   # use for a single feature at a time

# Drop all 'garbage' features
df_unused = df.loc[:, garbage_features]     # save unused features for backup
df.drop(garbage_features, axis=1, inplace=True)

In [None]:
# Summary statistics again, after the unused features were dropped.
df.describe().T

### Clean textual data

#### Get object types

In [None]:
# Show only the columns whose dtype is 'object'.
df.loc[:, df.dtypes == 'object']

In [None]:
feature = 'scanner_b'
# Frequency of each raw value before cleaning.
df[feature].value_counts()

In [None]:
def clean_seq(df, feature, seperator='_'):
    """Normalise multi-value text cells of ``df[feature]``.

    Each cell is converted with ``str()``, stripped, split on runs of
    ``;`` ``.`` ``,`` ``+`` or whitespace, de-duplicated, sorted and joined
    again with ``seperator``. Empty tokens (e.g. from ``'a ; b'`` or a trailing
    separator) are discarded so the result never starts with a stray
    separator. Because of ``str()``, a NaN cell becomes the text ``'nan'``.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    import re

    splitter = re.compile(r'[;.,+\s]+')
    cleaned = []
    for raw in df.loc[:, feature]:
        tokens = {tok for tok in splitter.split(str(raw).strip()) if tok}
        cleaned.append(seperator.join(sorted(tokens)))
    return cleaned


def clean_feature(df, feature, seperator='_'):
    """Return the cleaned feature as a Series indexed like ``df``.

    Reusing ``df.index`` keeps ``df[feature] = clean_feature(...)`` row-aligned
    even when the frame does not carry a default 0..n-1 index.
    """
    return pd.Series(clean_seq(df, feature, seperator), index=df.index)

In [None]:
# Replace the feature with its cleaned form, joining the tokens with ','.
df[feature] = clean_feature(df, feature, ',')

In [None]:
# Frequency of each value after cleaning.
df[feature].value_counts()

### Clean numerical data

In [None]:
feature = 'nad_0'
# Summary statistics of the raw feature.
df[feature].describe()

#### Set type to float

In [None]:
# Keep only cells that are genuine numbers; everything else becomes NaN.
# An exact type check is used on purpose so that bool is not treated as numeric.
numeric_types = (int, np.int64, float, np.float64)
is_numeric = df[feature].apply(lambda x: type(x) in numeric_types)
# Assign through df[feature]: `df.loc[:, feature] = ...` writes into the
# existing column in place (pandas >= 2.0) and would keep its old dtype,
# so the float cast would be lost.
df[feature] = df[feature].where(is_numeric).astype('f')

#### Set all 'object' features which contain more than 2 values to float

In [None]:
def convert_to_floats(df):
    """Cast object columns with more than two distinct values to float32, in place.

    Columns whose values cannot be interpreted as numbers are left untouched.
    The frame is modified directly; nothing is returned.
    """
    object_columns = df.columns[df.dtypes == 'object']
    for feature in object_columns:
        # value_counts() ignores nulls, so this is the number of distinct
        # non-null values.
        if len(df.loc[:, feature].value_counts()) <= 2:
            continue
        try:
            converted = df.loc[:, feature].astype(np.float32)
        except (ValueError, TypeError):
            # Not a numeric column - deliberately skipped.
            continue
        # Replace the column instead of writing through `df.loc[:, feature]`,
        # which sets values in place and would keep the object dtype.
        df[feature] = converted

### Convert to binary on arbitrary cut

In [None]:
# Threshold above which a value is encoded as 1.
cut = 0.5
# Fill gaps with the mean and round to two decimals before cutting.
df[feature] = impute(df, feature, 'mean').round(2)
# Assign through df[feature] so the column really becomes uint8;
# `df.loc[:, feature] = ...` sets in place and would keep the previous dtype.
df[feature] = np.where(df[feature] > cut, 1, 0).astype(np.uint8)

#### Sanity check

In [None]:
df[feature].describe()
# Use `not`, not `~`: bitwise inversion of a plain bool gives -1/-2, which are
# both truthy, so the assertion could never fail in that case.
assert not df.loc[:, feature].isnull().any(), 'Feature contains missing values'

#### Impute mean value

In [None]:
# Mean-impute and store as float32. Assigning through df[feature] replaces the
# column; `df.loc[:, feature] = ...` would write in place and keep the old dtype.
df[feature] = impute(df, feature, 'mean').astype('f')

In [None]:
# Use `not`, not `~`: bitwise inversion of a plain bool is always truthy.
assert not df.loc[:, feature].isnull().any(), 'Feature contains missing values'

### Convert to top-n categories
- move typos/missing values to separate dummy var

In [None]:
def top_cat(df_, feature, top=10):
    """Relabel a feature by its ``top`` most frequent values using letters.

    The most frequent value becomes 'A', the next 'B', and so on. Every other
    value, including nulls, is moved to a single "other" category labelled
    with the letter that directly follows the last top label (e.g. 'F' for
    ``top=5``). The order among equally frequent values is not guaranteed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding the feature; it is not modified.
    feature : column label
        Column to relabel.
    top : int, default 10
        Number of most frequent values that keep a label of their own (0-25).

    Returns
    -------
    pandas.Series
        Categorical series of letters, indexed like ``df_``.

    Raises
    ------
    ValueError
        If ``top`` leaves no letter for the "other" category.
    """
    alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    if not 0 <= top < len(alphabet):
        raise ValueError(
            'top must be between 0 and {} so that a letter remains for the '
            '"other" category, got {}'.format(len(alphabet) - 1, top))
    labels = alphabet[:top]
    # Next free letter; indexing with top + 1 would skip one letter and run
    # past the alphabet for top == 25.
    other = alphabet[top]
    # value_counts() is sorted by frequency (descending) and ignores nulls.
    top_codes = df_.loc[:, feature].value_counts().head(top).index
    map_values = dict(zip(top_codes, labels))
    # Values outside the top list (and nulls) map to NaN, which is then
    # replaced by the "other" label.
    mapped = df_.loc[:, feature].map(map_values)
    return mapped.where(mapped.notna(), other).astype('category')

In [None]:
feature = 'infection_b'
# Distinct raw values of the feature.
df[feature].unique()

In [None]:
# Frequency of each raw value.
df[feature].value_counts()

In [None]:
# Relabel the feature using its 5 most frequent values.
df[feature] = top_cat(df, feature, 5)

#### Sanity check

In [None]:
# NOTE(review): presumably run as a notebook cell, where only the last
# expression is displayed - the filtered frame below would then be discarded.
df.filter(regex=("infection.*"))
# Frequency of each label after relabelling.
df[feature].value_counts()

### Clean binary data

In [None]:
feature = 'sex'
# Encode the two labels as 0 (M) and 1 (F), stored as unsigned bytes.
sex_codes = {'M':0, 'F':1}
encoded = df[feature].replace(sex_codes)
df[feature] = encoded.astype(np.uint8)

#### Set all 'object' features which contain 2 values to binary

In [None]:
def convert_to_boolean(df):
    """Cast object columns with exactly two distinct values to uint8, in place.

    Columns whose values cannot be cast to an unsigned byte (for example text
    labels) are left untouched. The frame is modified directly; nothing is
    returned.
    """
    object_columns = df.columns[df.dtypes == 'object']
    for feature in object_columns:
        # value_counts() ignores nulls, so this is the number of distinct
        # non-null values.
        if len(df.loc[:, feature].value_counts()) != 2:
            continue
        try:
            converted = df.loc[:, feature].astype(np.uint8)
        except (ValueError, TypeError, OverflowError):
            # Not castable to uint8 - deliberately skipped.
            continue
        # Replace the column instead of writing through `df.loc[:, feature]`,
        # which sets values in place and would keep the object dtype.
        df[feature] = converted

#### Split data on arbitrary value  
e.g. 
- keep 1 and change rest to 0
- keep 0 and change rest to 1
- change below 0 => 0, above 0 => 1

In [None]:
# Print the per-column dtype / non-null summary before splitting features.
df.info()

In [None]:
feature = 'minirin_b'
arb_value = 1
# Keep `arb_value` as 1 and turn every other value into 0. The comparison uses
# `arb_value` instead of a repeated literal, and the result is assigned through
# df[feature] so the column really becomes uint8 (`df.loc[:, feature] = ...`
# sets in place and keeps the old dtype).
df[feature] = np.where(df[feature] == arb_value, 1, 0).astype(np.uint8)
df[feature].value_counts()

In [None]:
feature = 'loxen_b'
arb_value = 0
# Keep `arb_value` as 0 and turn every other value into 1. Assigned through
# df[feature] so the column really becomes uint8 (`df.loc[:, feature] = ...`
# sets in place and keeps the old dtype).
df[feature] = np.where(df[feature] != arb_value, 1, 0).astype(np.uint8)
df[feature].value_counts()

In [None]:
feature = 'weight_change'
# The split point is 0 (values above 0 become 1, the rest 0); `arb_value` now
# states that threshold and is actually used by the comparison.
arb_value = 0
# Assigned through df[feature] so the column really becomes uint8
# (`df.loc[:, feature] = ...` sets in place and keeps the old dtype).
df[feature] = np.where(df[feature] > arb_value, 1, 0).astype(np.uint8)
df[feature].value_counts()

## EDA

### Distribution and Frequency

In [None]:
# feature: column to filter on; values: accepted values of that column;
# by: grouping column; top_n: how many groups to keep in the next cell.
feature, values, by, top_n = 'target_label', [1,], 'sex', 5
# Non-null counts per column for the selected rows, grouped by `by`.
df[df[feature].isin(values)].groupby(by).count()

In [None]:
# Frequency of `by` among the rows whose `feature` is one of `values`, limited
# to the `top_n` largest groups. The filter uses the `values` list defined with
# the other parameters (there is no `value` variable), and nlargest() already
# returns its result in descending order, so no separate sort is needed.
(df.loc[df[feature].isin(values), by]
   .value_counts()
   .nlargest(top_n))

#### Singleton distributions

In [None]:
def get_singletons(df):
    """Return the names of the columns holding fewer than two distinct non-null values."""
    singletons = []
    for column in df.columns:
        distinct = df.loc[:, column].value_counts()
        if distinct.size < 2:
            singletons.append(column)
    return singletons

In [None]:
# Summary statistics of the single-valued features only.
df[get_singletons(df)].describe()

In [None]:
# Remove single value features
# Drop every feature that holds a single value only
df.drop(columns=get_singletons(df), inplace=True)

### Remove highly skewed features by threshold

In [None]:
# Share of rows that a single value may occupy before the feature is reported.
freq_threshold = 0.10
# Set to True to also remove the reported features from the frame.
drop = False
for feature in list(df.columns):
    try:
        # Share of the most frequent value, computed once per feature.
        top_share = df.loc[:, feature].value_counts().max() / len(df)
    except TypeError:
        # Cells that cannot be counted (e.g. unhashable values) - skip the feature.
        continue
    # A NaN share (column without any counted value) compares False and is skipped.
    if not top_share > freq_threshold:
        continue
    print('Feature {}: {}% in one value'.format(
        feature, top_share * 100))
    if drop:
        print('Feature {} ({}% in one value) is dropped'.format(
            feature, top_share * 100))
        df.drop(feature, axis=1, inplace=True)

## Binary vars

## Dummy vars

In [None]:
feature = 'monitor'
# One-hot encode the feature; drop_first=True omits the first level's column.
df = pd.get_dummies(df, columns=[feature], drop_first=True)
df

## Named dummies
 - collect by value
 - split to dummy vars
 - name as per category value

In [None]:
feature = 'germes'
# Distinct raw values of the feature.
df[feature].unique()

In [None]:
# Normalise every cell to a sorted, ','-joined list of tokens.
df[feature] = clean_feature(df, feature, ',')
# Split once into token lists so each category is matched as a whole token;
# a substring test would also flag e.g. '1' inside '12'.
tokens = df[feature].str.split(',')
for cat in ['1','2','3','4','5','6','7','8','ND']:
    feature_ = '{}_{}'.format(feature, cat)
    # Non-list cells (nulls) never contain the category.
    df[feature_] = tokens.apply(
        lambda parts: isinstance(parts, list) and cat in parts).astype(np.uint8)

#### Sanity check

In [None]:
# Show the original feature next to the dummy columns derived from it.
df.filter(regex=("germes.*"))

#### Delete obsolete feature

In [None]:
# Remove the source column now that its dummy columns exist.
del df[feature]