In [2]:
mapping = {
    'Freezing': 0,
    'Warm': 1,
    'Cold': 2,
    'Boiling Hot': 3,
    'Hot': 4,
    'Lava Hot': 5
}

In [3]:
import pandas as pd
df = pd.read_csv('../input/cat_train.csv')

In [4]:
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [5]:
df.loc[:, 'ord_2'] = df.ord_2.map(mapping)

  df.loc[:, 'ord_2'] = df.ord_2.map(mapping)


In [6]:
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [8]:
# Lable Encoding, i.e., converting each category to a number
from sklearn import preprocessing

df = pd.read_csv('../input/cat_train.csv')
# fill NaN values in ord_2 column
df.loc[:, 'ord_2'] = df.ord_2.fillna('NONE')

# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# fit label encoder and transform values on ord_2 column
# fit first and then transform
lbl_enc.fit(df.ord_2)
df.loc[:, 'ord_2'] = lbl_enc.transform(df.ord_2)

  df.loc[:, 'ord_2'] = lbl_enc.transform(df.ord_2)


In [9]:
df.ord_2.value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

In tree-based models, we can directly use:
- Decision trees
- Random forest
- Extra trees
- OR any kind of boosted trees model:
    - XGBoost
    - GBM
    - LightGBM

This type of encoding cannot be used in linear models, support vector machines or neural networks as they expect data to be normalized (or standardized).

For these types of models, we can $\textbf{binarize}$ the data.

In [11]:
import numpy as np

example = np.array(
    [
        [0,0,1],
        [1,0,0],
        [1,0,1]
    ]
)

# print size in bytes
print(example.nbytes)

72


sparse format is a representation of only storing valuable data in memory.

In [12]:
import numpy as np
from scipy import sparse

example = np.array(
    [
        [0,0,1],
        [1,0,0],
        [1,0,1]
    ]
)

sparse = sparse.csr_matrix(example)

print(sparse.data.nbytes)

32


In [13]:
print(sparse.data.nbytes + sparse.indptr.nbytes + sparse.indices.nbytes)

64


The difference in size becomes vast when we have much larger arrays, let’s say with thousands of samples and tens of thousands of features.

In [14]:
import numpy as np
from scipy import sparse

n_rows = 10000
n_cols = 100000

example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))
print(f"Size of dense array: {example.nbytes}")

sparse_example = sparse.csr_matrix(example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

full_size = (
    sparse_example.data.nbytes + 
    sparse_example.indptr.nbytes + 
    sparse_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 399948688
Full size of sparse array: 599963036


One Hot Encoding: a binary encoding too in the sense that there are only two values, 0s and 1s. This is not a binary representation.

The vector size has to be same as number of categories we are looking at.

Each vector has a 1 and rest all other values are 0s.


In [16]:
example = np.array([
    [0, 0, 0, 0, 1],
    [0, 1, 0, 0, 0],
    [1, 0, 0, 0, 0]
])

print(f'size of dense array: {example.nbytes}')

sparse_example = sparse.csr_matrix(example)
print(f'size of sparse array: {sparse_example.data.nbytes}')

full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes +
    sparse_example.indices.nbytes
)
print(f'full size of sparse array: {full_size}')

size of dense array: 120
size of sparse array: 24
full size of sparse array: 52


In [2]:
import numpy as np
from scipy import sparse
from sklearn import preprocessing

example = np.random.randint(1000, size=100000)

ohe = preprocessing.OneHotEncoder(sparse=False)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

print(f"Size of dense array: {example.nbytes}")

ohe = preprocessing.OneHotEncoder(sparse=True)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

print(f"Size of sparse array: {ohe_example.data.nbytes}")

full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")



Size of dense array: 800000
Size of sparse array: 800000
Full size of sparse array: 1600004


In [3]:
import pandas as pd
df = pd.read_csv('../input/cat_train.csv')
df[df.ord_2 == 'Boiling Hot'].shape

(84790, 25)

In [6]:
df.groupby(['ord_2'])['id'].count()

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
Warm           124239
Name: id, dtype: int64

In [7]:
df.groupby(['ord_2'])["id"].transform('count')

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [8]:
df['new_feature'] = (
    df.ord_1.astype(str) + '_' + df.ord_2.astype(str)
)
df.new_feature

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

# Note
Whenver you got categorical variables, follow these steps:
- fill the NaN values !!!
- convert them to integers by applying label encoding using LableEncoder of scikit-learn or by using a mapping dictionary.
- create on-hot encoding.
- go for modelling.

In [9]:
df.ord_2.fillna('NONE').value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [11]:
import pandas as pd
from sklearn import preprocessing

train = pd.read_csv('../input/cat_train.csv')

test = pd.read_csv('../input/cat_test.csv')

# create a fake target column for test data
# since this column doesn't exist
test.loc[:, 'target'] = -1

# concatenate both training and test data
data = pd.concat([train, test]).reset_index(drop=True)

# make a list of features we are interested in
# id and target is something we should not encode
features = [x for x in train.columns if x not in ['id', 'target']]

# loop over the features list
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    # note the trick here
    # since its categorical data, we fillna with a string
    # and we convert all the data to string type
    # so, no matter its int or float, its converted to string
    # int/float but categorical!!!
    temp_col = data[feat].fillna('NONE').astype(str).values
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)

# split the training and test data again
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
  data.loc[:, feat] = lbl_en

In [12]:
df.ord_4.fillna('NONE').value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

In [14]:
df.ord_4 = df.ord_4.fillna('NONE')
df.loc[df['ord_4'].value_counts()[df['ord_4']].values < 2000, 'ord_4'] = 'RARE'
df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

In [15]:
import pandas as pd
df = pd.read_csv('../input/cat_train_folds.csv')

df.kfold.value_counts()

0    120000
1    120000
2    120000
3    120000
4    120000
Name: kfold, dtype: int64

In [16]:
# check the target distribution 
print(df[df.kfold == 0].target.value_counts())
print(df[df.kfold == 1].target.value_counts())
print(df[df.kfold == 2].target.value_counts())
print(df[df.kfold == 3].target.value_counts())
print(df[df.kfold == 4].target.value_counts())

0    97535
1    22465
Name: target, dtype: int64
0    97535
1    22465
Name: target, dtype: int64
0    97535
1    22465
Name: target, dtype: int64
0    97536
1    22464
Name: target, dtype: int64
0    97536
1    22464
Name: target, dtype: int64
