In [2]:
import pandas as pd

In [2]:
df = pd.read_csv("../input/train.csv")

In [4]:
# Print the distinct values of every column to get a feel for the data.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(distinct_values)

[     0      1      2 ... 299997 299998 299999]
[0 1]
[0 1]
[0 1]
['T' 'F']
['Y' 'N']
['Green' 'Blue' 'Red']
['Triangle' 'Trapezoid' 'Polygon' 'Square' 'Star' 'Circle']
['Snake' 'Hamster' 'Lion' 'Cat' 'Dog' 'Axolotl']
['Finland' 'Russia' 'Canada' 'Costa Rica' 'China' 'India']
['Bassoon' 'Piano' 'Theremin' 'Oboe']
['50f116bcf' 'b3b4d25d0' '3263bdce5' 'f12246592' '5b0f5acd5' '46cab09da'
 'be5592604' '72f8028dc' '4604905e7' 'ad95dc0ee' '2ff007c26' 'a35c346aa'
 'dbfb714a4' 'e1558b071' '39647c92a' 'ee55b9d67' '416a8f3ab' '91bde92fa'
 '3aa9329e2' '568550f04' 'caf83c0b5' '908a1b9c9' 'e844a1f66' 'dbc448931'
 '2979f0d45' '96c73114c' 'b7bb45938' '7da3e4aec' 'be4578201' 'dc07effb0'
 '88917a066' 'f2d59cf51' '5d18641ff' '9347491f2' '475e79160' 'e70a6270d'
 '5b1a9f841' 'e0efe9d20' 'd1b1709e8' '9bb7ea2da' '6fec43dd8' '0dee9b39a'
 '527ded12a' '321bf770e' 'f7821e391' '0de4acd31' '185ba0a59' '4845cc770'
 'bdb9fef4a' '3fef1a765' '2cc9e16b9' 'fd04a970f' '81f9d3312' '30a15b6bd'
 '8dcf8adfb' '83b38aa6b' '74

In [6]:
df['ord_2'].unique()

array(['Cold', 'Hot', 'Lava Hot', 'Boiling Hot', 'Freezing', 'Warm'],
      dtype=object)

#### label encoding 
Just change the category to number

In [8]:
# Integer code for each ord_2 category: a category's code is its position
# in the tuple below (Freezing -> 0, ..., Lava Hot -> 5).
mapping = dict(
    zip(
        ("Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot"),
        range(6),
    )
)

In [9]:
# Replace each ord_2 category with its integer code from `mapping`
# (values missing from `mapping`, including NaN, become NaN).
# Assign the column directly rather than through df.loc[:, ...]: on
# pandas 2.x a full-column .loc assignment writes into the existing
# object-dtype column in place, so the codes would not get a numeric dtype.
df["ord_2"] = df["ord_2"].map(mapping)

In [12]:
# We can do this using sklearn.preprocessing.LabelEncoder

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
df = pd.read_csv("../input/train.csv")

In [16]:
# fill NaN values in ord_2 column
df.loc[:, "ord_2"] = df.loc[:, "ord_2"].fillna("NONE") 

In [17]:
# instantiate the object of class labelencoder
lbl = LabelEncoder()

In [19]:
# Fit the encoder on ord_2 and replace the strings with their integer codes.
# Assign the column directly rather than through df.loc[:, ...]: on
# pandas 2.x a full-column .loc assignment writes into the existing
# object-dtype column in place, so the codes would stay object-typed.
df["ord_2"] = lbl.fit_transform(df["ord_2"].values)

#### This can be used directly in lots of tree-based models
* Decision Trees
* Random Forest
* Extra Trees
* Or any Kind of boosted trees model
    * XgBoost
    * GBM
    * LightGBM

This can't be used directly in linear models, support vector machines or neural networks as they expect data to be normalized (or standardized)

For that we binarize the data

--------------------------------------
* Freezing      --> 0 --> 0 0 0
* Warm          --> 1 --> 0 0 1
* Cold          --> 2 --> 0 1 0
* Boiling Hot   --> 3 --> 0 1 1
* Hot           --> 4 --> 1 0 0
* Lava Hot      --> 5 --> 1 0 1

-------------------------------------------

Binarized variables are easy to store in a sparse format.
A sparse format records only the positions where the value is 1 (the non-zero entries); everything else is implicitly 0.

#### Example
| Index      | Feature |
| ----------- | ----------- |
| 0      | Warm       |
| 1   | Hot        |
|2 | Lava Hot |

Can be represented as 

|Index | Feature_0 | Feature_1 |Feature_2 |
|-----|---------|--------|-------|
|0| 0 |0| 1|
|1| 1| 0| 0|
|2| 1| 0| 1|

In [21]:
# So our features are stored in a matrix with 3 rows and 3 columns (3 * 3).
# Each element of this matrix occupies 8 bytes. So, our total memory
# requirement for this array is 8*3*3 = 72 bytes

In [24]:
import numpy as np

In [26]:
# 3-bit binary encodings of the codes 1, 4 and 5, one row per code,
# most significant bit first.
example = np.array(
    [[int(bit) for bit in format(code, "03b")] for code in (1, 4, 5)]
)

In [27]:
example.nbytes

72

In [29]:
from scipy import sparse

In [30]:
# let's convert this array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

In [32]:
sparse_example

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [35]:
sparse_example.data.nbytes

32

In [36]:
# Actually, the total size of the sparse CSR matrix is the sum of three values
print(
        sparse_example.data.nbytes + 
        sparse_example.indptr.nbytes + 
        sparse_example.indices.nbytes
)

64


In [37]:
# This difference doesn't look like much, but when we have hundreds of
# thousands of rows it becomes significant
import numpy as np
from scipy import sparse

In [48]:
# Build a large, mostly-zero binary matrix to compare dense vs. sparse storage.
# NOTE: the dense array holds n_rows * n_cols elements; the recorded output
# reports 8000000000 bytes for it, so this cell needs roughly 8 GB of RAM.
n_rows = 10000
n_cols = 100000

# Every entry is an independent draw that equals 1 with probability 0.05.
# No random seed is set, so the sparse byte counts differ slightly per run.
example  = np.random.binomial(1, p = 0.05, size = (n_rows, n_cols))

print(f"Size of dense array: {example.nbytes}")

# Note: this converts to CSC (compressed sparse column), not CSR.
sparse_example = sparse.csc_matrix(example)

# .data is only the array of stored values; the index arrays are not counted.
print(f"size of sparse array: {sparse_example.data.nbytes}")

# Full footprint of the sparse matrix = stored values + indptr + indices.
full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes +
    sparse_example.indices.nbytes
)

print(f"Full size of sparse array:  {full_size}")

Size of dense array: 8000000000
size of sparse array: 399991624
Full size of sparse array:  600387440


So, the dense array takes about 8 GB of memory. The sparse array, on
the other hand, needs only about 400 MB for its stored values, and about 600 MB in total once the index arrays are included.

In [49]:
# one hot encoding
# One-hot rows for categories 4, 1 and 0 (out of 6): each row is the
# matching row of the 6x6 identity matrix.
example = np.eye(6, dtype=int)[[4, 1, 0]]

In [52]:
print(f"size of the dense array: {example.nbytes}")

size of the dense array: 144


In [53]:
sparse_example = sparse.csr_matrix(example)
print(f"size of the sparse array: {sparse_example.data.nbytes}")

size of the sparse array: 24


In [54]:
full_size = (
        sparse_example.data.nbytes + 
        sparse_example.indptr.nbytes + 
        sparse_example.indices.nbytes
)

In [56]:
print(f"Full size of sparse array : {full_size}")

Full size of sparse array : 52


In [57]:
from sklearn import preprocessing

In [58]:
example = np.random.randint(1000, size = 1000000)

In [59]:
# Encoder configured for dense output (sparse disabled).
# NOTE(review): scikit-learn 1.2 renamed this keyword to `sparse_output` and
# 1.4 removed `sparse`; on newer versions pass `sparse_output=False` instead.
ohe = preprocessing.OneHotEncoder(sparse = False)

In [60]:
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

In [61]:
print(f"size of dense array: {ohe_example.nbytes}")

size of dense array: 8000000000


In [62]:
# Encoder configured for sparse output.
# NOTE(review): scikit-learn 1.2 renamed this keyword to `sparse_output` and
# 1.4 removed `sparse`; on newer versions pass `sparse_output=True` instead.
ohe = preprocessing.OneHotEncoder(sparse=True)

In [63]:
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

In [68]:
print(f"size of sparse array: {ohe_example.data.nbytes}")

size of sparse array: 8000000


In [69]:
full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes + ohe_example.indices.nbytes
)

In [71]:
print(f"The Full size of sparse array: {full_size}")

The Full size of sparse array: 16000004


8 GB of dense data was reduced to 8 MB of stored values (about 16 MB in total including the index arrays).
So, these are the most popular methods to convert categorical data:
* LabelEncoder
* BinaryEncoding
* OnehotEncoding

In [74]:
df.groupby(['ord_2'])["id"].count()

ord_2
0    60627
1    33768
2    99816
3    22227
4    63908
5    19654
Name: id, dtype: int64

In [3]:
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

In [4]:
test_data.loc[:, "target"] = -1

In [5]:
data = pd.concat([train_data, test_data]).reset_index(drop = True)

In [6]:
features = [c for c in data.columns if c not in ["id", "target"]]

In [9]:
from sklearn import preprocessing

In [10]:
# Label-encode every feature column of the combined train+test frame, so
# both splits share the same category -> integer codes.
for feat in features:
    # Fresh encoder per column: each feature has its own set of categories.
    lbl_enc = preprocessing.LabelEncoder()
    # Missing values become the explicit category "NONE"; casting to str lets
    # numeric and string columns go through the same encoder.
    temp_col = data[feat].fillna("NONE").astype(str).values

    # Write the codes back to the column named by `feat`. Writing to the
    # literal name "feat" would leave every feature unencoded and just keep
    # overwriting one extra column. Direct column assignment (not .loc[:, ...])
    # also gives the column an integer dtype on pandas 2.x.
    data[feat] = lbl_enc.fit_transform(temp_col)
    

In [11]:
# Split the combined frame back apart: rows whose target is the placeholder
# -1 are test rows, all others are training rows. Both get a fresh 0..n-1 index.
train, test = (
    data.loc[row_mask].reset_index(drop=True)
    for row_mask in (data["target"] != -1, data["target"] == -1)
)