## Порядковое кодирование

In [1]:
from sklearn.preprocessing import OrdinalEncoder


# each row is one sample holding a single categorical feature (a country code)
data = [[code] for code in ("US", "UK", "NZ")]
print(data)

[['US'], ['UK'], ['NZ']]


In [2]:
# encode the feature: learn the categories first, then map every sample
# to the ordinal code of its category
encoder = OrdinalEncoder()
encoder.fit(data)
result = encoder.transform(data)
print(result)

[[2.]
 [1.]
 [0.]]


## One-hot кодирование

In [3]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# one column of country codes, as a 2-D array (one sample per row)
data = np.asarray([["US"], ["UK"], ["NZ"]])
# handle_unknown="ignore": per the scikit-learn docs, categories not seen
# during fit are encoded as all-zero rows instead of raising an error
enc = OneHotEncoder(handle_unknown="ignore").fit(data)
# NOTE: a bare expression in the middle of a cell is evaluated but not shown
enc.categories_

# .toarray() turns the encoder's output into a dense array for display
onehotlabels = enc.transform(data).toarray()
onehotlabels

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

## Мешок слов
[Более подробную информацию смотрите здесь](https://okan.cloud/posts/2021-04-08-text-vectorization-using-python-term-document-matrix/)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# raise pandas' display limits: up to 50 columns and 200 characters per cell
for opt_name, opt_value in (
    ("display.max_columns", 50),
    ("display.max_colwidth", 200),
):
    pd.set_option(opt_name, opt_value)

In [5]:
# three short quotes about birds; each one is used as a separate document
str1 = (
    "Hold fast to dreams, for if dreams die, life is a broken-winged bird "
    "that cannot fly."
)
str2 = "No bird soars too high if he soars with his own wings."
str3 = (
    "A bird does not sing because it has an answer, it sings because it "
    "has a song."
)

In [6]:
# input data: the corpus, one string per document
flits = [str1, str2, str3]

In [7]:
# for illustration, show the flits as a one-column DataFrame
# (built from a dict so the column is named up front: no zip() into
# 1-tuples and no in-place rename of a positional column 0 afterwards)
doc = pd.DataFrame({"flits": flits})
doc

Unnamed: 0,flits
0,"Hold fast to dreams, for if dreams die, life is a broken-winged bird that cannot fly."
1,No bird soars too high if he soars with his own wings.
2,"A bird does not sing because it has an answer, it sings because it has a song."


In [8]:
# create a CountVectorizer instance; with binary=True each cell records only
# whether a term occurs in a document (1/0), not how many times it occurs
vect = CountVectorizer(binary=True)
# fit the vocabulary and transform the flits, i.e. build the "bag of words"
vects = vect.fit_transform(flits)

In [9]:
# build the dense bag-of-words frame (documents x terms) and its transpose,
# the term-document matrix (terms x documents)
# toarray() yields a plain ndarray; todense() would return the legacy
# numpy.matrix type, which is discouraged in new code
td = pd.DataFrame(
    vects.toarray(),
    columns=vect.get_feature_names_out(),
).iloc[:5]  # keeps at most the first five documents (presumably a preview cap)
count_vect_df = td
term_document_matrix = td.T
# one label per document actually present, instead of a hard-coded count of 3
term_document_matrix.columns = [f"flit_{i}" for i in range(1, len(td) + 1)]

In [10]:
# dense "bag of words" representation (one row per flit, one column per term)
count_vect_df

Unnamed: 0,an,answer,because,bird,broken,cannot,die,does,dreams,fast,fly,for,has,he,high,his,hold,if,is,it,life,no,not,own,sing,sings,soars,song,that,to,too,winged,wings,with
0,0,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,1
2,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0


In [11]:
# term-document matrix (one row per term, one column per flit)
term_document_matrix

Unnamed: 0,flit_1,flit_2,flit_3
an,0,0,1
answer,0,0,1
because,0,0,1
bird,1,1,1
broken,1,0,0
cannot,1,0,0
die,1,0,0
does,0,0,1
dreams,1,0,0
fast,1,0,0


In [12]:
# put the flits side by side with their dense representation
# (column-wise concatenation, rows are aligned on the index)
pd.concat([doc, count_vect_df], axis=1)

Unnamed: 0,flits,an,answer,because,bird,broken,cannot,die,does,dreams,fast,fly,for,has,he,high,his,hold,if,is,it,life,no,not,own,sing,sings,soars,song,that,to,too,winged,wings,with
0,"Hold fast to dreams, for if dreams die, life is a broken-winged bird that cannot fly.",0,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0
1,No bird soars too high if he soars with his own wings.,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,1
2,"A bird does not sing because it has an answer, it sings because it has a song.",1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0
