# Video: Examples of Feature Extraction in Scikit-Learn

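This video demonstrates several of the feature extraction methods available in scikit-learn: `DictVectorizer` and `FeatureHasher` for dictionary-style records, and `TfidfVectorizer` for text documents.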

In [None]:
import numpy as np
import sklearn.feature_extraction

In [None]:
# Two example records mixing a string-valued field ("fruit") with numeric
# fields; "yellow" and "green" each appear in only one of the records.
dict_data = [
    {"fruit": "mango", "yellow": 2, "rating": 5},
    {"fruit": "pear", "green": 4, "rating": 3},
]

In [None]:
# DictVectorizer turns a list of {feature: value} dicts into a numeric matrix.
# Fitting learns the set of output columns from the records in dict_data.
dict_vectorizer = sklearn.feature_extraction.DictVectorizer()
dict_vectorizer.fit(dict_data)

In [None]:
# Column names learned during fit. The string-valued "fruit" field shows up as
# one "fruit=<value>" column per distinct value; numeric fields keep their key.
dict_vectorizer.feature_names_

['fruit=mango', 'fruit=pear', 'green', 'rating', 'yellow']

In [None]:
# transform() encodes the records using the learned columns. The result is a
# SciPy sparse matrix, so only the non-zero entries are stored.
dict_vectorizer.transform(dict_data)

<2x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [None]:
# Densify the sparse result so every entry is visible; columns follow the
# order of dict_vectorizer.feature_names_.
# NOTE(review): .todense() yields a numpy.matrix (see the displayed repr), a
# class NumPy no longer recommends; consider .toarray() for a plain ndarray.
dict_vectorizer.transform(dict_data).todense()

matrix([[1., 0., 0., 5., 2.],
        [0., 1., 4., 3., 0.]])

In [None]:
# FeatureHasher maps feature names to column indices by hashing rather than by
# learning a vocabulary, so the output width is fixed up front (10 columns
# here) and the notebook calls transform() on it without a prior fit().
feature_hasher = sklearn.feature_extraction.FeatureHasher(n_features=10)

In [None]:
# Hash the same records into 10 columns and densify for display. Columns are
# hash buckets, not named features; with this few buckets, distinct features
# may collide and have their values combined in a single column.
feature_hasher.transform(dict_data).todense()

matrix([[0., 0., 5., 2., 0., 0., 0., 1., 0., 0.],
        [0., 0., 3., 0., 5., 0., 0., 0., 0., 0.]])

In [None]:
# Small text corpus for the TF-IDF example; each string is one document.
documents = [
    "mangos are the best.",
    "mango mango mango",
    "pears are ok",
    "apples are decent",
]

In [None]:
# TfidfVectorizer converts raw text into TF-IDF weighted term vectors.
# Fitting learns the vocabulary (one column per distinct token) from documents.
text_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
text_vectorizer.fit(documents)

In [None]:
# Vocabulary learned from the corpus, in column order. Note that "mango" and
# "mangos" are separate tokens: no stemming is applied.
text_vectorizer.get_feature_names_out()

array(['apples', 'are', 'best', 'decent', 'mango', 'mangos', 'ok',
       'pears', 'the'], dtype=object)

In [None]:
# One row per document, one column per vocabulary term; densified for display.
# Each row is scaled to unit length (TfidfVectorizer's default L2 norm), which
# is why the "mango mango mango" row is a single 1.0.
text_vectorizer.transform(documents).todense()

matrix([[0.        , 0.34578314, 0.5417361 , 0.        , 0.        ,
         0.5417361 , 0.        , 0.        , 0.5417361 ],
        [0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.41137791, 0.        , 0.        , 0.        ,
         0.        , 0.64450299, 0.64450299, 0.        ],
        [0.64450299, 0.41137791, 0.        , 0.64450299, 0.        ,
         0.        , 0.        , 0.        , 0.        ]])