In [9]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [10]:
# Toy corpus about data mining: nine text fragments, each treated as one
# "document" by the vectorizer.
train_text = [
    'Data mining is the process of discovering patterns in large data sets',
    'involving methods at the intersection of machine learning, statistics, and database systems.',
    'Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal',
    'to extract information (with intelligent methods) from a dataset',
    'transform the information into a comprehensible structure for further use.',
    'Data mining is the analysis step of the "knowledge discovery in databases" process or KDD.',
    'Aside from the raw analysis step, it also involves database and data management aspects',
    'data pre-processing, model and inference considerations, interestingness metrics, ',
    'complexity considerations, post-processing of discovered structures, visualization, and online updating.',
]

In [11]:
# TF-IDF vectorizer with all-default settings (no arguments are passed, so no
# custom stop-word list or tokenizer is configured here).
tfidf_vectorizer = TfidfVectorizer()

In [14]:
# Fit the vectorizer on the corpus (populating vocabulary_ and idf_) and get
# back the TF-IDF matrix for the same sentences in one call.
transformed_vector = tfidf_vectorizer.fit_transform(train_text)

In [15]:
# Learned vocabulary: maps each (lower-cased) term to its column index in the
# feature matrix, e.g. 'data' -> 11.
tfidf_vectorizer.vocabulary_

{'data': 11,
 'mining': 43,
 'is': 33,
 'the': 63,
 'process': 52,
 'of': 45,
 'discovering': 16,
 'patterns': 49,
 'in': 23,
 'large': 37,
 'sets': 56,
 'involving': 32,
 'methods': 41,
 'at': 6,
 'intersection': 29,
 'machine': 39,
 'learning': 38,
 'statistics': 57,
 'and': 3,
 'database': 12,
 'systems': 62,
 'an': 1,
 'interdisciplinary': 27,
 'subfield': 61,
 'computer': 9,
 'science': 55,
 'with': 69,
 'overall': 48,
 'goal': 22,
 'to': 64,
 'extract': 18,
 'information': 25,
 'intelligent': 26,
 'from': 20,
 'dataset': 14,
 'transform': 65,
 'into': 30,
 'comprehensible': 8,
 'structure': 59,
 'for': 19,
 'further': 21,
 'use': 67,
 'analysis': 2,
 'step': 58,
 'knowledge': 36,
 'discovery': 17,
 'databases': 13,
 'or': 47,
 'kdd': 35,
 'aside': 4,
 'raw': 54,
 'it': 34,
 'also': 0,
 'involves': 31,
 'management': 40,
 'aspects': 5,
 'pre': 51,
 'processing': 53,
 'model': 44,
 'inference': 24,
 'considerations': 10,
 'interestingness': 28,
 'metrics': 42,
 'complexity': 7,
 'p

In [16]:
# Create TF-IDF feature vectors for the training text using the vectorizer
# fitted earlier. This rebinds transformed_vector, which the fit_transform
# call on the same train_text had already set.
transformed_vector = tfidf_vectorizer.transform(train_text)

In [19]:
# (rows, columns) = (sentences, vocabulary terms): 9 sentences and 70 distinct
# words in our vocabulary.
transformed_vector.shape

(9, 70)

In [18]:
# Convert the sparse result to a dense array for inspection: one row per
# sentence, one column per vocabulary term.
transformed_vector.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.39679111, 0.        , 0.        , 0.        ,
        0.        , 0.34266091, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28941686, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.25163961, 0.        ,
        0.        , 0.        , 0.34266091, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.25163961, 0.        ,
        0.19839556, 0.        , 0.        , 0.        , 0.34266091,
        0.        , 0.        , 0.28941686, 0.        , 0.        ,
        0.        , 0.34266091, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.19839556, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.1

In [25]:
# IDF score of each individual word, indexed by vocabulary column: frequent
# words have low scores and rare words have high scores.
tfidf_vectorizer.idf_

array([2.60943791, 2.60943791, 2.2039728 , 1.51082562, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.2039728 , 1.51082562, 2.2039728 , 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.2039728 , 2.60943791, 2.60943791, 2.2039728 , 2.60943791,
       2.2039728 , 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 1.91629073, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 2.2039728 , 2.60943791, 1.91629073, 2.60943791,
       1.51082562, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 2.2039728 , 2.2039728 , 2.60943791,
       2.60943791, 2.60943791, 2.2039728 , 2.2039728 , 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 1.51082562, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.2039728 ])

In [27]:
# Pair every vocabulary term with its IDF score. For example, 'data' (~1.51)
# is a common word in our corpus, while 'discovering' (~2.61) is not.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases that
# do not provide it yet.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
list(zip(feature_names, tfidf_vectorizer.idf_))

[('also', 2.6094379124341005),
 ('an', 2.6094379124341005),
 ('analysis', 2.203972804325936),
 ('and', 1.5108256237659907),
 ('aside', 2.6094379124341005),
 ('aspects', 2.6094379124341005),
 ('at', 2.6094379124341005),
 ('complexity', 2.6094379124341005),
 ('comprehensible', 2.6094379124341005),
 ('computer', 2.6094379124341005),
 ('considerations', 2.203972804325936),
 ('data', 1.5108256237659907),
 ('database', 2.203972804325936),
 ('databases', 2.6094379124341005),
 ('dataset', 2.6094379124341005),
 ('discovered', 2.6094379124341005),
 ('discovering', 2.6094379124341005),
 ('discovery', 2.6094379124341005),
 ('extract', 2.6094379124341005),
 ('for', 2.6094379124341005),
 ('from', 2.203972804325936),
 ('further', 2.6094379124341005),
 ('goal', 2.6094379124341005),
 ('in', 2.203972804325936),
 ('inference', 2.6094379124341005),
 ('information', 2.203972804325936),
 ('intelligent', 2.6094379124341005),
 ('interdisciplinary', 2.6094379124341005),
 ('interestingness', 2.6094379124341005)

In [28]:
# Recover, for each sentence, the terms that have a non-zero weight in its
# transformed vector.
# The original order of the words is lost, and a repeated word (e.g. 'data' in
# the first sentence) is listed only once.
tfidf_vectorizer.inverse_transform(transformed_vector)

[array(['the', 'sets', 'process', 'patterns', 'of', 'mining', 'large',
        'is', 'in', 'discovering', 'data'], dtype='<U17'),
 array(['the', 'systems', 'statistics', 'of', 'methods', 'machine',
        'learning', 'involving', 'intersection', 'database', 'at', 'and'],
       dtype='<U17'),
 array(['with', 'subfield', 'statistics', 'science', 'overall', 'of',
        'mining', 'is', 'interdisciplinary', 'goal', 'data', 'computer',
        'and', 'an'], dtype='<U17'),
 array(['with', 'to', 'methods', 'intelligent', 'information', 'from',
        'extract', 'dataset'], dtype='<U17'),
 array(['use', 'transform', 'the', 'structure', 'into', 'information',
        'further', 'for', 'comprehensible'], dtype='<U17'),
 array(['the', 'step', 'process', 'or', 'of', 'mining', 'knowledge', 'kdd',
        'is', 'in', 'discovery', 'databases', 'data', 'analysis'],
       dtype='<U17'),
 array(['the', 'step', 'raw', 'management', 'it', 'involves', 'from',
        'database', 'data', 'aspects', 'as