In [3]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [5]:
# Toy corpus: nine short text fragments about data mining.
# Each list element is handled as one separate document when vectorized.
train_text = [
    'Data mining is the process of discovering patterns in large data sets',
    'involving methods at the intersection of machine learning, statistics, and database systems.',
    'Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal',
    'to extract information (with intelligent methods) from a dataset',
    'transform the information into a comprehensible structure for further use.',
    'Data mining is the analysis step of the "knowledge discovery in databases" process or KDD.',
    'Aside from the raw analysis step, it also involves database and data management aspects',
    'data pre-processing, model and inference considerations, interestingness metrics, ',
    'complexity considerations, post-processing of discovered structures, visualization, and online updating.',
]

In [36]:
# Bag-of-words vectorizer that ignores English stop words.
# The stop-word list is supplied by NLTK: stopwords.words('english') returns it as a list.
count_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
)

In [37]:
# Learn the vocabulary (term -> column index) from the training documents.
# fit() hands back the vectorizer itself, which is why the cell displays its repr.
count_vectorizer.fit(train_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
# List the vocabulary terms ordered by their column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it and fall
# back to the legacy method otherwise. .tolist() turns the returned ndarray into
# a plain list of str, matching what the legacy call displayed.
(
    count_vectorizer.get_feature_names_out().tolist()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)

['also',
 'analysis',
 'aside',
 'aspects',
 'complexity',
 'comprehensible',
 'computer',
 'considerations',
 'data',
 'database',
 'databases',
 'dataset',
 'discovered',
 'discovering',
 'discovery',
 'extract',
 'goal',
 'inference',
 'information',
 'intelligent',
 'interdisciplinary',
 'interestingness',
 'intersection',
 'involves',
 'involving',
 'kdd',
 'knowledge',
 'large',
 'learning',
 'machine',
 'management',
 'methods',
 'metrics',
 'mining',
 'model',
 'online',
 'overall',
 'patterns',
 'post',
 'pre',
 'process',
 'processing',
 'raw',
 'science',
 'sets',
 'statistics',
 'step',
 'structure',
 'structures',
 'subfield',
 'systems',
 'transform',
 'updating',
 'use',
 'visualization']

In [35]:
# Learned mapping from each term to its column index in the count matrix.
# In the displayed output the indices follow the alphabetical order of the terms.
count_vectorizer.vocabulary_

{'data': 8,
 'mining': 33,
 'process': 40,
 'discovering': 13,
 'patterns': 37,
 'large': 27,
 'sets': 44,
 'involving': 24,
 'methods': 31,
 'intersection': 22,
 'machine': 29,
 'learning': 28,
 'statistics': 45,
 'database': 9,
 'systems': 50,
 'interdisciplinary': 20,
 'subfield': 49,
 'computer': 6,
 'science': 43,
 'overall': 36,
 'goal': 16,
 'extract': 15,
 'information': 18,
 'intelligent': 19,
 'dataset': 11,
 'transform': 51,
 'comprehensible': 5,
 'structure': 47,
 'use': 53,
 'analysis': 1,
 'step': 46,
 'knowledge': 26,
 'discovery': 14,
 'databases': 10,
 'kdd': 25,
 'aside': 2,
 'raw': 42,
 'also': 0,
 'involves': 23,
 'management': 30,
 'aspects': 3,
 'pre': 39,
 'processing': 41,
 'model': 34,
 'inference': 17,
 'considerations': 7,
 'interestingness': 21,
 'metrics': 32,
 'complexity': 4,
 'post': 38,
 'discovered': 12,
 'structures': 48,
 'visualization': 54,
 'online': 35,
 'updating': 52}

In [29]:
# Look up the column index assigned to a single term (here 'databases').
# vocabulary_ is displayed as a plain dict, so .get() yields None for a term that was not learned.
count_vectorizer.vocabulary_.get('databases')

10

In [40]:
# Encode every training document as a row of term counts over the fitted vocabulary.
transformed_vector = count_vectorizer.transform(train_text)

In [43]:
# (rows, columns) = (documents, vocabulary terms): the 9 strings in train_text
# by the 55 distinct non-stop-word terms learned during fit.
transformed_vector.shape

(9, 55)

In [45]:
# Expand the sparse count matrix into a dense array so the individual counts are visible.
# Each row is one document; each column is the count of one vocabulary term.
transformed_vector.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 

In [46]:
# Map each row back to the vocabulary terms that have a non-zero count in it.
# This does not rebuild the original sentence: word order, repeat counts and
# stop words are not recoverable (compare the output with train_text).
count_vectorizer.inverse_transform(transformed_vector)

[array(['data', 'discovering', 'large', 'mining', 'patterns', 'process',
        'sets'], dtype='<U17'),
 array(['database', 'intersection', 'involving', 'learning', 'machine',
        'methods', 'statistics', 'systems'], dtype='<U17'),
 array(['computer', 'data', 'goal', 'interdisciplinary', 'mining',
        'overall', 'science', 'statistics', 'subfield'], dtype='<U17'),
 array(['dataset', 'extract', 'information', 'intelligent', 'methods'],
       dtype='<U17'),
 array(['comprehensible', 'information', 'structure', 'transform', 'use'],
       dtype='<U17'),
 array(['analysis', 'data', 'databases', 'discovery', 'kdd', 'knowledge',
        'mining', 'process', 'step'], dtype='<U17'),
 array(['also', 'analysis', 'aside', 'aspects', 'data', 'database',
        'involves', 'management', 'raw', 'step'], dtype='<U17'),
 array(['considerations', 'data', 'inference', 'interestingness',
        'metrics', 'model', 'pre', 'processing'], dtype='<U17'),
 array(['complexity', 'considerations', 'd

In [50]:
# Vectorize an unseen document using the vocabulary learned from train_text.
# Words outside that vocabulary contribute nothing: in the output only
# 'data' (column 8) and 'database' (column 9) are counted.
test_text = ['this is just a sample text for testing test_text database data']
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])