### Bag of N-Grams

In [3]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [4]:
# Toy training corpus: nine short text fragments about data mining.
# Each list element is treated as one document by the vectorizer.
train_text = [
    'Data mining is the process of discovering patterns in large data sets',
    'involving methods at the intersection of machine learning, statistics, and database systems.',
    'Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal',
    'to extract information (with intelligent methods) from a dataset',
    'transform the information into a comprehensible structure for further use.',
    'Data mining is the analysis step of the "knowledge discovery in databases" process or KDD.',
    'Aside from the raw analysis step, it also involves database and data management aspects',
    'data pre-processing, model and inference considerations, interestingness metrics, ',
    'complexity considerations, post-processing of discovered structures, visualization, and online updating.',
]

In [5]:
# Bigram-only count vectorizer:
#   * stop_words   - NLTK's English stopword list (stopwords.words('english')
#                    returns it as a plain list of strings)
#   * ngram_range  - (2, 2) keeps only n-grams of exactly two tokens
# NOTE: the NLTK stopwords corpus must already be available locally
# (e.g. via nltk.download('stopwords')) or the lookup below will fail.
nGram_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    ngram_range=(2, 2),
)

In [6]:
# Learn the bigram vocabulary from train_text and display the resulting
# sparse document-term count matrix (one row per document).
nGram_vectorizer.fit_transform(train_text)

<9x59 sparse matrix of type '<class 'numpy.int64'>'
	with 62 stored elements in Compressed Sparse Row format>

In [7]:
# List the learned bigrams in column order (index i is column i of the matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the displayed value a plain Python list as before.
nGram_vectorizer.get_feature_names_out().tolist()

['also involves',
 'analysis step',
 'aside raw',
 'complexity considerations',
 'comprehensible structure',
 'computer science',
 'considerations interestingness',
 'considerations post',
 'data management',
 'data mining',
 'data pre',
 'data sets',
 'database data',
 'database systems',
 'databases process',
 'discovered structures',
 'discovering patterns',
 'discovery databases',
 'extract information',
 'inference considerations',
 'information comprehensible',
 'information intelligent',
 'intelligent methods',
 'interdisciplinary subfield',
 'interestingness metrics',
 'intersection machine',
 'involves database',
 'involving methods',
 'knowledge discovery',
 'large data',
 'learning statistics',
 'machine learning',
 'management aspects',
 'methods dataset',
 'methods intersection',
 'mining analysis',
 'mining interdisciplinary',
 'mining process',
 'model inference',
 'online updating',
 'overall goal',
 'patterns large',
 'post processing',
 'pre processing',
 'process dis

In [8]:
# Mapping from each learned bigram to its column index in the count matrix
# (e.g. 'data mining' -> 9, matching its position in the feature-name list).
nGram_vectorizer.vocabulary_

{'data mining': 9,
 'mining process': 37,
 'process discovering': 44,
 'discovering patterns': 16,
 'patterns large': 41,
 'large data': 29,
 'data sets': 11,
 'involving methods': 27,
 'methods intersection': 34,
 'intersection machine': 25,
 'machine learning': 31,
 'learning statistics': 30,
 'statistics database': 50,
 'database systems': 13,
 'mining interdisciplinary': 36,
 'interdisciplinary subfield': 23,
 'subfield computer': 56,
 'computer science': 5,
 'science statistics': 49,
 'statistics overall': 51,
 'overall goal': 40,
 'extract information': 18,
 'information intelligent': 21,
 'intelligent methods': 22,
 'methods dataset': 33,
 'transform information': 57,
 'information comprehensible': 20,
 'comprehensible structure': 4,
 'structure use': 54,
 'mining analysis': 35,
 'analysis step': 1,
 'step knowledge': 53,
 'knowledge discovery': 28,
 'discovery databases': 17,
 'databases process': 14,
 'process kdd': 45,
 'aside raw': 2,
 'raw analysis': 48,
 'step also': 52,
 

In [9]:
# Encode the training sentences as bigram-count vectors using the vocabulary
# already learned by the fit above (transform does not re-fit).
transformed_vector = nGram_vectorizer.transform(train_text)

In [10]:
# (rows, columns) = (9 sentences, 59 bigrams in the learned vocabulary)
transformed_vector.shape

(9, 59)

In [11]:
# Convert the sparse matrix to a dense NumPy array for inspection:
# one row per sentence, one column per bigram, entries are occurrence counts.
transformed_vector.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 

In [12]:
# Recover, for each document, the bigrams that have a non-zero count in its
# vector. This does NOT rebuild the original sentence: word order, stopwords
# and punctuation are not stored in the bag-of-n-grams representation.
nGram_vectorizer.inverse_transform(transformed_vector)

[array(['data mining', 'data sets', 'discovering patterns', 'large data',
        'mining process', 'patterns large', 'process discovering'],
       dtype='<U30'),
 array(['database systems', 'intersection machine', 'involving methods',
        'learning statistics', 'machine learning', 'methods intersection',
        'statistics database'], dtype='<U30'),
 array(['computer science', 'data mining', 'interdisciplinary subfield',
        'mining interdisciplinary', 'overall goal', 'science statistics',
        'statistics overall', 'subfield computer'], dtype='<U30'),
 array(['extract information', 'information intelligent',
        'intelligent methods', 'methods dataset'], dtype='<U30'),
 array(['comprehensible structure', 'information comprehensible',
        'structure use', 'transform information'], dtype='<U30'),
 array(['analysis step', 'data mining', 'databases process',
        'discovery databases', 'knowledge discovery', 'mining analysis',
        'process kdd', 'step knowledg

In [13]:
# Vectorize an unseen sentence with the already-fitted vocabulary.
# Bigrams that were not seen during fitting are ignored, so only
# 'database data' (column 12) gets a non-zero count here.
test_text = ['this is just a sample text for testing test_text database data']
nGram_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])