In [1]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Two tiny sample documents that form the corpus for the whole notebook.
doc1 = (
    "This is a very good and plain paper. "
    "this is really good and interesting"
)
doc2 = "This paper is very interesting, awesome"

In [5]:
from nltk.stem import PorterStemmer, SnowballStemmer

In [3]:
def clean_txt(sent):
    """Lower-case and tokenize ``sent``, then drop noise tokens.

    A token is discarded when it is an NLTK English stop word, a
    punctuation character, or shorter than three characters. No stemming
    is applied to the surviving tokens.

    Parameters
    ----------
    sent : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens, in original order, joined by single spaces.
    """
    tokens = word_tokenize(sent.lower())
    # A set makes each membership test O(1) instead of scanning a list.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    final_word = [
        term for term in tokens
        if len(term) > 2 and term not in stop_updated
    ]
    return " ".join(final_word)

In [6]:
# Clean the first document; the bare name on the last line displays the result.
doc1_clean = clean_txt(doc1)
doc1_clean

'good plain paper really good interesting'

In [7]:
# Clean the second document the same way.
doc2_clean = clean_txt(doc2)
doc2_clean

'paper interesting awesome'

In [8]:
# Gather the cleaned documents into a one-column frame, one row per document.
doc = pd.DataFrame({"text": [doc1_clean, doc2_clean]})
doc

Unnamed: 0,text
0,good plain paper really good interesting
1,paper interesting awesome


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Instantiate the count vectorizer with its default parameters.
count_vectorizer = CountVectorizer()

In [11]:
# fit       -> extracts the features (the bag-of-words vocabulary)
# transform -> computes the count of every extracted feature per document
# fit_transform does both in one call and returns a sparse matrix.
x = count_vectorizer.fit_transform(doc['text'])

In [12]:
x

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [13]:
# Displaying the sparse matrix only shows a summary (shape, dtype, number of
# stored elements); convert it to a dense array to see the actual counts.
x.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [14]:
# todense() holds the same values but is displayed as a `matrix`, not an `array`.
x.todense()

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [15]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
# It returns an ndarray, so list() keeps the plain-list display.
list(count_vectorizer.get_feature_names_out())

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [16]:
# Shows no output: the vectorizer was created without a stop-word list, so
# this evaluates to None (stop words were already removed by clean_txt).
count_vectorizer.get_stop_words()

In [17]:
count_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [24]:
print(count_vectorizer.vocabulary_)

{'good': 1, 'plain': 4, 'paper': 3, 'really': 5, 'interesting': 2, 'awesome': 0}


In [20]:
# Document-term matrix: one row per document, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
DTM = pd.DataFrame(x.toarray(), columns=count_vectorizer.get_feature_names_out())
DTM

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [25]:
DTM.T

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
# Instantiate the TF-IDF vectorizer with its default parameters.
tfidf_vect = TfidfVectorizer()

In [29]:
# Fit on the cleaned documents and compute their TF-IDF matrix in one call.
tfidf = tfidf_vect.fit_transform(doc['text'])

In [30]:
# TF-IDF weights as a labelled frame (rows = documents, columns = terms).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())

In [31]:
df_tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,0.755256,0.268685,0.268685,0.377628,0.377628
1,0.704909,0.0,0.501549,0.501549,0.0,0.0


#### Computing n-grams:
- unigram = one word at a time
- bigram = two words at a time
- trigram = three words at a time
#### Any combination of the above can be created
- By default, sklearn's vectorizers (CountVectorizer and TfidfVectorizer) work on unigrams, i.e. ngram_range is set to (1,1)
- to extract bigrams, use ngram_range = (2,2)
- to extract trigrams, use ngram_range = (3,3)
- to combine them, widen the range, e.g. ngram_range = (1,2) extracts unigrams and bigrams together

In [32]:
# Bigrams only: ngram_range=(2,2) makes every feature a pair of adjacent tokens.
count_vectorizer_bg = CountVectorizer(ngram_range=(2,2))
x_bg = count_vectorizer_bg.fit_transform(doc['text'])

In [33]:
# Bigram vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() replaces it and returns an
# ndarray, so list() keeps the plain-list display.
list(count_vectorizer_bg.get_feature_names_out())

['good interesting',
 'good plain',
 'interesting awesome',
 'paper interesting',
 'paper really',
 'plain paper',
 'really good']

In [34]:
# Bigram document-term matrix as a labelled frame.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
DTM_bg = pd.DataFrame(x_bg.toarray(), columns=count_vectorizer_bg.get_feature_names_out())

In [35]:
DTM_bg

Unnamed: 0,good interesting,good plain,interesting awesome,paper interesting,paper really,plain paper,really good
0,1,1,0,0,1,1,1
1,0,0,1,1,0,0,0


In [36]:
# Trigrams only: ngram_range=(3,3) makes every feature three adjacent tokens.
# NOTE: despite the "_bg1" suffix, these objects hold trigrams, not bigrams.
count_vectorizer_bg1 = CountVectorizer(ngram_range=(3,3))
x_bg1 = count_vectorizer_bg1.fit_transform(doc['text'])

In [37]:
# Trigram vocabulary. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so list()
# keeps the printed output in plain-list form.
print(list(count_vectorizer_bg1.get_feature_names_out()))

['good plain paper', 'paper interesting awesome', 'paper really good', 'plain paper really', 'really good interesting']


In [38]:
# Trigram document-term matrix as a labelled frame.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
DTM_bg1 = pd.DataFrame(x_bg1.toarray(), columns=count_vectorizer_bg1.get_feature_names_out())

In [39]:
DTM_bg1

Unnamed: 0,good plain paper,paper interesting awesome,paper really good,plain paper really,really good interesting
0,1,0,1,1,1
1,0,1,0,0,0


In [41]:
# Unigrams and bigrams together: ngram_range=(1,2) extracts both sizes.
count_vectorizer_u_bg = CountVectorizer(ngram_range=(1,2))
x_u_bg = count_vectorizer_u_bg.fit_transform(doc['text'])

In [42]:
# Combined unigram + bigram document-term matrix as a labelled frame.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
DTM_u_bg = pd.DataFrame(x_u_bg.toarray(), columns=count_vectorizer_u_bg.get_feature_names_out())
DTM_u_bg

Unnamed: 0,awesome,good,good interesting,good plain,interesting,interesting awesome,paper,paper interesting,paper really,plain,plain paper,really,really good
0,0,2,1,1,1,0,1,0,1,1,1,1,1
1,1,0,0,0,1,1,1,1,0,0,0,0,0


In [None]:
# Limiting the vocabulary size with max_features

In [45]:
# Instantiate the count vectorizer with a cap on the vocabulary size.
# max_features=3 keeps only the 3 terms with the highest total count across
# the corpus, i.e. the largest column sums of the document-term matrix.
count_vectorizer_max = CountVectorizer(max_features=3)

In [46]:
# fit       -> extracts the features (here limited by max_features)
# transform -> computes the count of every retained feature per document
x_max = count_vectorizer_max.fit_transform(doc['text'])

In [47]:
# Document-term matrix restricted to the max_features vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement.
DTM_max = pd.DataFrame(x_max.toarray(), columns=count_vectorizer_max.get_feature_names_out())
DTM_max

Unnamed: 0,good,interesting,paper
0,2,1,1
1,0,1,1
