In [65]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [66]:
# Two toy documents used throughout the notebook.
doc1 = (
    "This is a very good and plain paper. this is really "
    "good and interesting"
)
doc2 = "This paper is very interesting, awesome"

In [67]:
def clean_txt(sent):
    """Lower-case, tokenize and filter a sentence into a cleaned string.

    The input is lower-cased and split with ``word_tokenize``. A token is
    dropped when it is an English stopword, a character from
    ``string.punctuation``, or shorter than three characters.

    Parameters
    ----------
    sent : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if no
        token survives).
    """
    tokens = word_tokenize(sent.lower())
    # Use a set for O(1) membership tests; a list would be scanned linearly
    # once per token.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    # Cheap length check first, then the stopword/punctuation lookup.
    final_word = [term for term in tokens
                  if len(term) > 2 and term not in stop_updated]
    return " ".join(final_word)

In [68]:
# Clean the first document and display the result
doc1_clean = clean_txt(doc1)
doc1_clean

'good plain paper really good interesting'

In [69]:
# Clean the second document and display the result
doc2_clean = clean_txt(doc2)
doc2_clean

'paper interesting awesome'

In [70]:
# Collect the cleaned documents in a single-column frame (one row per document)
doc = pd.DataFrame({"text": [doc1_clean, doc2_clean]})
doc

Unnamed: 0,text
0,good plain paper really good interesting
1,paper interesting awesome


In [71]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
# Instantiate the count vectorizer with its default settings
count_vectorizer = CountVectorizer()

In [73]:
# fit -> extracts the features (the bag-of-words vocabulary)
# transform -> computes the counts for every feature extracted
X = count_vectorizer.fit_transform(doc['text'])

In [74]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
X

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [75]:
# Dense ndarray of the counts: one row per document, one column per feature
X.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [76]:
# Same counts, returned as a numpy matrix instead of an ndarray
X.todense()

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [77]:
# Feature (term) names in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (scikit-learn >= 1.0) replaces it.
# .tolist() converts the returned ndarray to a plain Python list for display.
count_vectorizer.get_feature_names_out().tolist()

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [78]:
# Mapping from each term to its column index in the count matrix
print(count_vectorizer.vocabulary_)

{'good': 1, 'plain': 4, 'paper': 3, 'really': 5, 'interesting': 2, 'awesome': 0}


In [79]:
# Document-term matrix: one row per document, one column per vocabulary term.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
DTM = pd.DataFrame(X.toarray(),
                   columns=count_vectorizer.get_feature_names_out())
DTM

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [80]:
# Transpose the document-term matrix to get the term-document matrix
# (terms as rows, documents as columns)
TDM = DTM.T
TDM

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
# Instantiate the TF-IDF vectorizer with its default settings
tfidf_vect = TfidfVectorizer()

In [83]:
# Learn the vocabulary and compute the TF-IDF weights in one step
tfidf = tfidf_vect.fit_transform(doc['text'])

In [84]:
# TF-IDF weights as a DataFrame: one row per document, one column per term.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
df_tfidf = pd.DataFrame(tfidf.toarray(),
                        columns=tfidf_vect.get_feature_names_out())
df_tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,0.755256,0.268685,0.268685,0.377628,0.377628
1,0.704909,0.0,0.501549,0.501549,0.0,0.0


## computing ngrams:

1. unigram - one word at a time
2. bigram - two words at a time
3. trigram - three words

any combination of the above can be created. 

By default, the vectorizers in sklearn (CountVectorizer and TfidfVectorizer) work on unigrams, i.e. ngram_range is set to (1,1).
To extract bigrams, use ngram_range = (2,2).
To extract trigrams, use ngram_range = (3,3).


In [85]:
# Bigrams only: ngram_range=(2,2) makes every feature a pair of adjacent words
count_vectorizer_bg = CountVectorizer(ngram_range=(2,2))
X_bg = count_vectorizer_bg.fit_transform(doc['text'])

In [86]:
# Bigram feature names in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (scikit-learn >= 1.0) replaces it.
count_vectorizer_bg.get_feature_names_out().tolist()

['good interesting',
 'good plain',
 'interesting awesome',
 'paper interesting',
 'paper really',
 'plain paper',
 'really good']

In [87]:
# Document-term matrix over bigrams.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
DTM_bg = pd.DataFrame(X_bg.toarray(),
                      columns=count_vectorizer_bg.get_feature_names_out())
DTM_bg

Unnamed: 0,good interesting,good plain,interesting awesome,paper interesting,paper really,plain paper,really good
0,1,1,0,0,1,1,1
1,0,0,1,1,0,0,0


In [88]:
# Trigrams only: ngram_range=(3,3) makes every feature three adjacent words
count_vectorizer_tg = CountVectorizer(ngram_range=(3,3))
X_tg = count_vectorizer_tg.fit_transform(doc['text'])

In [89]:
# Trigram feature names in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (scikit-learn >= 1.0) replaces it.
count_vectorizer_tg.get_feature_names_out().tolist()

['good plain paper',
 'paper interesting awesome',
 'paper really good',
 'plain paper really',
 'really good interesting']

In [90]:
# Document-term matrix over trigrams.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
DTM_tg = pd.DataFrame(X_tg.toarray(),
                      columns=count_vectorizer_tg.get_feature_names_out())
DTM_tg

Unnamed: 0,good plain paper,paper interesting awesome,paper really good,plain paper really,really good interesting
0,1,0,1,1,1
1,0,1,0,0,0


In [91]:
# Unigrams and bigrams together: ngram_range=(1,2)
count_vectorizer_u_bg = CountVectorizer(ngram_range=(1,2))
X_u_bg = count_vectorizer_u_bg.fit_transform(doc['text'])

In [92]:
# Unigram + bigram feature names in column order. get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() (>= 1.0) replaces it.
count_vectorizer_u_bg.get_feature_names_out().tolist()

['awesome',
 'good',
 'good interesting',
 'good plain',
 'interesting',
 'interesting awesome',
 'paper',
 'paper interesting',
 'paper really',
 'plain',
 'plain paper',
 'really',
 'really good']

In [93]:
# Document-term matrix over unigrams and bigrams.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
DTM_u_bg = pd.DataFrame(X_u_bg.toarray(),
                        columns=count_vectorizer_u_bg.get_feature_names_out())
DTM_u_bg

Unnamed: 0,awesome,good,good interesting,good plain,interesting,interesting awesome,paper,paper interesting,paper really,plain,plain paper,really,really good
0,0,2,1,1,1,0,1,0,1,1,1,1,1
1,1,0,0,0,1,1,1,1,0,0,0,0,0


In [94]:
# Instantiate the count vectorizer with max_features=3: conceptually, take the
# column sums of the DTM and keep only the 3 terms with the highest totals
count_vectorizer_max = CountVectorizer(max_features=3)

In [95]:
# fit -> extracts the features (the bag-of-words vocabulary)
# transform -> computes the counts for every feature extracted
X_max = count_vectorizer_max.fit_transform(doc['text'])

In [96]:
# Document-term matrix restricted to the max_features terms.
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
DTM_max = pd.DataFrame(X_max.toarray(),
                       columns=count_vectorizer_max.get_feature_names_out())
DTM_max

Unnamed: 0,good,interesting,paper
0,2,1,1
1,0,1,1
