#### Third Session: BoW vs TF-IDF

In [43]:
from nltk import sent_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [11]:
text = """A major drawback of statistical methods is that they require elaborate feature engineering. 
Since the early 2010s, the field has thus largely abandoned statistical methods and shifted to neural networks for machine learning. 
Popular techniques include the use of word embeddings to capture semantic properties of words, and an increase in end-to-end learning of a higher-level task (e.g., question answering) instead of relying on a pipeline of separate intermediate tasks (e.g., part-of-speech tagging and dependency parsing).
In some areas, this shift has entailed substantial changes in how NLP systems are designed, such that deep neural network-based approaches may be viewed as a new paradigm distinct from statistical natural language processing. 
For instance, the term neural machine translation (NMT) emphasizes the fact that deep learning-based approaches to machine translation directly learn sequence-to-sequence transformations, obviating the need for intermediate steps such as word alignment and language modeling that was used in statistical machine translation (SMT). 
Latest works tend to use non-technical structure of a given task to build proper neural network
"""

##### Cleaning Text

In [12]:
sents = sent_tokenize(text)

In [13]:
sents

['A major drawback of statistical methods is that they require elaborate feature engineering.',
 'Since the early 2010s, the field has thus largely abandoned statistical methods and shifted to neural networks for machine learning.',
 'Popular techniques include the use of word embeddings to capture semantic properties of words, and an increase in end-to-end learning of a higher-level task (e.g., question answering) instead of relying on a pipeline of separate intermediate tasks (e.g., part-of-speech tagging and dependency parsing).',
 'In some areas, this shift has entailed substantial changes in how NLP systems are designed, such that deep neural network-based approaches may be viewed as a new paradigm distinct from statistical natural language processing.',
 'For instance, the term neural machine translation (NMT) emphasizes the fact that deep learning-based approaches to machine translation directly learn sequence-to-sequence transformations, obviating the need for intermediate step

In [21]:
stemmer = PorterStemmer()

In [28]:
corpus = []

In [29]:
for sent in sents:
    review = re.sub('[^a-zA_Z]', " ", sent)
    review = re.sub('\b[a-zA-Z]\b', " ", sent)
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)
    print(review)
    corpus.append(review)
    print('\n')

major drawback statist method requir elabor featur engineering.


sinc earli 2010s, field thu larg abandon statist method shift neural network machin learning.


popular techniqu includ use word embed captur semant properti words, increas end-to-end learn higher-level task (e.g., question answering) instead reli pipelin separ intermedi task (e.g., part-of-speech tag depend parsing).


areas, shift entail substanti chang nlp system designed, deep neural network-bas approach may view new paradigm distinct statist natur languag processing.


instance, term neural machin translat (nmt) emphas fact deep learning-bas approach machin translat directli learn sequence-to-sequ transformations, obviat need intermedi step word align languag model use statist machin translat (smt).


latest work tend use non-techn structur given task build proper neural network




In [30]:
corpus

['major drawback statist method requir elabor featur engineering.',
 'sinc earli 2010s, field thu larg abandon statist method shift neural network machin learning.',
 'popular techniqu includ use word embed captur semant properti words, increas end-to-end learn higher-level task (e.g., question answering) instead reli pipelin separ intermedi task (e.g., part-of-speech tag depend parsing).',
 'areas, shift entail substanti chang nlp system designed, deep neural network-bas approach may view new paradigm distinct statist natur languag processing.',
 'instance, term neural machin translat (nmt) emphas fact deep learning-bas approach machin translat directli learn sequence-to-sequ transformations, obviat need intermedi step word align languag model use statist machin translat (smt).',
 'latest work tend use non-techn structur given task build proper neural network']

##### Vectorization

In [35]:
cv = CountVectorizer()

In [37]:
bow = cv.fit_transform(corpus)

In [51]:
print(bow)

  (0, 40)	1
  (0, 15)	1
  (0, 73)	1
  (0, 42)	1
  (0, 64)	1
  (0, 17)	1
  (0, 24)	1
  (0, 21)	1
  (1, 73)	1
  (1, 42)	1
  (1, 70)	1
  (1, 16)	1
  (1, 0)	1
  (1, 25)	1
  (1, 84)	1
  (1, 34)	1
  (1, 1)	1
  (1, 69)	1
  (1, 47)	1
  (1, 46)	1
  (1, 39)	1
  (1, 37)	1
  (2, 58)	1
  (2, 81)	1
  (2, 28)	1
  :	:
  (4, 19)	1
  (4, 23)	1
  (4, 13)	1
  (4, 68)	1
  (4, 67)	1
  (4, 86)	1
  (4, 52)	1
  (4, 45)	1
  (4, 74)	1
  (4, 2)	1
  (4, 43)	1
  (4, 71)	1
  (5, 47)	1
  (5, 46)	1
  (5, 88)	1
  (5, 79)	1
  (5, 35)	1
  (5, 92)	1
  (5, 82)	1
  (5, 51)	1
  (5, 80)	1
  (5, 75)	1
  (5, 26)	1
  (5, 7)	1
  (5, 60)	1


In [52]:
tf = TfidfVectorizer()

In [53]:
tfidf = tf.fit_transform(corpus).toarray()

In [54]:
print(tfidf)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.37730799 0.         0.37730799
  0.         0.         0.         0.37730799 0.         0.
  0.37730799 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37730799 0.
  0.30939795 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37730799 0.
  0.         0.         0.         0.         0.         0.
  0.         0.22384142 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.30326846 0.30326846 0.         0.         0.        