# TF-IDF

Three ways to compute TF-IDF vectors for the same small corpus: NLTK, scikit-learn, and Gensim.

## With NLTK

In [33]:
from nltk.text import TextCollection
import nltk
import string

def tokenize(text):
    """
    Lowercase, tokenize and stem ``text``, skipping punctuation tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Yields
    ------
    str
        The English Snowball stem of each non-punctuation token, in the
        order the tokens appear in ``text``.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # Skip tokens made up solely of punctuation characters. A plain
        # `token in string.punctuation` is a substring test, so it only
        # catches single characters (and accidental runs such as "()"):
        # multi-character tokens like "...", "--", "``" or "''" would slip
        # through and end up in the vocabulary.
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)


def vectorize(corpus):
    """
    Yield one ``{term: tf-idf score}`` mapping per document in ``corpus``.

    Every document is tokenized with ``tokenize`` first; the tokenized
    documents are wrapped in a ``TextCollection`` that supplies the scores.
    Mappings are yielded in the same order as the input documents.
    """
    tokenized_docs = []
    for raw_doc in corpus:
        tokenized_docs.append(list(tokenize(raw_doc)))

    collection = TextCollection(tokenized_docs)

    for tokens in tokenized_docs:
        scores = {}
        for term in tokens:
            # A repeated term simply overwrites its own entry with the same score.
            scores[term] = collection.tf_idf(term, tokens)
        yield scores

# Toy corpus of three raw-string documents used by the sections below.
corpus = [
    'The elephant sneezed at the sight of potatoes.',
    'Bats can see via echolocation. See the bat sight sneeze!',
    'Wondering, she opened the door to the studio.',
]

# vectorize() is a generator, so materialize it into a list before displaying.
vectors = [doc_scores for doc_scores in vectorize(corpus)]
vectors

[{'the': 0.0,
  'eleph': 0.13732653608351372,
  'sneez': 0.05068313851352055,
  'at': 0.13732653608351372,
  'sight': 0.05068313851352055,
  'of': 0.13732653608351372,
  'potato': 0.13732653608351372},
 {'bat': 0.21972245773362198,
  'can': 0.10986122886681099,
  'see': 0.21972245773362198,
  'via': 0.10986122886681099,
  'echoloc': 0.10986122886681099,
  'the': 0.0,
  'sight': 0.04054651081081644,
  'sneez': 0.04054651081081644},
 {'wonder': 0.13732653608351372,
  'she': 0.13732653608351372,
  'open': 0.13732653608351372,
  'the': 0.0,
  'door': 0.13732653608351372,
  'to': 0.13732653608351372,
  'studio': 0.13732653608351372}]

## With Scikit-Learn

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TfidfVectorizer with default settings on `corpus` and transform it in
# the same call. `corpus` is expected to be the list of raw strings here.
# NOTE(review): unlike the NLTK section, this cell does not call tokenize(),
# so its terms are not stemmed by that helper — confirm before comparing scores.
tfidf = TfidfVectorizer()
transformed_corpus = tfidf.fit_transform(corpus)
# Convert to a dense array for display; acceptable for a three-document corpus.
transformed_corpus.toarray()

array([[0.37867627, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37867627, 0.37867627, 0.        , 0.37867627,
        0.        , 0.        , 0.28799306, 0.        , 0.37867627,
        0.        , 0.44730461, 0.        , 0.        , 0.        ],
       [0.        , 0.30251368, 0.30251368, 0.30251368, 0.        ,
        0.30251368, 0.        , 0.        , 0.        , 0.        ,
        0.60502736, 0.        , 0.23006945, 0.30251368, 0.        ,
        0.        , 0.17866945, 0.        , 0.30251368, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.36772387,
        0.        , 0.        , 0.        , 0.36772387, 0.        ,
        0.        , 0.36772387, 0.        , 0.        , 0.        ,
        0.36772387, 0.43436728, 0.36772387, 0.        , 0.36772387]])

## With Gensim

In [29]:
import gensim

# Tokenize into a new name instead of overwriting `corpus`. Rebinding `corpus`
# to token lists made this cell non-idempotent (a second run would call
# tokenize() on lists) and broke any later re-run of cells that expect the raw
# strings.
tokenized_corpus = [list(tokenize(doc)) for doc in corpus]

# Build the token -> id mapping, then a TF-IDF model from that dictionary.
lexicon = gensim.corpora.Dictionary(tokenized_corpus)
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)

# Each document becomes a list of (token id, weight) pairs.
# `lexicon` and `tfidf` keep their names because the save/load cells use them.
vectors = [tfidf[lexicon.doc2bow(doc)] for doc in tokenized_corpus]
vectors

[[(0, 0.4837965208957426),
  (1, 0.4837965208957426),
  (2, 0.4837965208957426),
  (3, 0.4837965208957426),
  (4, 0.17855490118826325),
  (5, 0.17855490118826325)],
 [(4, 0.10992597952954358),
  (5, 0.10992597952954358),
  (7, 0.5956913654963344),
  (8, 0.2978456827481672),
  (9, 0.2978456827481672),
  (10, 0.5956913654963344),
  (11, 0.2978456827481672)],
 [(12, 0.408248290463863),
  (13, 0.408248290463863),
  (14, 0.408248290463863),
  (15, 0.408248290463863),
  (16, 0.408248290463863),
  (17, 0.408248290463863)]]

In [31]:
# Persist the gensim artifacts (paths are relative to the working directory)
# so they can be reloaded without refitting. The lexicon is written as text
# with sort_by_word=True; the TF-IDF model is saved to 'tfidf.pkl'.
lexicon.save_as_text('lexicon.txt', sort_by_word=True)
tfidf.save('tfidf.pkl')

In [32]:
# Reload both artifacts from 'lexicon.txt' and 'tfidf.pkl', rebinding
# `lexicon` and `tfidf` to the loaded objects.
# NOTE(review): the '.pkl' name suggests pickle-based serialization — only
# load model files that come from a trusted source.
lexicon = gensim.corpora.Dictionary.load_from_text('lexicon.txt')
tfidf = gensim.models.TfidfModel.load('tfidf.pkl')