- Reference tutorial: https://stackabuse.com/python-for-nlp-creating-tf-idf-model-from-scratch/

# TF-IDF Model from Scratch in Python

In [1]:
import nltk
import numpy as np
import random
import string

import bs4 as bs
import urllib.request
import re

# Download the Wikipedia article. The context manager closes the HTTP
# response as soon as the body has been read, instead of leaving the
# connection open until the object happens to be garbage-collected.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    raw_html = response.read()

# Parse the page with the lxml backend and collect every <p> element.
article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

# Concatenate the paragraph texts in document order, with no separator.
# str.join builds the string in one pass rather than re-copying it on
# every `+=`.
article_text = ''.join(para.text for para in article_paragraphs)

# Split the article into sentences, then normalise each one:
#   1. lower-case it,
#   2. replace every non-word character with a space,
#   3. collapse each run of whitespace into a single space.
corpus = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sentence.lower()))
    for sentence in nltk.sent_tokenize(article_text)
]

# Count how many times each token occurs across the whole corpus.
wordfreq = {}
for sentence in corpus:
    for token in nltk.word_tokenize(sentence):
        # .get supplies 0 the first time a token is seen.
        wordfreq[token] = wordfreq.get(token, 0) + 1

import heapq
# The vocabulary is the 200 tokens with the highest counts, ordered from
# most to least frequent.
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

In [2]:
# Inverse document frequency of every vocabulary word:
#     idf(t) = ln(N / (1 + df(t)))
# where N is the number of sentences in the corpus and df(t) is the number
# of sentences that contain t. With this smoothing the value is negative
# for a word that occurs in every sentence.

# Tokenise each sentence once up front rather than once per
# (word, sentence) pair; a set also makes the membership test O(1).
doc_token_sets = [set(nltk.word_tokenize(document)) for document in corpus]

word_idf_values = {}
for token in most_freq:
    # Number of sentences in which the word appears at least once.
    doc_containing_word = sum(token in doc_tokens for doc_tokens in doc_token_sets)
    word_idf_values[token] = np.log(len(corpus)/(1 + doc_containing_word))

In [3]:
# Term frequency of every vocabulary word in every sentence:
#     tf(t, d) = (occurrences of t in d) / (number of tokens in d)
# word_tf_values[t] is a list with one entry per sentence, in corpus order.

# Tokenise each sentence once rather than twice per (word, sentence) pair.
tokenized_corpus = [nltk.word_tokenize(document) for document in corpus]

word_tf_values = {}
for token in most_freq:
    sent_tf_vector = []
    for doc_tokens in tokenized_corpus:
        # A sentence can tokenise to an empty list (e.g. one that held only
        # punctuation before cleaning); give it a term frequency of 0
        # instead of raising ZeroDivisionError.
        if doc_tokens:
            word_tf = doc_tokens.count(token)/len(doc_tokens)
        else:
            word_tf = 0.0
        sent_tf_vector.append(word_tf)
    word_tf_values[token] = sent_tf_vector

In [4]:
# Scale each word's per-sentence TF values by that word's IDF.
# The result has one inner list per word (in word_tf_values order), and each
# inner list has one TF-IDF score per sentence.
tfidf_values = [
    [tf_sentence * word_idf_values[token] for tf_sentence in tf_vector]
    for token, tf_vector in word_tf_values.items()
]

In [5]:
# Turn the nested lists into a NumPy array: one row per word, one column per sentence.
tf_idf_model = np.array(tfidf_values)

In [6]:
# Inspect the matrix: each row is a word, each column a sentence.
tf_idf_model

array([[0.00678699, 0.        , 0.02941028, ..., 0.00945331, 0.0149826 ,
        0.02941028],
       [0.01515201, 0.        , 0.        , ..., 0.        , 0.02787398,
        0.02462202],
       [0.01718353, 0.        , 0.        , ..., 0.07180261, 0.01264448,
        0.01861549],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [7]:
# Re-orient the matrix so that each row is a sentence and each column a word.
# It is rebuilt from tfidf_values rather than transposing tf_idf_model onto
# itself, which keeps this cell idempotent: running it a second time cannot
# flip the matrix back to the words-by-sentences layout.
tf_idf_model = np.asarray(tfidf_values).T

In [8]:
# Inspect the transposed matrix: each row is now a sentence, each column a word.
tf_idf_model

array([[0.00678699, 0.01515201, 0.01718353, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02941028, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00945331, 0.        , 0.07180261, ..., 0.        , 0.        ,
        0.        ],
       [0.0149826 , 0.02787398, 0.01264448, ..., 0.        , 0.        ,
        0.        ],
       [0.02941028, 0.02462202, 0.01861549, ..., 0.        , 0.        ,
        0.        ]])

In [9]:
import pandas as pd

# Show the model as a table whose columns are labelled with the vocabulary
# words; each row holds the scores for one sentence.
pd.DataFrame(data=tf_idf_model, columns=most_freq)

Unnamed: 0,the,of,a,to,in,language,is,and,natural,processing,...,amounts,challenges,frequently,involve,roots,1950s,already,1950,alan,published
0,0.006787,0.015152,0.017184,0.027812,0.016044,0.055129,0.022329,0.062828,0.050698,0.025349,...,0.078668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.039107,0.134377,0.0,0.051048,0.185365,0.061788,...,0.0,0.191753,0.191753,0.191753,0.0,0.0,0.0,0.0,0.0,0.0
2,0.02941,0.0,0.0,0.0,0.069523,0.079631,0.0,0.0,0.109846,0.109846,...,0.0,0.0,0.0,0.0,0.340895,0.340895,0.0,0.0,0.0,0.0
3,0.015271,0.011364,0.038663,0.0,0.012033,0.013782,0.016747,0.031414,0.019012,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.059001,0.059001,0.059001,0.059001
4,0.01557,0.011587,0.026281,0.010634,0.0,0.014053,0.03415,0.016015,0.019385,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.014705,0.016415,0.0,0.030129,0.0,0.039815,0.0,0.0,0.054923,0.054923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.022058,0.012311,0.027923,0.0,0.052142,0.059723,0.0,0.0,0.041192,0.082384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.027382,0.030565,0.0,0.018701,0.021576,0.0,0.0,0.028164,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.026469,0.014773,0.0,0.027116,0.0,0.035834,0.0,0.0,0.0,0.049431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.009127,0.010188,0.011554,0.00935,0.05394,0.03707,0.0,0.028164,0.03409,0.017045,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
