## TF-IDF from Scratch
Using the `bbc_text_cls.csv` dataset. Download it with:
`!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv`

### Hypothesis:
- Words that appear often in a document are important to that document: high term frequency (TF).
- Words that are relatively unique compared to the whole data set are more informative: low document frequency, i.e. high inverse document frequency (IDF).

In [1]:
# Imports

import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [None]:
# Data for nltk tokenizer
# word_tokenize relies on the Punkt tokenizer models. Older NLTK releases load
# them from the 'punkt' package, while recent releases look for 'punkt_tab',
# so fetch both to keep the notebook runnable on either. nltk.download skips
# a package that is already present and up to date.

for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [8]:
# Get dataframe
# Read the BBC news CSV (expected in the working directory) and preview the
# first rows. The file's leading unnamed column is loaded as 'Unnamed: 0';
# the cells below only use the 'text' and 'labels' columns.

df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [22]:
# Build the vocabulary and encode every document as a list of word ids
# word2idx maps each distinct lower-cased token to an integer id, handed out in
# order of first appearance; tokenized_docs[k] is document k as a list of ids.

word2idx = {}
tokenized_docs = []

for document in df['text']:
    tokens = word_tokenize(document.lower())
    for token in tokens:
        # len(word2idx) is the next unused id; setdefault leaves known tokens untouched
        word2idx.setdefault(token, len(word2idx))
    tokenized_docs.append([word2idx[token] for token in tokens])

# Next free id, which equals the vocabulary size
current_idx = len(word2idx)


In [11]:
# Reverse Mapping
# idx2word[i] is the token whose id is i. Each token is written to the slot
# named by its stored id (ids run from 0 to len(word2idx) - 1), so the result
# does not depend on the iteration order of word2idx.
idx2word = [None] * len(word2idx)
for word, idx in word2idx.items():
    idx2word[idx] = word


In [18]:
# Number of documents
# Each row of the dataframe holds one document
N = df.shape[0]
N

2225

In [19]:
# Number of words
# V is the vocabulary size: the count of distinct tokens registered in word2idx
V = len(word2idx)
V

34762

In [None]:
# Create term-frequency matrix
# From scratch: one row per document and one column per vocabulary word,
# with every count starting at zero

tf = np.zeros(shape=(N, V), dtype=np.float64)
tf

In [None]:
# Populate term-frequency TF matrix
# tf[i, j] ends up as the number of times word j occurs in document i.

# The loop below accumulates in place, so clear any counts left over from a
# previous run of this cell; otherwise re-running it would double every count.
tf.fill(0)

for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

tf

In [30]:
# Compute IDF
# Document frequency: for each word, the number of documents with a positive count
document_freq = (tf > 0).sum(axis=0)  # shape = (V,)
# Inverse document frequency: the fewer documents contain a word, the larger its weight
idf = np.log(np.divide(N, document_freq))

In [35]:
# compute TF-IDF
# idf holds one weight per word and broadcasts across the rows of tf, so each
# word count is scaled element-wise by that word's weight

tf_idf = np.multiply(tf, idf)

In [36]:
# Seed NumPy's global random generator so the np.random.choice draw made
# after this point is reproducible
np.random.seed(123)

In [37]:
# Pick a random document, show the top 5 terms (in terms of tf_idf score)

i = np.random.choice(N)
row = df.iloc[i]

print('Label: ', row['labels'])
# Only the first line of the article (its headline) is shown
print('Text: ', row['text'].split("\n", 1)[0])
print('Top 5 terms: ')

# argsort returns indices rather than scores; negating the scores first makes
# the ascending sort list the highest-scoring word ids first
scores = tf_idf[i]
top_word_ids = np.argsort(-scores)[:5]

for word_id in top_word_ids:
    print(idx2word[word_id])


Label:  sport
Text:  Athens memories soar above lows
Top 5 terms: 
paula
athens
1500m
her
kelly


In [38]:
# Exercise: use CountVectorizer to form the counts instead
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is given the raw article text and builds its own vocabulary,
# so the number of columns need not match V from the word_tokenize-based cells.
vectorizer = CountVectorizer()
# NOTE: this rebinds `tf`, replacing the dense NumPy count matrix built earlier
# with the vectorizer's output; earlier cells that read `tf` will see this
# object instead if they are re-run afterwards.
tf = vectorizer.fit_transform(df['text'])




<2225x29421 sparse matrix of type '<class 'numpy.int64'>'
	with 449254 stored elements in Compressed Sparse Row format>

In [None]:
# Exercise: use SciPy's csr_matrix instead