In [97]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [98]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
df = pd.read_csv("data/bbc_text_cls.csv")

In [100]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [101]:
#word to index
#convert documents into sequences of ints / ids / indices
index = 0
word_to_index = {}
tokenized_docs = []
for doc in df['text']:
    words = word_tokenize(doc.lower())
    doc_as_ints = []
    for word in words:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1
            
            
        #save
        doc_as_ints.append(word_to_index[word])
    tokenized_docs.append(doc_as_ints)
    

In [102]:
#reverse mapping
index_to_word = {v:k for k,v in word_to_index.items()} #dict comprehension



In [103]:
index_to_word

{0: 'ad',
 1: 'sales',
 2: 'boost',
 3: 'time',
 4: 'warner',
 5: 'profit',
 6: 'quarterly',
 7: 'profits',
 8: 'at',
 9: 'us',
 10: 'media',
 11: 'giant',
 12: 'timewarner',
 13: 'jumped',
 14: '76',
 15: '%',
 16: 'to',
 17: '$',
 18: '1.13bn',
 19: '(',
 20: '£600m',
 21: ')',
 22: 'for',
 23: 'the',
 24: 'three',
 25: 'months',
 26: 'december',
 27: ',',
 28: 'from',
 29: '639m',
 30: 'year-earlier',
 31: '.',
 32: 'firm',
 33: 'which',
 34: 'is',
 35: 'now',
 36: 'one',
 37: 'of',
 38: 'biggest',
 39: 'investors',
 40: 'in',
 41: 'google',
 42: 'benefited',
 43: 'high-speed',
 44: 'internet',
 45: 'connections',
 46: 'and',
 47: 'higher',
 48: 'advert',
 49: 'said',
 50: 'fourth',
 51: 'quarter',
 52: 'rose',
 53: '2',
 54: '11.1bn',
 55: '10.9bn',
 56: 'its',
 57: 'were',
 58: 'buoyed',
 59: 'by',
 60: 'one-off',
 61: 'gains',
 62: 'offset',
 63: 'a',
 64: 'dip',
 65: 'bros',
 66: 'less',
 67: 'users',
 68: 'aol',
 69: 'on',
 70: 'friday',
 71: 'that',
 72: 'it',
 73: 'owns',
 74

In [104]:
#number of documents
N = len(df["text"])

In [105]:
#number of words 
V = len(word_to_index)

In [106]:
#instantiate tf matrix
tf = np.zeros((N,V))

In [107]:
#populate term frequency counts
for i, doc_as_ints in enumerate(tokenized_docs):
    for j in doc_as_ints:
        tf[i,j] += 1

In [108]:
#compute idf
doc_freq = np.sum(tf>0, axis=0) #document frequency shape (V,)
idf = np.log(N/doc_freq)

In [109]:
#compute tf-idf
tf_idf = tf * idf

In [110]:
np.random.seed(4)

In [111]:
#pick a random document, show the top 5 terms (tf-idf score)
i = np.random.choice(N)
row = df.iloc[i]
print("Label: ", row["labels"])
print("Text: ", row["text"].split("\n", 1)[0])
print("Top 5 terms: ")

scores = tf_idf[i]
indices = (-scores).argsort() #ordering of scores

for j in indices[:5]:
    print(index_to_word[j], scores[j])

Label:  politics
Text:  Brown and Blair face new rift claims
Top 5 terms: 
brown 20.098612013542038
newspapers 17.95454547892856
aid 16.818533863733283
prime 15.395091828586086
irritation 14.02873002808079


In [112]:
#Exercise: use CountVectorizer to form the counts instead

#exercise (hard): use Scipy's csr_matrix instead
#You cannot use X[i, j] += 1 here

In [116]:
#CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

df["str_labels"] = df["labels"].astype(str)

tfidf_matrix = CountVectorizer().fit_transform(df["text"])

scores = tfidf_matrix.toarray()
    


In [119]:

for i in range(5):
    print("Label: ", df["str_labels"][i])
    print("Text: ", df["text"][i].split("\n", 1)[0])
    print("Top 5 terms: ")

    indices = (-scores[i]).argsort() #ordering of scores

    for j in indices[:5]:
        print(index_to_word[j], scores[i][j])


Label:  business
Text:  Ad sales boost Time Warner profit
Top 5 terms: 
five-second 19
adventurous 18
siblings 12
glory 11
know 10
Label:  business
Text:  Dollar gains on Greenspan speech
Top 5 terms: 
five-second 32
mcdonagh 12
siblings 11
adventurous 9
128-year 8
Label:  business
Text:  Yukos unit buyer faces loan claim
Top 5 terms: 
five-second 16
adventurous 13
siblings 9
blanket 7
klerck 6
Label:  business
Text:  High fuel prices hit BA's profits
Top 5 terms: 
adventurous 14
five-second 13
hiring 11
siblings 11
mcdonagh 10
Label:  business
Text:  Pernod takeover talk lifts Domecq
Top 5 terms: 
five-second 12
cinemagoers 9
siblings 9
adventurous 7
mcdonagh 6


In [148]:
#use scipy's csr_matrix
from scipy.sparse import csr_matrix

tfidf_matrix = csr_matrix((N,V))

for i, doc_as_ints in enumerate(tokenized_docs):
    for j in doc_as_ints:
        tfidf_matrix[i,j] += 1
        
scores = tfidf_matrix.toarray()

for i in range(5):
    print("Label: ", df["str_labels"][i])
    print("Text: ", df["text"][i].split("\n", 1)[0])
    print("Top 5 terms: ")

    indices = (-scores[i]).argsort() #ordering of scores

    for j in indices[:5]:
        print(index_to_word[j], scores[i][j])

