In [31]:
import pandas as pd
import numpy as np
import numpy.typing as npt
import nltk
import seaborn as sns
import tqdm
import os

from nltk import word_tokenize
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from nltk.corpus import stopwords

In [3]:
from common_preprocess import *

In [4]:
# Make sure NLTK's 'punkt' data package is installed locally (the call reports
# "already up-to-date" when nothing needs downloading).
# NOTE(review): presumably needed by word_tokenize — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adhocmaster\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the corpus CSV and show the raw text of the first document.
# (A bare `df.head()` in the middle of a cell is evaluated and discarded, so it
# was removed; only the last expression of a cell is displayed.)
df = pd.read_csv("../data/bbc_text_cls.csv")
df.iloc[0]["text"]

'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sig

In [6]:
# Step 1: lemmatize every document, overwriting the raw "text" column in place.
# Judging by the sample shown in the next cell, lemmatize() does more than strip
# whitespace: words are reduced to their lemmas ("sales" -> "sale",
# "jumped" -> "jump", "is" -> "be"), punctuation is space-separated, and
# newlines are collapsed into single spaces.
# NOTE(review): lemmatize presumably comes from the common_preprocess star import — confirm.
# NOTE(review): because the column is overwritten, re-running this cell feeds
# already-lemmatized text back into lemmatize().
df["text"] = lemmatize(df["text"])

In [7]:
# Show the first document after lemmatization.
# (The preceding bare `df.head()` produced no output and was removed.)
df.iloc[0]["text"]

"Ad sale boost Time Warner profit Quarterly profit at US medium giant TimeWarner jump 76 % to $ 1.13bn ( £600m ) for the three month to December , from $ 639m year-earlier . The firm , which be now one of the big investor in Google , benefit from sale of high-speed internet connection and high advert sale . TimeWarner say fourth quarter sale rise 2 % to $ 11.1bn from $ 10.9bn . Its profit be buoy by one-off gain which offset a profit dip at Warner Bros , and less user for AOL . Time Warner say on Friday that it now own 8 % of search-engine Google . But it own internet business , AOL , have have mix fortune . It lose 464,000 subscriber in the fourth quarter profit be low than in the preceding three quarter . However , the company say AOL 's underlying profit before exceptional item rise 8 % on the back of strong internet advertising revenue . It hop to increase subscriber by offer the online service free to TimeWarner internet customer and will try to sign up AOL 's exist customer for h

In [8]:
# Tokenize the first document as a sample of what word_tokenize produces.
words = word_tokenize(df.iloc[0]['text'])
# words  # uncomment to display the token list

In [9]:
# Ten most frequent tokens in the corpus, with NLTK's English stop-word list
# handed to getTopWords.
topWords = getTopWords(word_tokenize, df["text"], maxSize=10, stopWords=stopwords.words('english'))
topWords

say 8829
mr 2994
year 2828
would 2629
make 2209
also 2156
people 2045
new 1980
one 1807
take 1736


['say', 'mr', 'year', 'would', 'make', 'also', 'people', 'new', 'one', 'take']

In [10]:
# Same ranking with no stop-word list, for comparison: function words dominate.
topWords = getTopWords(word_tokenize, df["text"], maxSize=10, stopWords=None)
topWords

the 52599
be 30732
to 24984
a 22363
of 19970
and 18572
in 17626
have 12584
it 10068
for 8918


['the', 'be', 'to', 'a', 'of', 'and', 'in', 'have', 'it', 'for']

In [11]:
# Build the vocabulary lookup (token -> column index) from lower-cased text,
# passing NLTK's English stop-word list.
# NOTE(review): maxSize=5000 presumably caps the vocabulary size — confirm in common_preprocess.
wordToIndex = buildWordToIndex(
    word_tokenize, df["text"], maxSize=5000, stopWords=stopwords.words('english'), lowercase=True
)
# wordToIndex

say 8829
mr 2994
year 2828
would 2629
make 2209
also 2156
people 2045
new 1980
one 1807
take 1736


In [12]:
# Reverse lookup (column index -> token), then show the tokens at indices 0..3.
idxToWord = dict(zip(wordToIndex.values(), wordToIndex.keys()))
print("most frequents:", *(idxToWord[pos] for pos in range(4)))

most frequents: say mr year would


In [13]:
# NOTE(review): tqdm appears to read TQDM_* environment overrides when it is
# imported, so setting this after `import tqdm` may have no effect — confirm,
# and move it above the imports if progress bars still show up.
os.environ["TQDM_DISABLE"]="1"

# Term-frequency matrix: one row per document, one column per vocabulary entry.
tf = getTermFreqMatrix(word_tokenize, df["text"], wordToIndex, lowercase=True)

# Sanity checks: the matrix is not all zeros, and every document contains at
# least one vocabulary token (vectorized instead of a Python loop over rows).
assert tf.sum() > 0
assert (tf.sum(axis=1) > 0).all(), "some documents contain no vocabulary tokens"

# Most frequent vocabulary token of the first document, plus the column index
# assigned to the token "n".
mostFreqFirst = np.argmax(tf[0, :])
print(mostFreqFirst, idxToWord[mostFreqFirst], wordToIndex["n"]) # n is a word!
tf[:5, 3133:3140]


333 profit 3133


array([[0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]])

In [20]:
# Corpus-wide count of the token in column 0.
tf[:, 0].sum()

8829.0

In [21]:
# Number of documents (rows) in the corpus.
len(df)

2225

In [22]:
# Vocabulary-token count of the first document vs. its length in characters.
print(tf[0].sum(), len(df["text"].iloc[0]))

220.0 2513


In [44]:
def getDf(tf: npt.NDArray) -> npt.NDArray:
    """Return the document frequency of every term.

    Args:
        tf: term-frequency matrix; the reduction runs over axis 0, so each
            column is treated as one term.

    Returns:
        For each column, the number of rows whose entry is greater than zero.
    """
    return np.sum(tf > 0, axis=0)

def getIDF(nDocs: int, docF: npt.NDArray) -> npt.NDArray:
    """Return the inverse document frequency ``log(nDocs / docF)`` per term.

    Terms with a document frequency of zero get an IDF of 0 instead of ``inf``.
    Such a term has a zero term-frequency column, and ``0 * inf`` would
    otherwise put NaN into the TF-IDF matrix (with a divide-by-zero warning).

    Args:
        nDocs: total number of documents.
        docF: per-term document frequencies.

    Returns:
        Float array of IDF values with the same shape as ``docF``.
    """
    docF = np.asarray(docF, dtype=float)
    # Where docF == 0 the ratio is left at 1.0, so its log is exactly 0.
    ratio = np.divide(nDocs, docF, out=np.ones_like(docF), where=docF > 0)
    return np.log(ratio)

def getTFIDF(tf: npt.NDArray, idf: npt.NDArray) -> npt.NDArray:
    """Weight a term-frequency matrix by per-term IDF values.

    Args:
        tf: term-frequency matrix.
        idf: IDF values, broadcast against ``tf`` by NumPy's usual rules.

    Returns:
        The element-wise product of ``tf`` and ``idf``.
    """
    # Broadcasting applies the idf vector to every document row.
    return np.multiply(tf, idf)
    

In [45]:
# Document frequency per term -> IDF per term -> TF-IDF matrix.
docF = getDf(tf)
idf = getIDF(len(df), docF)
tfIdf = getTFIDF(tf, idf)

In [41]:
# Document frequencies of the first ten vocabulary columns.
docF[0: 10]

array([1962,  789, 1321, 1156, 1175, 1265,  797,  982, 1042, 1026])

In [42]:
# Corpus-wide occurrence totals of the first ten vocabulary columns.
tf.sum(axis=0)[:10]

array([8829., 2994., 2828., 2629., 2209., 2156., 2045., 1980., 1807.,
       1736.])

In [43]:
# IDF values of the first ten vocabulary columns.
idf[0:10]

array([0.12579255, 1.03674587, 0.52136789, 0.65479115, 0.63848877,
       0.56468479, 1.02665752, 0.81792089, 0.75861497, 0.77408917])

In [46]:
# TF-IDF rows of the first ten documents.
tfIdf[0:10]

array([[0.62896277, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25158511, 1.03674587, 0.52136789, ..., 0.        , 0.        ,
        0.        ],
       [0.50317022, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.37737766, 2.07349175, 1.04273578, ..., 0.        , 0.        ,
        0.        ],
       [0.37737766, 0.        , 1.04273578, ..., 0.        , 0.        ,
        0.        ],
       [0.12579255, 0.        , 0.52136789, ..., 0.        , 0.        ,
        0.        ]])

In [87]:
# Exercise top terms
def topTerms(docIdx: int, tfidf: npt.NDArray, idxToWord: Dict[int, str], topN: int = 5) -> List[str]:
    """Return the highest-scoring terms of one document.

    Args:
        docIdx: row index of the document in ``tfidf``.
        tfidf: score matrix, indexed as ``tfidf[docIdx, :]``.
        idxToWord: mapping from column index to term.
        topN: number of terms to return (defaults to 5).

    Returns:
        Up to ``topN`` terms ordered by descending score. Selection is by rank
        only, so zero-score terms are included when the row has fewer than
        ``topN`` non-zero entries.

    Raises:
        ValueError: if ``topN`` is negative.
    """
    if topN < 0:
        raise ValueError("topN must be non-negative")
    row = tfidf[docIdx, :]
    # Sorting the negated scores yields column indices in descending-score order.
    ranked = (-row).argsort()
    return [idxToWord[i] for i in ranked[:topN]]
    
    

In [93]:
# Top terms of the first two documents according to the hand-built TF-IDF matrix.
print(topTerms(0, tfIdf, idxToWord))
print(topTerms(1, tfIdf, idxToWord))

['aol', 'profit', 'warner', 'revenue', 'restate']
['deficit', 'dollar', 'greenspan', 'currency', 'chinese']


In [94]:
# Cross-check with scikit-learn: fit TfidfVectorizer (with its built-in English
# stop-word list) on the same text column and print the top terms of the same
# two documents.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
# Convert to a dense array so topTerms can slice rows with tfidf[docIdx, :].
tfIdf2 = X.toarray()
# vocabulary_ maps term -> column index; invert it for index -> term lookups.
idxToWord2 = {v: k for k, v in vectorizer.vocabulary_.items()}
print(topTerms(0, tfIdf2, idxToWord2))
print(topTerms(1, tfIdf2, idxToWord2))

['timewarner', 'aol', 'profit', 'warner', 'revenue']
['deficit', 'dollar', 'greenspan', 'currency', 'chinese']


In [95]:
# Exercise: rank terms by raw counts from CountVectorizer instead of TF-IDF weights.
# NOTE(review): this cell rebinds `vectorizer`, `X` and `idxToWord2` from the
# TfidfVectorizer cell, so those names refer to the count-based objects afterwards.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
# Dense document-by-term count matrix, in the layout topTerms indexes.
wordFreq = X.toarray()
# vocabulary_ maps term -> column index; invert it for index -> term lookups.
idxToWord2 = {v: k for k, v in vectorizer.vocabulary_.items()}
print(topTerms(0, wordFreq, idxToWord2))
print(topTerms(1, wordFreq, idxToWord2))


['profit', 'aol', 'timewarner', 'say', 'sale']
['dollar', 'deficit', 'recent', 'month', 'reserve']
