# Text vectorization - TF-IDF

## Open files

In [1]:
import pandas as pd

In [2]:
# Load the BBC news dataset from a public CSV: one row per article,
# with a `category` label and the raw article `text` (see the preview below).
df = pd.read_csv('https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv')

In [3]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# Number of articles per category (checks how balanced the classes are)
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

# Preprocess

In [5]:
from nautilus_nlp.preprocessing.tokenizer import tokenize
from nautilus_nlp.preprocessing.lemmatization import lemmatize_english_tokens

[nltk_data] Downloading package punkt to /Users/hugo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hugo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hugo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
%%time
df['tokens'] = df.text.apply(lambda row: tokenize(row))
df['tokens'] = df.tokens.apply(lambda row: lemmatize_english_tokens(row))
df['tokens'] = df.tokens.apply(lambda row: ' '.join(row))

CPU times: user 3min 16s, sys: 15.2 s, total: 3min 31s
Wall time: 3min 50s


# Split dataframe

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every TF-IDF figure computed from it below) reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Compute TF-IDF

In [63]:
from nautilus_nlp.preprocessing.text_vectorizers import TfidfTextVectorizer

In [64]:
# Build the vectorizer with the class imported in the previous cell
# (`TfidfTextVectorizer`; the bare name `Tfidf` is never imported and raises
# NameError on a fresh kernel). The keyword arguments end up on the underlying
# scikit-learn TfidfVectorizer, as its repr in the next cell shows.
tfidf = TfidfTextVectorizer(
    max_df=0.95,         # ignore terms that have a document frequency strictly higher than 95%
    min_df=0.01,         # ignore terms that have a document frequency strictly lower than 1%
    max_features=10000,  # keep at most the 10,000 most frequent terms
    encoding='utf-8',
    norm=None,           # no row normalisation: keep raw TF-IDF weights
    # Optional settings, disabled here:
    # stop_words=SW,
    # ngram_range=(1, 2),
)

In [65]:
# The wrapper uses scikit-learn's TfidfVectorizer, exposed as `tfidf_vectorizer`.
# You can pass all the arguments supported by scikit-learn.
# Doc: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf.tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=10000, min_df=0.01,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [66]:
# Training corpus as a plain Python list of preprocessed (lemmatized) strings
list_of_docs = train['tokens'].tolist()

In [67]:
# Inspect the first two preprocessed training documents
list_of_docs[:2]

['coach ranieri sack by valencia claudio ranieri have be sack as valencia coach just eight month after take charge at the primera liga club for the second time in -PRON- career .    the decision be take at a board meeting follow the side s surprise elimination from the uefa cup .    -PRON- understand    and -PRON- understand    that the result in the last few week have not be the most appropriate     say club president juan bautista . former assistant antonio lopez will take over as the new coach . italian ranieri take over the valencia job in june 2004 have be replace at chelsea by jose mourinho .    thing begin well but the spanish champion extend -PRON- winless streak to six after lose to race santander last weekend . that defeat be then follow by a uefa cup exit at the hand of steaua bucharest . ranieri first take charge of valencia in 1997    guide -PRON- to the king s cup and help -PRON- to qualify for the champion league . the 54-year - old then move to atletico madrid in 1999  

In [68]:
# Compute the TF-IDF matrix of the training documents.
# The call returns a sparse matrix (see the output below).
# NOTE(review): presumably this also fits the vocabulary / IDF weights that the
# later cells reuse on unseen documents; confirm against nautilus_nlp.
tfidf.compute_tfidf(list_of_docs)

<1780x2631 sparse matrix of type '<class 'numpy.float64'>'
	with 253961 stored elements in Compressed Sparse Row format>

In [69]:
# Get the highest-weighted words for the training corpus, returned as a
# {term: weight} mapping (n=100 presumably caps the number of terms; confirm).
tfidf.get_top_tfidf(n=100)

{'song': 324.615,
 'wage': 316.535,
 'roddick': 293.646,
 'minimum': 249.423,
 'music': 73.292,
 'zealand': 158.674,
 'robbie': 157.772,
 'terrorist': 153.899,
 'good': 150.616,
 'gadget': 110.235,
 'game': 72.458,
 'party': 129.043,
 'increase': 126.747,
 '25': 124.598,
 'hunt': 91.696,
 'pay': 119.675,
 'muslim': 112.338,
 'student': 111.842,
 'threat': 104.862,
 'black': 74.54,
 'liverpool': 102.93,
 'child': 100.285,
 'airline': 97.48,
 'stone': 96.591,
 'mini': 84.296,
 'jamie': 94.213,
 'play': 93.087,
 'virus': 86.885,
 'film': 91.146,
 'hour': 91.019,
 'spam': 90.217,
 'halo': 88.648,
 'mobile': 88.555,
 'edward': 88.215,
 'lord': 87.918,
 'search': 69.985,
 'wale': 86.423,
 'yuko': 75.514,
 'government': 85.639,
 'serve': 82.564,
 'machine': 82.494,
 'pension': 80.303,
 'asylum': 80.242,
 'actress': 79.333,
 'that': 78.981,
 'online': 78.916,
 'council': 77.999,
 'cash': 77.595,
 'fraud': 77.403,
 'musician': 77.291,
 'actor': 76.803,
 'gaming': 76.749,
 'lee': 76.256,
 'site'

In [70]:
# Print the 10 highest-weighted words of each of the first 10 training documents
for document in list_of_docs[:10]:
    top_terms = tfidf.get_top_tfidf_per_doc(document, n=10)
    print(top_terms)

['cup', 'coach', 'sack', 'chelsea', 'take', 'understand', 'champion', 'club', 'charge', 'qualify']
['fox', 'audience', 'show', 'rating', 'episode', 'reality', 'series', 'ms', 'new', 'season']
['member', 'share', 'qualify', 'profit', '42', 'payment', 'society', 'than', 'building', 'worth']
['inflation', 'rate', 'rise', 'move', 'growth', 'flexibility', 'percentage', 'borrowing', 'economic', 'interest']
['profit', 'drug', 'treatment', 'disappointing', '2005', 'sale', '5bn', 'development', 'fall', 'year']
['woman', 'ministry', 'employ', 'prince', 'foreign', 'say', 'minister', 'news', 'difficulty', 'acquire']
['brown', 'labour', 'mr', 'blair', 'election', 'outline', 'manifesto', 'role', 'article', 'writing']
['iraq', 'bank', 'alexander', 'foreign', 'economy', 'many', 'work', 'mr', 'private', 'people']
['win', 'liverpool', 'trophy', 'steven', 'think', 'league', 'draw', 'champion', 'only', 'side']
['rating', 'bbc', 'prove', 'network', 'series', 'lose', 'slot', 'comeback', 'desperate', 'report

# Apply TF-IDF to new documents

In [71]:
# Preview the held-out rows, including the preprocessed `tokens` column
test.head()

Unnamed: 0,category,text,tokens
1992,tech,us peer-to-peer pirates convicted the first co...,-PRON- peer - to - peer pirate convict the fir...
1281,sport,hewitt falls to dent lleyton hewitt suffered a...,hewitt fall to dent lleyton hewitt suffer a sh...
844,tech,why cell will get the hard sell the world is c...,why cell will get the hard sell the world be c...
1188,entertainment,belle named best scottish band belle & sebas...,belle name good scottish band belle & se...
1573,business,karachi stocks hit historic high the karachi s...,karachi stock hit historic high the karachi st...


In [72]:
# Highest-weighted words of every held-out document, using the `tfidf` object built above
test['tokens'].apply(tfidf.get_top_tfidf_per_doc)

1992    [peer, copyright, piracy, network, plead, guil...
1281    [seed, hewitt, break, feel, beat, strong, four...
844     [cell, processor, technology, sony, computer, ...
1188    [scottish, band, brit, musical, list, act, awa...
1573    [market, stock, analyst, political, index, mon...
687     [film, institute, news, trend, broadcast, cite...
1830    [rule, consumer, mobile, firm, service, phone,...
1134    [net, standard, dr, address, work, challenge, ...
1015    [rugby, glasgow, edinburgh, murray, border, ro...
1665    [olympic, football, game, career, woman, final...
1648    [referee, apologise, arsenal, match, assistant...
487     [club, manager, return, at, unveil, jone, as, ...
1124    [lewis, council, labour, football, wage, gover...
1094    [gold, debt, relief, reserve, meeting, price, ...
744     [advert, drug, article, medical, employee, let...
939     [musical, theatre, stage, kelly, absolute, per...
1742    [creative, art, technology, graphic, bt, engin...
1368    [debt,

In [73]:
# Vectorize the held-out documents with the same `tfidf` object used on the training set
test_tfidf_matrix = tfidf.apply_tfidf_to_documents(test['tokens'].tolist())

In [74]:
# Highest-weighted words computed from the test matrix, which is passed
# explicitly here (the earlier call on the training corpus passed no matrix).
tfidf.get_top_tfidf(test_tfidf_matrix)

{'cell': 94.188,
 'mobile': 92.405,
 'camera': 90.674,
 'mini': 89.254,
 'china': 87.516,
 'rugby': 85.906,
 'film': 82.031,
 'bt': 76.256}