# Text vectorization - TF-IDF

## Open files

In [1]:
import pandas as pd

In [2]:
# Load the BBC news dataset (one row per article: `category`, `text`) from the public bucket.
df = pd.read_csv(
    'https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv'
)

In [3]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# Class balance: number of articles per category
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

## Preprocess

In [5]:
from nautilus_nlp.preprocessing.tokenizer import tokenize
from nautilus_nlp.preprocessing.lemmatization import lemmatize_english_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
%%time
df['tokens'] = df.text.apply(lambda row: tokenize(row))
df['tokens'] = df.tokens.apply(lambda row: lemmatize_english_tokens(row))
df['tokens'] = df.tokens.apply(lambda row: ' '.join(row))

CPU times: user 2min 47s, sys: 1.15 s, total: 2min 48s
Wall time: 2min 48s


## Split dataframe

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the articles for the "new documents" section.
# The seed is fixed so that Restart & Run All reproduces the same split,
# and therefore the same TF-IDF vocabulary and scores.
train, test = train_test_split(df, test_size=0.2, random_state=42)

## Compute TF-IDF

In [9]:
from nautilus_nlp.preprocessing.text_vectorizers import TfidfTextVectorizer

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [15]:
# Build the vectorizer; keyword arguments are forwarded to scikit-learn's TfidfVectorizer.
tfidf = TfidfTextVectorizer(max_df=0.95, # ignore terms whose document frequency is strictly higher than 95% of the documents
              min_df=0.01, # ignore terms whose document frequency is strictly lower than 1% of the documents
              max_features=10000, # cap the vocabulary at the 10,000 most frequent terms
              encoding='utf-8',
              # stop_words=SW,  # optional stop-word list; `SW` is not defined in the cells shown here
              norm=None, # no normalisation of the document vectors, so scores are unbounded
              # ngram_range=(1, 2),  # optional: also index bigrams
              )

In [16]:
# TfidfTextVectorizer uses scikit-learn's TfidfVectorizer(), exposed as `tfidf_vectorizer`.
# You can pass all the arguments supported by scikit-learn.
# Doc : https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf.tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=10000, min_df=0.01,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# Training corpus as a plain Python list of preprocessed strings
list_of_docs = train['tokens'].tolist()

In [18]:
# Inspect the first two preprocessed training documents
list_of_docs[:2]

['bid to cut court witness stress new target to reduce the stress to victim and witness give evidence in court in england and wale have be announce by the lord chancellor .    lord falconer want all crown court and 90 % of magistrate    court to have facility to keep witness separate from defendant within four year . more video link will also be make available so that witness do not have to enter courtroom . -PRON- be part of a five - year plan to help build confidence in the justice system .    minister say the strategy be aim at re - balance the court system towards victim    and increase the number of offender bring to justice . launch the department for constitutional affair    plan    lord falconer say :    one of the top priority will be a well deal for victim .    the need and safety of victim will be at the heart of the way trial be manage .     court    judge    magistrate    prosecutor    police and victim support - all work together to ensure the right of victim be put first

In [19]:
# Compute the TF-IDF matrix of the training documents
# (returned as a sparse matrix: one row per document, one column per vocabulary term).
# NOTE(review): presumably this also fits the vocabulary / IDF weights that later cells
# reuse on unseen documents — confirm in TfidfTextVectorizer.
tfidf.compute_tfidf(list_of_docs)

<1780x2638 sparse matrix of type '<class 'numpy.float64'>'
	with 251758 stored elements in Compressed Sparse Row format>

In [20]:
# Get the highest-weighted words of the training corpus as a {word: score} dict
# (n=100 words requested)
tfidf.get_top_tfidf(n=100)

{'song': 328.383,
 'wage': 314.316,
 'roddick': 293.646,
 'minimum': 260.403,
 'music': 75.21,
 'robbie': 157.772,
 'terrorist': 153.899,
 'good': 151.491,
 'game': 73.159,
 'increase': 129.725,
 'hunt': 93.316,
 'party': 126.188,
 '25': 125.075,
 'pay': 121.356,
 'muslim': 114.249,
 'gadget': 113.934,
 'student': 108.451,
 'threat': 104.522,
 'black': 103.206,
 'liverpool': 101.794,
 'airline': 99.171,
 'stone': 96.591,
 'mini': 90.917,
 'mobile': 89.006,
 'play': 92.565,
 'cell': 92.487,
 'virus': 87.661,
 'jamie': 92.1,
 'spam': 91.696,
 'film': 82.501,
 'hour': 91.616,
 'camera': 91.062,
 'edward': 88.856,
 'search': 71.07,
 'passenger': 88.648,
 'lord': 87.43,
 'rugby': 86.836,
 'china': 86.425,
 'yuko': 75.514,
 'government': 86.077,
 'machine': 84.396,
 'ibm': 82.338,
 'online': 80.27,
 'asylum': 80.242,
 'musician': 79.603,
 'fraud': 79.531,
 'pension': 79.157,
 'that': 78.174,
 'gaming': 78.083,
 'bt': 77.836,
 'cash': 77.796,
 'argentina': 77.567,
 'site': 77.486,
 'lee': 77.

In [21]:
# Print the 10 highest-weighted words of each of the first 10 training documents
for document in list_of_docs[:10]:
    top_words = tfidf.get_top_tfidf_per_doc(document, n=10)
    print(top_words)

['court', 'lord', 'witness', 'victim', 'evidence', 'chancellor', 'rid', 'constitutional', 'house', 'justice']
['ship', 'market', 'limited', 'for', 'technical', 'one', 'stock', 'impact', 'much', 'on']
['trust', 'politician', 'voter', 'election', 'poll', 'public', 'issue', 'lib', 'dem', 'lack']
['member', 'share', 'qualify', 'profit', '42', 'payment', 'society', 'than', 'building', 'worth']
['ireland', 'cardiff', 'slam', 'england', 'grand', 'year', 'wale', 'last', 'team', 'tournament']
['good', 'theatre', 'musical', 'mary', 'royal', 'producer', 'at', 'design', 'outstanding', 'award']
['virus', 'writer', 'phone', '2004', 'that', 'attack', 'number', 'spam', 'use', 'message']
['academy', 'award', 'oscar', 'war', 'ceremony', 'frank', 'winner', 'film', 'first', 'as']
['broadcaster', 'debate', 'lord', 'prime', 'campaign', 'election', 'say', 'minister', 'blair', 'ahead']
['parliament', 'record', 'prior', 'document', 'blow', 'page', 'act', 'room', 'by', 'history']


## Apply TF-IDF to new documents

In [22]:
# Preview the held-out rows, including the preprocessed `tokens` column
test.head()

Unnamed: 0,category,text,tokens
1515,business,booming markets shed few tears the market for...,boom market shed few tear the market former...
980,sport,claxton hunting first major medal british hurd...,claxton hunt first major medal british hurdler...
1634,tech,sony psp handheld console hits us the latest h...,sony psp handheld console hit -PRON- the late ...
1053,entertainment,black sabbath top rock album poll black sabbat...,black sabbath top rock album poll black sabbat...
1870,sport,jones doping probe begins an investigation int...,jone dope probe begin an investigation into do...


In [23]:
# Highest-weighted words of every held-out document (one list per row)
test['tokens'].apply(tfidf.get_top_tfidf_per_doc)

1515    [disaster, market, stock, insurance, global, b...
980     [medal, hurdle, indoor, european, training, co...
1634    [sony, device, gaming, handheld, gadget, ninte...
1053    [band, rock, black, album, list, top, live, po...
1870    [olympic, jone, truth, medal, sydney, pound, a...
1135    [store, sale, retailer, december, same, rise, ...
1034    [jackson, ms, boy, prosecution, court, film, d...
1061    [festival, sell, event, day, organiser, act, j...
205     [engine, union, buy, six, motor, plant, italia...
1500    [program, phone, malicious, file, instal, work...
2100    [uk, trick, ban, will, super, industry, respon...
1046    [round, victory, title, really, first, capture...
1728    [cardiff, thomas, jone, morgan, france, italy,...
575     [file, server, operator, peer, network, site, ...
24      [sound, audio, mobile, technology, phone, hand...
1556    [eu, law, draft, computer, software, legal, im...
2058    [interactive, award, nomination, category, tv,...
750     [card,

In [24]:
# Vectorize the held-out documents with the existing `tfidf` object
test_tfidf_matrix = tfidf.apply_tfidf_to_documents(test['tokens'].tolist())

In [25]:
# Highest-weighted words computed from the held-out TF-IDF matrix
# rather than from the training corpus
tfidf.get_top_tfidf(test_tfidf_matrix)

{'zealand': 157.747,
 'gadget': 137.671,
 'child': 100.98,
 'camera': 91.062,
 'wale': 86.221,
 'mini': 85.866,
 'bt': 77.836,
 'soul': 77.567,
 'council': 76.619,
 'jackson': 74.892}