In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('articles.csv')

In [3]:
data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
from nltk.stem.snowball import SnowballStemmer
import re

def tokenizer(text):
    tokens = [];
    stemmer = SnowballStemmer('english')
    
    sentences = text.split('.')
    for sentence in sentences:
        for word in sentence.split(' '):
            if len(word) > 0 and re.search("^[a-zA-Z]+$", word):
                tokens.append(stemmer.stem(word))
                
    return tokens

In [78]:
X = data.iloc[:, 1]
y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_df=0.9, min_df=5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [79]:
X_train_tfidf.shape

(1557, 4983)

In [80]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [81]:
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

MultinomialNB()

In [82]:
mnb.score(X_test_tfidf, y_test)

0.9610778443113772

In [83]:
words = tfidf.get_feature_names_out()
for word in words:
    print(word)

aaa
aaron
abandon
abba
abbott
abc
abid
abil
abl
abn
abolish
abov
abroad
absenc
absent
absolut
absorb
abus
ac
academ
academi
acceler
accept
access
accid
accident
acclaim
accolad
accommod
accompani
accomplish
accord
account
accumul
accur
accuraci
accus
achiev
acknowledg
acquir
acquisit
act
action
activ
activist
actor
actress
actual
ad
adam
adapt
add
addict
addit
address
adequ
adjust
administr
admir
admiss
admit
adopt
adrian
adsl
adult
advanc
advantag
adventur
advers
advert
advertis
advic
advis
advisor
advisori
advoc
aesthet
affair
affect
affili
afford
afghanistan
afraid
africa
african
aftermath
afternoon
afterward
agassi
age
agenc
agenda
agent
aggress
ago
agre
agreement
agricultur
ahead
ai
aid
aim
air
airbus
aircraft
airlin
airport
airway
aka
al
alan
alarm
alastair
albeit
albert
album
alcohol
alert
alex
alexand
ali
alicia
alien
alik
aliv
allan
alleg
allegi
allen
alli
allianc
alloc
allow
alon
alongsid
alreadi
alter
altern
altogeth
alway
amaz
amazon
ambassador
ambit
ambiti
amend
america
am

In [86]:
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd

In [228]:
svd = TruncatedSVD(n_components=100)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

[u, s, v_t] = randomized_svd(X_train_tfidf, n_components=1, random_state=0)

In [229]:
zipped_topics = sorted(zip(tfidf.get_feature_names_out(), v_t[0]), key=lambda x: x[1], reverse=True)
for word in zipped_topics:
    if word[1] >= 0.1:
        print(f'{word[0]}: {word[1]}')

said: 0.22600846191003465
mr: 0.19415505690441606
year: 0.12007912716800428
game: 0.11589111992653194
peopl: 0.10579265617951397
labour: 0.10183595692028723


In [230]:
from sklearn.svm import SVC
svc = SVC(probability=True)
svc.fit(X_train_svd, y_train)
svc.score(X_train_svd, y_train)

0.9910083493898523

In [231]:
svc.score(X_test_svd, y_test)

0.9760479041916168

In [232]:
f = open('./test_text.txt','r')

In [233]:
text = f.read()
text_tfidf = tfidf.transform([text])

In [234]:
text_svd = svd.transform(text_tfidf)

In [235]:
predictions = zip(svc.classes_, svc.predict_proba(text_svd)[0])
for pred in predictions:
    print(f'{pred[0]}: {pred[1]}')

business: 0.0327405270530154
entertainment: 0.08627275455113291
politics: 0.8719791358769482
sport: 0.002459134322500363
tech: 0.006548448196403136


In [236]:
svc.predict(text_svd)

array(['politics'], dtype=object)

In [249]:
query = "cyber security"
query_tfidf = tfidf.transform([query])
query_svd = svd.transform(query_tfidf)

In [250]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
i = np.argmax(cosine_similarity(query_tfidf, X_train_tfidf, dense_output=True))
print(data.iloc[i, 1])

the pirates with no profit motive two men who were part of a huge network of internet software pirates  known as drink or die  have been convicted at the old bailey. bbc news investigates how the network worked and what motivated those involved.  they called themselves drink or die (dod). they were a network of computer buffs who derived pleasure from cracking codes protecting copyrighted software such as windows 95. they would then share it with each other. there is no suggestion any of them profited financially. but the authorities in both britain and the united states considered it software piracy and took a dim view of networks such as dod  one of a number of so-called warez organisations operating on the internet. in october 2000 the us customs service began an investigation into dod and other networks  such as razor 1911  risciso  myth and popz.  fourteen months later us customs co-ordinated a series of raids across the globe as part of operation buccaneer. seventy search warrant