lab3.py
__author__ = '315-4'
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans  # import k-means
import sklearn.metrics as metrics  # import the clustering quality metrics
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

vect = TfidfVectorizer()  # base vectorizer, used only for its tokenizer
# import nltk
# nltk.download()  # uncomment on the first run to fetch the WordNet data
def tokenize(text):
    """Tokenize a text, then stem and lemmatize every token."""
    stems = []
    lems = []
    tok = vect.build_tokenizer()
    tokens = tok(text)
    st = PorterStemmer()
    lem = WordNetLemmatizer()
    for token in tokens:
        stems.append(st.stem(token))
        lems.append(lem.lemmatize(token))
    # stems are computed but unused; only the lemmas are returned
    return lems
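
# For illustration (not part of the original lab), the tokenizer behaves like:
#   tokenize("The cats are running")  ->  ['The', 'cat', 'are', 'running']
# (WordNetLemmatizer defaults to the noun pos, so verbs like "running" pass through unchanged)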

tfidf_ngrams = TfidfVectorizer(tokenizer=tokenize)
dataset = fetch_20newsgroups(categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])  # take only 3 categories
print(len(dataset.data))  # number of documents
labels = dataset.target
print(dataset.target_names)  # list of categories
print(len(dataset.target_names))
X = tfidf_ngrams.fit_transform(dataset.data)
# print(X)  # dump the sparse tf-idf matrix
# cluster with k-means
km = KMeans(n_clusters=3)  # create the model
km.fit(X)  # fit it
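
# A usage sketch, not in the original lab: assigning an unseen document to a
# cluster; the sample sentence is made up for illustration.
new_doc = ["NASA launched a new probe to study the solar wind."]
print(km.predict(tfidf_ngrams.transform(new_doc)))  # index of the nearest cluster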

# evaluate clustering quality by how the texts fall into clusters
print(labels, km.labels_)
print(metrics.homogeneity_score(labels, km.labels_))  # only type-I errors (clusters mixing classes)
print(metrics.completeness_score(labels, km.labels_))  # whether each class lands in a single cluster
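# As a possible extension (not in the original lab): the V-measure is the
# harmonic mean of homogeneity and completeness.
print(metrics.v_measure_score(labels, km.labels_))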
# we could also score the lemmatization
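
# Since tokenize() above already lemmatizes, a natural comparison point is a
# purely stemmed run. A minimal sketch of that comparison; the helper name
# tokenize_stems is hypothetical, not part of the original lab.
def tokenize_stems(text):
    tok = vect.build_tokenizer()
    st = PorterStemmer()
    return [st.stem(token) for token in tok(text)]

tfidf_stems = TfidfVectorizer(tokenizer=tokenize_stems)
X_stems = tfidf_stems.fit_transform(dataset.data)
km_stems = KMeans(n_clusters=3)
km_stems.fit(X_stems)
print(metrics.homogeneity_score(labels, km_stems.labels_))
print(metrics.completeness_score(labels, km_stems.labels_))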