In [5]:
from sklearn import cluster, datasets
import numpy as np
import json
from nltk.corpus import wordnet
from collections import defaultdict

words = [wordnet.synset(u"cat.n.01"), wordnet.synset(u"dog.n.01"),
         wordnet.synset(u"horse.n.01"), wordnet.synset(u"boat.n.01"), wordnet.synset(u"ship.n.01")]


def w2w(w1, w2):
      if w1 == w2:
        return 1
      else:
        return w1.wup_similarity(w2)


def make_data_using_wordnet(words):
    list_of_vectors = []
    for word_x in words:
        wordvector = []
        for word_y in words:
            wordvector.append(w2w(word_x, word_y))
        list_of_vectors.append(wordvector)
    data = np.array(np.array(list_of_vectors))
    labels = words
    return (data, labels)


def make_data_by_hand():
    ''' make an array of word to word path distances, thus:
         cat  dog   horse boat  ship
    cat    1    0.8   0.6   0.1   0.1
    dog    0.8  1     0.7   0.15  0.15
    horse  0.6  0.7   1     0.1   0.1
    boat   0.1  0.15  0.1   1     0.9
    ship   0.1  0.15  0.1   0.9   1
    '''

    cat = np.array([1.0, 0.8, 0.6, 0.1, 0.1])
    dog = np.array([0.8, 1.0, 0.7, 0.15, 0.15])
    horse = np.array([0.6, 0.7, 1.0, 0.1, 0.1])
    boat = np.array([0.1, 0.15, 0.1, 1.0, 0.9])
    ship = np.array([0.1, 0.15, 0.1, 0.9, 1.0])

    data = np.array([cat, dog, horse, boat, ship])
    labels = np.array(['cat', 'dog', 'horse', 'boat', 'ship'])
    return (data, labels)


def word_cluster(data, labels, k):
    k_means = cluster.KMeans(n_clusters=2)
    k_means.fit(data)
    for i, label in enumerate(labels):
        print(label, k_means.labels_[i])

    d = defaultdict(list)
    for c, l in zip(k_means.labels_, labels):
        d['cluster' + str(c)].append(l.name())
        print(json.dumps(d, indent=True))

if __name__ == "__main__":
    data, labels = make_data_using_wordnet(words)
    word_cluster(data, labels, k=2)

Synset('cat.n.01') 0
Synset('dog.n.01') 0
Synset('horse.n.01') 0
Synset('boat.n.01') 1
Synset('ship.n.01') 1
{
 "cluster0": [
  "cat.n.01"
 ]
}
{
 "cluster0": [
  "cat.n.01",
  "dog.n.01"
 ]
}
{
 "cluster0": [
  "cat.n.01",
  "dog.n.01",
  "horse.n.01"
 ]
}
{
 "cluster0": [
  "cat.n.01",
  "dog.n.01",
  "horse.n.01"
 ],
 "cluster1": [
  "boat.n.01"
 ]
}
{
 "cluster0": [
  "cat.n.01",
  "dog.n.01",
  "horse.n.01"
 ],
 "cluster1": [
  "boat.n.01",
  "ship.n.01"
 ]
}


In [10]:
from nltk.corpus import wordnet as wn

dog = wn.synsets('dog', pos=wn.NOUN)[0]
cat = wn.synsets('cat', pos=wn.NOUN)[0]
rose = wn.synsets('rose', pos=wn.NOUN)[0]
flower = wn.synsets('flower', pos=wn.NOUN)[0]

In [12]:
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
rose.res_similarity(flower, brown_ic)

6.0283161048744525

In [13]:
rose.res_similarity(dog, brown_ic)

2.2241504712318556

In [14]:
cat.res_similarity(dog, brown_ic)

7.911666509036577

In [21]:
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint


def word_tokenizer(text):
        #tokenizes and stems the text
        tokens = word_tokenize(text)
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens if t not in stopwords.words('english')]
        return tokens


def cluster_sentences(sentences, nb_of_clusters=5):
        tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                        stop_words=stopwords.words('english'),
                                        max_df=0.9,
                                        min_df=0.1,
                                        lowercase=True)
        #builds a tf-idf matrix for the sentences
        tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
        kmeans = KMeans(n_clusters=nb_of_clusters)
        kmeans.fit(tfidf_matrix)
        clusters = collections.defaultdict(list)
        for i, label in enumerate(kmeans.labels_):
                clusters[label].append(i)
        return dict(clusters)


if __name__ == "__main__":
        sentences = ["Nature is beautiful","I like green apples",
                "We should protect the trees","Fruit trees provide fruits",
                "Green apples are tasty"]
        nclusters= 3
        clusters = cluster_sentences(sentences, nclusters)
        for cluster in range(nclusters):
                print ("cluster ",cluster,":")
                for i,sentence in enumerate(clusters[cluster]):
                        print ("\tsentence ",i,": ",sentences[sentence])

cluster  0 :
	sentence  0 :  We should protect the trees
	sentence  1 :  Fruit trees provide fruits
cluster  1 :
	sentence  0 :  I like green apples
	sentence  1 :  Green apples are tasty
cluster  2 :
	sentence  0 :  Nature is beautiful
