In [1]:
import collections
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Fetch the NLTK resources used by the cells below (nltk reports "already up-to-date"
# instead of re-downloading when a package is present):
#   - 'stopwords': lists of very common words (e.g. "is", "the") that add little meaning
#     to a sentence; read via stopwords.words('english').
#   - 'punkt': tokenizer models (unzipped under tokenizers/) — presumably what
#     word_tokenize relies on; newer NLTK releases may need a different resource name.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caelyasutake/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/caelyasutake/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
def tokenizer(text):
    """Tokenize ``text``, drop English stop words, and stem what remains.

    Args:
        text: The string to split into word tokens.

    Returns:
        A list of Porter stems, one per surviving token, in the order the
        tokens appear in ``text``. Punctuation tokens emitted by
        ``word_tokenize`` are not filtered out here.
    """
    # Build the stop-word lookup once per call. The original re-read the word
    # list for every single token and scanned it linearly each time.
    stop_words = set(stopwords.words('english'))

    # Reduces each word to its stem so inflected forms share one feature.
    stemmer = PorterStemmer()

    # Compare on the lowercased token so capitalised stop words ("The", "Is")
    # are dropped as well when this function is called on raw text; for input
    # that is already lowercase this is identical to the original check.
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]

In [8]:
def cluster_sentences(texts, n=2, random_state=None):
    """Group sentences into ``n`` clusters using TF-IDF features and k-means.

    Args:
        texts: Sequence of sentence strings to cluster.
        n: Number of clusters to ask k-means for (default 2).
        random_state: Optional seed forwarded to ``KMeans``. Leave as ``None``
            for the library's default (non-deterministic) initialisation, or
            pass an int to make cluster assignments reproducible across runs.

    Returns:
        A plain dict mapping each cluster label found in ``model.labels_`` to
        the list of indices (positions in ``texts``) assigned to that cluster,
        in ascending index order. A label that received no sentences has no key.
    """
    # Lowercase, tokenize/stem with the notebook's tokenizer, and drop common words.
    vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('english'), lowercase=True)
    # Build the TF-IDF matrix for the sentences.
    matrix = vectorizer.fit_transform(texts)
    # Fit k-means with the requested cluster count. The original hard-coded 2
    # here, silently ignoring the ``n`` argument.
    model = KMeans(n_clusters=n, random_state=random_state)
    model.fit(matrix)

    # Invert the per-sentence labels into label -> [sentence indices].
    topics = collections.defaultdict(list)
    for index, label in enumerate(model.labels_):
        topics[label].append(index)
    return dict(topics)

In [9]:
if __name__ == '__main__':
    # Small demo corpus: one physics sentence, one software sentence and three
    # finance-related sentences.
    sentences = [
        "Quantum physics is quite important in science nowadays.",
        "Software engineering is hotter and hotter topic in silicon valley",
        "Investing in stocks and trading with them are not that easy",
        "FOREX is the stock market fro trading currencies",
        "Warren Buffet is famous for making good investments. He knows stock markets.",
    ]

    # Cluster the corpus, then print every cluster with its member sentences,
    # numbering the members from 1 within each cluster.
    n_clusters = 2
    clusters = cluster_sentences(sentences, n_clusters)

    for cluster_id in range(n_clusters):
        print("CLUSTER ", cluster_id, ":")
        member_indices = clusters[cluster_id]
        for position, sentence_index in enumerate(member_indices, start=1):
            print("\tSENTENCE ", position, ": ", sentences[sentence_index])

CLUSTER  0 :
	SENTENCE  1 :  Investing in stocks and trading with them are not that easy
	SENTENCE  2 :  FOREX is the stock market fro trading currencies
	SENTENCE  3 :  Warren Buffet is famous for making good investments. He knows stock markets.
CLUSTER  1 :
	SENTENCE  1 :  Quantum physics is quite important in science nowadays.
	SENTENCE  2 :  Software engineering is hotter and hotter topic in silicon valley


