In [1]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bray1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bray1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bray1\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bray1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
import urllib
import json
import numpy as np
import re

from bs4 import BeautifulSoup
from collections import defaultdict
from nltk.probability import FreqDist
from heapq import nlargest
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier


In [61]:
blog_url = 'https://public-api.wordpress.com/rest/v1.1/sites/binaykumarray.wordpress.com/posts/'

def scrape_blog(url):
    response = urllib.request.urlopen(url)
    blog_data = json.loads(response.read())
    data = [{i['slug']:{'title': i['title'], 'url': i['URL'], 'content': i['content']}} for i in blog_data.get('posts')]
    content_list = [clean_html(i['content']) for i in blog_data.get('posts')]
    return data, content_list

def clean_html(html_text):
    text = BeautifulSoup(html_text, "lxml").text
    text = text.replace('\n', '')
    text = text.strip()
    return text

data, content_list = scrape_blog(blog_url)

In [65]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words="english")
x = vectorizer.fit_transform(content_list)
km = KMeans(n_clusters=5, init="k-means++", max_iter=100, n_init=1, verbose=True)
km.fit(x)

Initialization complete
Iteration  0, inertia 22.237
Iteration  1, inertia 11.911
Converged at iteration 1: center shift 0.000000e+00 within tolerance 1.938224e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [68]:

np.unique(km.labels_, return_counts=True)

(array([0, 1, 2, 3, 4]), array([4, 7, 3, 3, 3]))

In [69]:
_stopwords = set(stopwords.words('english') + list(punctuation) + ['million', 'billion', 'year', 'millions', 'billions', 'y/y', "'s", "'"])

In [71]:
text = {}
for i, cluster in enumerate(km.labels_):
    oneDocument = content_list[i]
    if cluster not in text.keys():
        text[cluster] = oneDocument
    else:
        text[cluster] += oneDocument

In [75]:
keywords = {}
counts = {}
for cluster in range(5):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(100, freq, key=freq.get)
    counts[cluster] = freq

In [76]:
unique_keys = {}
for cluster in range(5):
    other_clusters = list(set(range(3)) - set([cluster]))
    
    keys_other_cluster = set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    unique = set(keywords[cluster]) - keys_other_cluster
    unique_keys[cluster] = nlargest(10, unique, key=counts[cluster].get)


In [77]:
unique_keys

{0: ['coding',
  '3.x',
  'version',
  'python3',
  'even',
  'better',
  'properly',
  'want',
  '3.4',
  'standards'],
 1: ["''",
  'response',
  'thread',
  'threads',
  'user',
  'http',
  '8217',
  '--',
  'processes',
  '``'],
 2: ['list', '2', '4', '1', 'x', '8', 'append', '3', 'b', 'element'],
 3: ['name',
  'gc',
  'bar',
  'p2',
  '__',
  'variable',
  'style',
  'c1',
  'parent',
  'classes'],
 4: ['changed',
  '”',
  '“',
  'libraries',
  'method',
  'os.path.dirname',
  'settings',
  'work',
  '__file__',
  'step']}

In [98]:
class MachineModel(object):
    def __init__(self, cluster=3):
        self.cluster = cluster
        self.vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words="english")
        self.km = KMeans(n_clusters=5, init="k-means++", max_iter=100, n_init=1, verbose=True)
        self.classifier = KNeighborsClassifier(n_neighbors=cluster)
        self._stopwords = set(
            stopwords.words('english') +
            list(punctuation) +
            ['--', "''", 'p2', '4', 'x', '8', '10', '3', 'b', '8217', '”','“', "'", '–','3.x', '__', '5', '6', '``', '1']
        )
        
    def add_vector_data(self, data):
        self.data = data
        self.vectordata = self.vectorizer.fit_transform(data)
    
    def fit_km(self):
        self.km.fit(self.vectordata)
    
    def get_unique_cluster_data(self):
        return np.unique(km.labels_, return_counts=True)
        
    def set_cluster_for_each_data(self):
        self.text = {}
        for i, cluster in enumerate(self.km.labels_):
            oneDocument = self.data[i]
            if cluster not in self.text.keys():
                self.text[cluster] = oneDocument
            else:
                self.text[cluster] += oneDocument
                
    def _calculate_count_for_cluster(self):
        self.keywords = {}
        self.counts = {}
        for cluster in range(self.cluster):
            word_sent = word_tokenize(self.text[cluster].lower())
            word_sent = [word for word in word_sent if word not in self._stopwords]
            freq = FreqDist(word_sent)
            self.keywords[cluster] = nlargest(100, freq, key=freq.get)
            self.counts[cluster] = freq
        
        self.unique_keys = {}
        for cluster in range(self.cluster):
            other_clusters = list(set(range(self.cluster)) - set([cluster]))

            keys_other_cluster = set(self.keywords[other_clusters[0]]).union(set(self.keywords[other_clusters[1]]))
            unique = set(self.keywords[cluster]) - keys_other_cluster
            self.unique_keys[cluster] = nlargest(10, unique, key=self.counts[cluster].get)
    
    def _fit_data_to_classifier(self):
        self.classifier.fit(self.vectordata, self.km.labels_)
    
    def train_data(self, data):
        self.add_vector_data(data)
        self.fit_km()
        print("Clusters: ", self.get_unique_cluster_data())
        self.set_cluster_for_each_data()
        self._calculate_count_for_cluster()
        self._fit_data_to_classifier()
    
    def predict(self, data):
        test = self.vectorizer.transform([data])
        return self.classifier.predict(test)
    

article= """List Comprehension is a beautiful feature of Python. But most of the Python beginners (like me) get confused when it comes to nested list comprehension. So I thought to write few lines of Python code that might help you to understand it better. Can you figure out the output of the following statement? [(x, y) for x in range(1, 5) for y in range(0, x)] Here is the output: [(1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2), (4, 0), (4, 1), (4, 2), (4, 3)] If you could figure it out correctly, you don't need to read the rest of the post. This is actually same as [(x, y) for x in range(1, 5) for y in range(0, x)] Now think about it, experiment with your own ideas and things will be clear."""
obj = MachineModel(5)
obj.train_data(content_list)
x=obj.predict(article)
obj.unique_keys[x[0]]

Initialization complete
Iteration  0, inertia 22.463
Iteration  1, inertia 11.923
Converged at iteration 1: center shift 0.000000e+00 within tolerance 1.938224e-07
Clusters:  (array([0, 1, 2, 3, 4]), array([4, 7, 3, 3, 3]))


['coding',
 'etc',
 'even',
 'properly',
 'standards',
 'flake8',
 'functions',
 'want',
 'suggest',
 'doc-strings']