# Get the comments' HTML

In [60]:
import requests
from bs4 import BeautifulSoup
url = "http://www.theguardian.com/discussion/p/4fqc7"
# Fail fast: a timeout avoids hanging the kernel on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
html = r.text

soup = BeautifulSoup(html, "html.parser")
# Select the comment and author elements by CSS class.
comments = soup.select(".d-comment__main")
comment_authors = soup.select(".d-comment__author")

# Single-argument print(...) emits the same text as the original Python 2
# print statements and is also valid Python 3.
print("%d %s" % (len(comments), " comments found in first page."))
print("%d %s" % (len(comment_authors), " authors found in first page."))

50  comments found in first page.
50  authors found in first page.


# Extract the comments

In [61]:
# Walk the comment and author elements in lockstep (zip stops at the shorter
# list) and pull out the body text and the author's 'title' attribute.
text_author_pairs = [
    (comment_node.select(".d-comment__body")[0].text, author_node['title'])
    for comment_node, author_node in zip(comments, comment_authors)
]

# Three views of the same data: parallel lists plus a list of records.
parsed_comments = [text for text, _ in text_author_pairs]
parsed_authors = [name for _, name in text_author_pairs]
comments_dict = [{"text": text, "author": name} for text, name in text_author_pairs]

# Show the first few records as a sanity check.
print(comments_dict[:6])

[{'text': u'\nIf you think what we have now is working then your idea of " intellectual sophistication " needs reassessing. \nI doubt you\'re capable of such a simple feat. \nBack to Tory Central office for you. Judging by the rest of your comments your masters aren\'t going to keep you for much longer.\n', 'author': u'excathedra'}, {'text': u"\nWe have punitive tax rates for those at the bottom, with tax rates far higher than those at the top.\n...which is clearly bonkers. Why would I support that?\n\nIt's clear when you resort to childish insults you've not got any argument based on evidence. The evidence is there for a rebalancing but sadly your ideology refuses to acknowledge it.\n\nIt is clear that you lack the intellectual sophistication required, to avoid regarding anyone right of centre as a straw man clich\xe9 who conforms to your own crude prejudices about them.\n", 'author': u'yourcomment'}, {'text': u"\nI agree. Taxpayers contribute approx 2k per year to the NHS and yet hav

# Create comment stemmer and TFIDF vectorizer
We stem the words of each comment so that inflected forms of the same word (e.g. "taxes" and "tax") collapse into a single feature. This gives us shorter vectors to represent each comment.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

# Shared Snowball stemmer used by the vectorizer below.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stemmed tokens.

        The tokens come from the parent class's analyzer; each one is then
        passed through ``english_stemmer.stem``.
        """
        base_analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()

        def stemmed_tokens(doc):
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

# Vectorize extracted comments

In [89]:
stem_vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
stem_analyze = stem_vectorizer.build_analyzer()
# Manual check of the analyzer on a sample sentence (disabled):
# print [tok for tok in stem_analyze ("When we have a real living wage, there will no longer need to be 'stupid tax credits'. Until then, people need a top up to support themselves, because the companies they work for, don't want to give people their dues.")]

# Learn the vocabulary from the comments and turn them into TF-IDF vectors.
comment_vectors = stem_vectorizer.fit_transform(parsed_comments)

# Report the size of the learned vocabulary, then the vocabulary itself.
feature_names = stem_vectorizer.get_feature_names()
print("%d features found" % len(feature_names))
print(feature_names)

865 features found
[u'000', u'10', u'100', u'11', u'15', u'175', u'18', u'1920s', u'1980s', u'1983', u'1987', u'20', u'2008', u'2011', u'2015', u'2020', u'21', u'23', u'2k', u'30', u'320', u'3bn', u'40', u'400', u'40bn', u'40s', u'50', u'56', u'636', u'84', u'91', u'abid', u'abl', u'accord', u'account', u'accur', u'acknowledg', u'acquaint', u'actual', u'acut', u'addit', u'adequ', u'admiss', u'advis', u'advisor', u'afford', u'age', u'agenda', u'agent', u'agre', u'agricultur', u'amount', u'analys', u'analysi', u'annual', u'annum', u'answer', u'anti', u'approach', u'approx', u'aren', u'arent', u'argument', u'arrog', u'arthriti', u'articl', u'asid', u'ask', u'asset', u'associ', u'assum', u'attempt', u'author', u'averag', u'avoid', u'awar', u'away', u'backsid', u'bank', u'banker', u'base', u'basi', u'basic', u'bastard', u'bbc', u'beard', u'begun', u'behalf', u'believ', u'benefit', u'best', u'better', u'big', u'bit', u'blinker', u'bn', u'bodi', u'bone', u'bonker', u'book', u'borough', u'borr

## These are the vectorized comments

In [90]:
# Build one printable block per row of comment_vectors: a "Comment #i" header
# followed by that row's string representation.
formatted = []
for idx, row_vector in enumerate(comment_vectors):
    formatted.append("Comment #{0}\n{1}".format(idx, row_vector))

for entry in formatted:
    print(entry)

Comment #0
  (0, 456)	0.205887006719
  (0, 330)	0.192929186203
  (0, 60)	0.205887006719
  (0, 471)	0.246137673625
  (0, 158)	0.222592542856
  (0, 662)	0.222592542856
  (0, 420)	0.246137673625
  (0, 524)	0.246137673625
  (0, 129)	0.222592542856
  (0, 795)	0.205887006719
  (0, 288)	0.246137673625
  (0, 713)	0.205887006719
  (0, 121)	0.246137673625
  (0, 242)	0.246137673625
  (0, 631)	0.246137673625
  (0, 502)	0.182341875949
  (0, 731)	0.222592542856
  (0, 405)	0.222592542856
  (0, 376)	0.182341875949
  (0, 852)	0.147143910764
  (0, 787)	0.182341875949
Comment #1
  (0, 588)	0.16162547186
  (0, 192)	0.16162547186
  (0, 166)	0.16162547186
  (0, 150)	0.16162547186
  (0, 466)	0.146164641282
  (0, 751)	0.16162547186
  (0, 128)	0.16162547186
  (0, 670)	0.126686298348
  (0, 639)	0.146164641282
  (0, 74)	0.146164641282
  (0, 653)	0.146164641282
  (0, 429)	0.126686298348
  (0, 36)	0.146164641282
  (0, 638)	0.146164641282
  (0, 378)	0.135195007414
  (0, 684)	0.126686298348
  (0, 632)	0.146164641282

# Apply clustering algorithm to vectorized comments

## KMeans with 4 clusters

In [97]:
from sklearn.cluster import KMeans
N_CLUSTERS = 4   # number of k-means clusters; also drives the reporting loop below
TOP_TERMS = 10   # how many terms to list for each cluster
SEED = 42        # with n_init=1 the result hinges on a single random
                 # initialisation, so fix the seed to make re-runs reproducible

km = KMeans(n_clusters=N_CLUSTERS, init='k-means++',
            max_iter=100, n_init=1, random_state=SEED)
km.fit(comment_vectors)

# Top terms per cluster: argsort each centroid's weights, then reverse the
# column order so the highest-weighted feature indices come first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = stem_vectorizer.get_feature_names()
for i in range(N_CLUSTERS):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :TOP_TERMS]:
        print(" %s" % terms[ind])
    print("")


Cluster 0:
 fund
 yes
 doe
 increas
 hous
 agre
 nhs
 head
 okay
 deal

Cluster 1:
 explain
 emigr
 suppos
 diggin
 beard
 comment
 simpl
 man
 start
 big

Cluster 2:
 clear
 tax
 rate
 got
 evid
 know
 intellig
 argument
 idea
 sure

Cluster 3:
 pension
 debt
 state
 banker
 trillion
 just
 don
 money
 care
 borrow



!git add -A && git commit -m "Clusters comments in 4 clusters using kmeans. Now I intend to use agglomerative clustering on the data."

# Questions and thoughts
* The scikit-learn k-means clustering tutorial talks about dimensionality reduction using LSA. Doesn't this make us lose information? (This is why I didn't do that normalization.)
* After the clustering algorithm is applied, how do we evaluate its effectiveness? The question arises because clustering is only the halfway point of our summarization pipeline: we still need to extract, from each cluster, the most important comments that are non-overlapping and best represent the discussion.
* This very simple approach to summarization is not taking into account dialogue acts. We can use the reply meta information in the comments to tell when a comment is an actual response to another comment. I am not sure how to use this information though. An idea I have is to do some summarization of each set of replies; so, we are basically summarizing the little sub-conversations as well.