# DELPHI - document clustering console

In [1]:
from IPython.display import HTML

# Render a "toggle raw code" button as this cell's output. The embedded script
# hides every `div.input` element once the document is ready and flips their
# visibility each time the form button below is submitted.
# NOTE(review): the script relies on the `$` (jQuery) global - presumably
# supplied by the notebook front end; confirm it exists wherever this is rendered.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
import nltk
from ipywidgets import Dropdown, HTML, VBox, Layout
from IPython.display import display, clear_output
from nltk.stem.snowball import SnowballStemmer

# Default stemmer, built for English before any language has been picked.
stemmer = SnowballStemmer('english')

# Selector for the corpus language.
# NOTE(review): the selector starts on 'russian' while the stemmer above is
# built for 'english' - confirm which default is intended.
languages_dropdown = Dropdown(
    description='Corpus language : ',
    value='russian',
    options=['english', 'french', 'spanish', 'german', 'russian'],
)

def on_trait_change(change):
    """Switch the working language when the dropdown selection changes.

    Rebuilds the module-level Snowball ``stemmer`` for the newly selected
    language and prints that language's NLTK stopword list.

    Args:
        change: change notification passed by ``observe``; only
            ``change['new']`` (the selected language name) is read.
    """
    # Without this declaration the assignment below would only create a local
    # that is discarded on return, leaving the module-level stemmer untouched.
    global stemmer

    language = change['new']

    clear_output()
    stemmer = SnowballStemmer(language)
    stopwords = nltk.corpus.stopwords.words(language)

    print('Stopwords :\n{}'.format(stopwords))

# Call the handler whenever the dropdown's `value` trait changes, then render
# the dropdown in the cell output.
languages_dropdown.observe(on_trait_change, names='value')
display(languages_dropdown)

In [3]:
from ipywidgets import Button

from data import data

# Corpus shared by the cells below: parallel lists, one entry per record.
titles = []
abstracts = []

def click(b):
    """Load every record of ``data`` into ``titles`` and ``abstracts``.

    The lists are refilled rather than appended to, so clicking the button
    more than once does not duplicate the corpus.

    Args:
        b: the clicked button (unused).
    """
    # Slice assignment replaces the contents in place; no `global` statement
    # is needed and whatever list is currently bound to each name is reused.
    # `data` is the module-level name imported at the top of this cell.
    titles[:] = [record['title'] for record in data]
    abstracts[:] = [record['abstract'] for record in data]

    print('Loaded {} documents.'.format(len(titles)))
    
# Button that runs the import handler above on each click.
button = Button(description='Import documents')
button.on_click(click)

# Bare expression so the notebook renders the button as the cell output.
button

## K-means clustering with tf-idf weighting

In [4]:
from __future__ import print_function

from ipywidgets import IntSlider, HBox
from sklearn.cluster import KMeans

from main import cluster_abstracts

# Number of clusters requested by the user. The lower bound is 1: a value of 0
# would give the cluster selector built in the click handler an empty option
# list, and is presumably rejected by the k-means step inside
# `cluster_abstracts` as well.
num_clusters_slider = IntSlider(
    value=5,
    min=1,
    max=10,
    step=1,
    description='Number of clusters : '
)

cluster_button = Button(description='Cluster corpus')

def click(b):
    """Cluster the loaded abstracts and show a per-cluster term browser.

    Clears the cell output, runs ``cluster_abstracts`` with the cluster count
    taken from ``num_clusters_slider``, prints the tokenized and stemmed
    vocabularies, and displays a dropdown that prints the terms of the
    selected cluster ordered by descending centroid weight.

    Args:
        b: the clicked button (unused).
    """
    clear_output()

    # Read the slider once so the clustering and the dropdown always agree.
    n_clusters = num_clusters_slider.value

    if not abstracts:
        print('No documents loaded - import documents before clustering.')
        return
    if n_clusters < 1:
        print('Select at least one cluster.')
        return

    # Only the model, the two vocabularies and the term list are used here;
    # the remaining return values are unpacked under underscore-prefixed names.
    (km, totalvocab_stemmed, totalvocab_tokenized,
     _vocab_frame, _tfidf_matrix, terms, _frame) = cluster_abstracts(
        titles, abstracts, n_clusters)

    print('Tokenized vocabulary\n{}'.format(list(totalvocab_tokenized)))
    print('Stemmed vocabulary\n{}'.format(list(totalvocab_stemmed)))

    # For each cluster, feature indices sorted by descending centroid weight.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    clusters_dropdown = Dropdown(
        options=list(range(n_clusters)),
        value=0,
        description='Cluster : ',
    )

    def on_trait_change(change):
        """Print the terms of the newly selected cluster, strongest first."""
        clear_output()
        cluster_terms = order_centroids[change['new']]
        print([terms[e] for e in cluster_terms])

    clusters_dropdown.observe(on_trait_change, names='value')
    display(clusters_dropdown)
    
     
# Run the clustering handler on every click of the "Cluster corpus" button.
cluster_button.on_click(click)

# Bare expression so the slider and the button are rendered as the cell output.
HBox([num_clusters_slider, cluster_button])


## Latent Dirichlet allocation

In [5]:
import string

import numpy as np
from nltk.tag import pos_tag
from gensim import corpora, models, similarities
from ipywidgets import Dropdown, HTML, VBox, Layout
from ipywidgets import IntSlider, HBox

from main import tokenize_and_stem

def strip_proppers(text):
    """Rebuild ``text`` from its all-lower-case tokens only.

    The text is split into sentences and then words with NLTK. Every token for
    which ``str.islower()`` is false is dropped (capitalised words, but also
    anything without a lower-case letter). The surviving tokens are joined
    with a single space, except that a token starting with an apostrophe or
    found in ``string.punctuation`` is attached directly to the previous one.

    Returns:
        The re-joined string with leading/trailing whitespace stripped.
    """
    pieces = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if not token.islower():
                continue
            glue_to_previous = token.startswith("'") or token in string.punctuation
            pieces.append(token if glue_to_previous else " " + token)
    return "".join(pieces).strip()

def strip_proppers_POS(text):
    """Return the words of ``text`` that are not tagged as proper nouns.

    ``text`` is split on whitespace and tagged with ``pos_tag``; words tagged
    'NNP' or 'NNPS' are left out.

    Returns:
        A list of the remaining words, in their original order.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    kept_words = []
    for word, tag in pos_tag(text.split()):
        if tag in proper_noun_tags:
            continue
        kept_words.append(word)
    return kept_words

# NOTE: `num_clusters_slider` is deliberately not re-created in this cell.
# The k-means click handler reads that module-level name when it runs, so
# rebinding it here to a new slider that is never displayed would disconnect
# the slider shown in the k-means cell from the clustering.

# Number of LDA topics requested by the user. The lower bound is 1 because a
# model with zero topics cannot be fitted and would leave the topic selector
# built in the click handler with no options.
num_topics_slider = IntSlider(
    value=5,
    min=1,
    max=10,
    step=1,
    description='Number of topics : '
)

lda_button = Button(description='Run LDA')

# English stopwords removed from the tokenized abstracts before fitting LDA.
stopwords = nltk.corpus.stopwords.words('english')

def click(b):
    """Fit an LDA topic model on the loaded abstracts and show a topic browser.

    Each abstract is passed through ``strip_proppers`` and
    ``tokenize_and_stem`` and has the module-level ``stopwords`` removed. A
    gensim dictionary and bag-of-words corpus are built from the result, an
    ``LdaModel`` is trained with the topic count from ``num_topics_slider``,
    and a dropdown is displayed that prints the weighted keywords of the
    selected topic.

    Args:
        b: the clicked button (unused).
    """
    # Read the slider once so the model and the dropdown always agree.
    num_topics = num_topics_slider.value

    if not abstracts:
        print('No documents loaded - import documents before running LDA.')
        return
    if num_topics < 1:
        print('Select at least one topic.')
        return

    # A set makes each stopword lookup constant-time instead of a list scan.
    stopword_set = set(stopwords)
    texts = [
        [word for word in tokenize_and_stem(strip_proppers(abstract))
         if word not in stopword_set]
        for abstract in abstracts
    ]

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=1, no_above=0.8)
    if len(dictionary) == 0:
        # Training on an empty vocabulary would fail inside gensim.
        print('No terms left after preprocessing - nothing to model.')
        return
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = models.LdaModel(corpus, num_topics=num_topics,
                          id2word=dictionary,
                          update_every=5,
                          chunksize=10000,
                          passes=100)

    topics_dropdown = Dropdown(
        options=list(range(num_topics)),
        value=0,
        description='Topic : ',
    )

    def on_trait_change(change):
        """Print the weighted keywords of the newly selected topic."""
        clear_output()

        # Look the topic up by id rather than by its position in the list
        # returned by show_topics(), which only covers a limited number of
        # topics by default.
        topic_description = lda.print_topic(change['new'])

        print('Weighted topics :\n{}'.format(topic_description))

    topics_dropdown.observe(on_trait_change, names='value')
    display(topics_dropdown)

# Run the LDA handler on every click of the "Run LDA" button.
lda_button.on_click(click)

# Bare expression so the slider and the button are rendered as the cell output.
HBox([num_topics_slider, lda_button])

In [6]:
from ipywidgets import Textarea, Button, HBox

# Free-text box for the comma-separated terms to strip from the corpus.
remove_terms_textarea = Textarea(
    description='Terms :',
    placeholder='term1, term2, term3, ...',
)

# Button that applies the removal to the loaded abstracts.
remove_button = Button(description='Remove from corpus')

def click(b):
    """Remove the terms typed in the text area from every loaded abstract.

    The text area content is read as a comma-separated list. Each term is
    trimmed of surrounding whitespace and empty entries are ignored, so
    "a,b", "a, b" and "a , b," all name the same two terms. Every occurrence
    of each term is deleted from each abstract by plain substring replacement
    (a term that occurs inside a longer word is removed from it too), and the
    module-level ``abstracts`` is rebound to the cleaned list.

    Args:
        b: the clicked button (unused).
    """
    global abstracts

    raw_terms = remove_terms_textarea.value.split(',')
    terms = [term.strip() for term in raw_terms]
    terms = [term for term in terms if term]

    def _without_terms(text):
        # Apply the removals in the order the terms were typed.
        for term in terms:
            text = text.replace(term, "")
        return text

    abstracts = [_without_terms(abstract) for abstract in abstracts]

# Run the removal handler on every click of the "Remove from corpus" button.
remove_button.on_click(click)
        
# Bare expression so the text area and the button are rendered as the cell output.
HBox([remove_terms_textarea, remove_button])