# Soft Clustering

## Import Required Packages

In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Read Data

In [2]:
ieee = pd.read_csv("Data/IEEE-Computer-Science-2017.csv")
ieee["Combined_text"] = ieee["Title"] + ieee["Abstract"] + ieee["Keywords"]
raw = ieee["Combined_text"]
raw = raw.dropna()

## Data Preprocessing

In [117]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string
#Tokenize the text
def tokenize(text):

    #Create Stemmer
    #stemmer = PorterStemmer()
    stemmer = WordNetLemmatizer()

    #Remove irrelevant character
    text = re.sub(r"[^a-zA-Z]", ' ', text)

    #Tokenization
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]

    #Stemming
    stems = stem_tokens(tokens, stemmer)
    return stems

#Stemming Function
def stem_tokens(t,s):
    stemmed=[]
    for item in t:
        # stemmed.append(s.stem(item))
        stemmed.append(s.lemmatize(item))
    return stemmed


# tfidf_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize, min_df=50)
max_feature = 800
tfidf_vectorizer_auto = TfidfVectorizer(stop_words='english', tokenizer=tokenize, max_df=500, max_features=600)
tfidf_data = tfidf_vectorizer_auto.fit_transform(raw)
print tfidf_data.shape
tfidf_data.toarray()



(1483, 600)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.04080061,  0.        ],
       [ 0.0625694 ,  0.        ,  0.        , ...,  0.04504443,
         0.0563764 ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.05041872, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Word2Vector

In [100]:
import gensim
from nltk import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

stemmer = WordNetLemmatizer()
# stemmer = PorterStemmer()
sentences = []
paper_list = []
for paper in raw:
    word_list = []
    for sent in sent_tokenize(paper.decode('utf-8')):
        words = word_tokenize(sent)
        # sentences.append([stemmer.stem(word.lower()) for word in words if word not in string.punctuation])
        sentences.append([stemmer.lemmatize(word.lower()) for word in words if word not in string.punctuation and word not in stops and not word.isdigit()])
        word_list += words
    paper_list.append(word_list)
word2vec_model = gensim.models.Word2Vec(sentences, size=400, window=6, min_count=20, workers=4, iter=1000)


In [101]:


for i in range(len(paper_list)):
    new_list = []
    for word in paper_list[i]:
        if word.lower() not in stops:
            new_list.append(word)
    paper_list[i] = new_list

In [102]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = tokenize)
tfidf_data = tfidf_vectorizer.fit_transform(raw)

In [103]:
paper_vector = []
paper_count = 0
for paper in paper_list:
    vector = np.zeros(400)
    diviser = 0
    for word in paper:
        if word in word2vec_model and word in tfidf_vectorizer.vocabulary_:
            tfidf_value = tfidf_data[paper_count,tfidf_vectorizer.vocabulary_[word]]
            diviser += tfidf_value
            vector += word2vec_model[word]*tfidf_value
    paper_vector.append(vector / diviser)
    paper_count += 1

  import sys
  # Remove the CWD from sys.path while we load stuff.


In [104]:
data_word2vec = np.array(paper_vector)
data_word2vec.shape

(1483, 400)

## Autoencoder

In [53]:
import pylab
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import os



In [54]:
def init_weights(n_visible, n_hidden):
    initial_W = np.asarray(
        np.random.uniform(
            low=-4 * np.sqrt(6. / (n_hidden + n_visible)),
            high=4 * np.sqrt(6. / (n_hidden + n_visible)),
            size=(n_visible, n_hidden)),
        dtype=theano.config.floatX)
    return theano.shared(value=initial_W, name='W', borrow=True)

def init_bias(n):
    return theano.shared(value=np.zeros(n,dtype=theano.config.floatX),borrow=True)

In [123]:
x = T.fmatrix('x')  
d = T.fmatrix('d')


rng = np.random.RandomState(123)
theano_rng = RandomStreams(rng.randint(2 ** 30))


training_epochs = 200
learning_rate = 0.1
batch_size = 16
first_dimension = min(50,int(max_feature*0.7))
# second_dimension = 50

W1 = init_weights(tfidf_data.shape[1], first_dimension)
b1 = init_bias(first_dimension)
b1_prime = init_bias(tfidf_data.shape[1])
W1_prime = W1.transpose() 
# W2 = init_weights(first_dimension, second_dimension)
# b2 = init_bias(second_dimension)
# W2_prime = W2.transpose()
# b2_prime = init_bias(first_dimension)

y1 = T.nnet.sigmoid(T.dot(x, W1) + b1)
# y2 = T.nnet.sigmoid(T.dot(y1,W2) + b2)
# z2 = T.nnet.sigmoid(T.dot(y2, W2_prime) + b2_prime)
z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
cost1 = T.sum((x-z1)**2)

params1 = [W1, b1, b1_prime]
grads1 = T.grad(cost1, params1)
updates1 = [(param1, param1 - learning_rate * grad1)
           for param1, grad1 in zip(params1, grads1)]
train_da1 = theano.function(inputs=[x], outputs = cost1, updates = updates1, allow_input_downcast = True)
test = theano.function(inputs=[x], outputs = y1, allow_input_downcast = True)
upsampling = theano.function(inputs=[y1], outputs = z1, allow_input_downcast = True)

In [None]:
print('training dae1 ...')
d = []
for epoch in range(training_epochs):
    # go through trainng set
    c = []
    for start, end in zip(range(0, tfidf_data.shape[0], batch_size), range(batch_size, tfidf_data.shape[0], batch_size)):
        c.append(train_da1(tfidf_data.toarray()[start:end]))
    d.append(np.mean(c, dtype='float64') / batch_size)
    print(d[epoch])

training dae1 ...
2.81372889943
0.938533686517
0.935897322708
0.934292089702
0.932874402311
0.931305809726
0.929279202763
0.926574815161
0.923542339562
0.92093263833
0.918838786384
0.916903161026
0.91482881625
0.912435098219
0.909649132539
0.906519033168
0.903150249281
0.899569825896
0.895685512164
0.891494068704
0.887245851225
0.883058538235
0.878887300092
0.874775939124
0.870751638274
0.866745154619
0.862680453923
0.858520306262
0.854261541528
0.850041971324
0.846087405964
0.842436761954
0.838972522607
0.835582436528
0.832209233066
0.82883555787
0.825462372455
0.822096953161
0.81874764056
0.815418283878
0.812101128564
0.808771857753
0.805389413455
0.801900476543
0.798250639758
0.794406312542


In [66]:
data = test(tfidf_data.toarray())
print data

[[ 0.79119289  0.21104235  0.55880483 ...,  0.70685674  0.02187131
   0.26534901]
 [ 0.52292467  0.62275458  0.40969103 ...,  0.27858279  0.01665859
   0.67977598]
 [ 0.7399837   0.38220733  0.45604365 ...,  0.13080241  0.01343907
   0.42152503]
 ..., 
 [ 0.40882439  0.68291476  0.30476679 ...,  0.49719277  0.01499242
   0.39519157]
 [ 0.43642496  0.44110755  0.29942032 ...,  0.44078169  0.0124258
   0.47495906]
 [ 0.50832936  0.28121095  0.16326793 ...,  0.23035646  0.00860837
   0.28306392]]


## Topic Tree Class

In [105]:
import requests
class TopicTree:
    def __init__(self, level=0, index=1, children=[], keywords=[], n_top_words=10):
        self.children = children
        self.keywords = keywords
        self.level = level
        self.index = index
        self.dup_index = 0
        self.association_index = 0
        self.base_url = 'https://api.wordassociations.net/associations/v1.0/json/search?apikey=85618aa1-21c2-4382-9dc6-373c5b1424b8&lang=en&limit=50'
        self.n_top_words = n_top_words
    
    def get_number_of_children(self):
        return len(self.children)

    def get_children(self):
        return children
    
    def get_topic_content(self):
        return self.topic_content

    def compute_dup_index(self):
        if not self.children:
            return
        children_keywords = []
        for child in self.children:
            children_keywords += child.keywords
            child.compute_dup_index()
        original_len = len(children_keywords)
        children_keywords = list(set(children_keywords))
        num_of_duplicate = original_len - len(children_keywords)
        self.dup_index = float(num_of_duplicate) / original_len
        
    def get_whole_tree_dup_index(self):
        if self.dup_index == 0:
            self.compute_dup_index()
        if not self.children:
            return self.dup_index
        children_index = 0
        for child in self.children:
            children_index += child.dup_index
        children_index = children_index / len(self.children)
        return (self.dup_index + children_index) / 2
    
    def compute_association_index(self):
        base_url = self.base_url
        word_sum = 0
        if len(self.keywords) <= 10:
            for word in self.keywords:
                base_url = base_url + '&text=' + word
            response = requests.get(base_url).json()['response']
        else:
            for word in self.keywords[:10]:
                base_url = base_url + '&text=' + word
            response = requests.get(base_url).json()['response']
            base_url = self.base_url
            for word in self.keywords[10:]:
                base_url = base_url + '&text=' + word
            response = response + requests.get(base_url).json()['response']
        for word_json in response:
            items = word_json['items']
            for item in items:
                if item['item'] in self.keywords:
                    word_sum += int(item['weight'])
        self.association_index = float(word_sum) / self.n_top_words / (self.n_top_words - 1)
    
    def get_whole_tree_association_index(self):
        if self.association_index == 0:
            self.compute_association_index()
        if not self.children:
            return self.association_index
        children_index = 0
        for child in self.children:
            children_index += child.association_index
        children_index = children_index / len(self.children)
        return (self.dup_index + children_index) / 2
            
            

## Hierarchical Soft Clustering Class

In [68]:
class FuzzyClustering:
    
    def __init__(self,weight_vector, dictionary, epochs=10, num_topic_range=10,m=2, word2vec=False):
        self.epochs = epochs
        self.num_topic_range = num_topic_range
        self.m = m
        self.pre_cost = 100000000
        self.weight_vector = weight_vector
        self.dictionary = dictionary
        self.word2vec = word2vec
        
    def get_silhouette_coefficient(self):
        num_doc = self.data.shape[0]
        num_topic = self.doc_topic_weight_mat.shape[1]
        topic_format_data_list=[]
        for topic_index in range(num_topic):
            topic_format_data_list.append(((self.data.T)*self.doc_topic_weight_mat[:,topic_index]).T)  
        sil_coe_all = 0
        
    
        for doc_index in range(num_doc):
            for topic_index in range(num_topic):
                doc = topic_format_data_list[topic_index][doc_index]
                a = np.mean(np.linalg.norm(topic_format_data_list[topic_index] - doc, axis=1))
                b = np.finfo(float).max
                sil_coe = -1
                for topic_index2 in range(num_topic):
                    if topic_index2 != topic_index:
                        b = min(b, np.mean(np.linalg.norm(topic_format_data_list[topic_index2] - doc, axis=1)))
                sil_coe = max(sil_coe, (b - a) / max(a,b))
            sil_coe_all += sil_coe
        return sil_coe_all / num_doc
    
    def get_MPC(self):
        num_doc = self.doc_topic_weight_mat.shape[0]
        num_topic = self.doc_topic_weight_mat.shape[1]
        pc = np.sum(np.square(self.doc_topic_weight_mat)) / num_doc
        if num_topic == 1:
            mpc = 0.90
        else:
            mpc = 1 - float(num_topic) / (num_topic - 1) * (1 - pc)
        return mpc
    
    def do_clustering(self,doc_word_mat, num_cluster=0, num_topic=10, exp=False):
        num_doc = doc_word_mat.shape[0]
        doc_topic_weight_mat = self.initialize_weight_mat(num_doc,num_topic)
        centroids = self.initialize_centroid(doc_word_mat, num_topic)
        self.pre_cost = 1000000000
        for epoch in range(self.epochs):
            doc_topic_weight_mat = self.update_weight_mat(doc_word_mat, doc_topic_weight_mat, centroids, exp) 
            cost = self.get_cost(doc_word_mat, doc_topic_weight_mat, centroids)
            # print cost
            if cost > self.pre_cost:
                self.data = doc_word_mat
                self.doc_topic_weight_mat = doc_topic_weight_mat
                self.centroids = centroids
                return doc_topic_weight_mat, centroids
            self.pre_cost = cost
            centroids = self.compute_centroid(doc_word_mat, doc_topic_weight_mat)
        # doc_topic_weight_mat = (doc_topic_weight_mat.T / doc_topic_weight_mat.sum(axis=1)).T
        self.data = doc_word_mat
        self.doc_topic_weight_mat = doc_topic_weight_mat
        self.centroids = centroids
        return doc_topic_weight_mat, centroids
                
        
    def compute_centroid(self, doc_word_mat, doc_topic_weight_mat):
        num_cen = doc_topic_weight_mat.shape[1]
        num_word = doc_word_mat.shape[1]
        
        centroids = np.zeros((num_cen, num_word))
        for cen_index in range(num_cen):
            centroids[cen_index] = np.dot(np.power(doc_topic_weight_mat.T[cen_index], self.m)*self.weight_vector, doc_word_mat) / np.sum(np.power(doc_topic_weight_mat.T[cen_index],self.m)*self.weight_vector)
        return centroids
    
    def initialize_centroid(self, doc_word_mat, num_topic):
        return doc_word_mat[np.random.randint(0,doc_word_mat.shape[0],num_topic),:] + 0.0001
    
    def initialize_weight_mat(self,row,col):
        # return np.random.uniform(low=0, high=2.0/col, size=(row,col))
        return np.full((row,col),1.0/col, dtype=float)
    
    def update_weight_mat(self,doc_word_mat, doc_topic_weight_mat, centroids, exp=False):
        num_doc = doc_topic_weight_mat.shape[0]
        num_topic = doc_topic_weight_mat.shape[1]
        
        if exp:
            doc_topic_distance = np.zeros((num_doc, num_topic))
            for topic_index in range(num_topic):
                Fk = np.zeros((doc_word_mat.shape[1], doc_word_mat.shape[1]))
                Fk_diviser = np.sum(doc_topic_weight_mat[:,topic_index]*self.weight_vector)
                
                for doc_index in range(num_doc):
                    Fk += doc_topic_weight_mat[doc_index][topic_index]*self.weight_vector[doc_index]*np.outer(centroids[topic_index] - doc_word_mat[doc_index], centroids[topic_index] - doc_word_mat[doc_index])
                Fk = Fk/Fk_diviser
                Fk_determiner = np.linalg.det(Fk)**0.5
                Fk_inverse = np.linalg.inv(Fk)
                ak = np.sum(doc_topic_weight_mat[:,topic_index]) / np.sum(self.weight_vector)
                for doc_index in range(num_doc):
                    difference = centroids[topic_index] - doc_word_mat[doc_index]
                    doc_topic_distance[doc_index][topic_index] = Fk_determiner / ak*np.exp(np.dot(np.dot(difference, Fk_inverse),difference/2))
                    
            
            for doc_index in range(num_doc):
                dist_to_all_clusters = np.sum(doc_topic_distance[doc_index]) + np.finfo(float).eps
                for topic_index in range(num_topic):
                    doc_topic_weight_mat[doc_index][topic_index] = np.power(doc_topic_distance[doc_index][topic_index] / dist_to_all_clusters, 2.0/(self.m-1))
            
        else:
            for doc_index in range(num_doc):
                dist_to_all_cluster = np.linalg.norm(centroids - doc_word_mat[doc_index], axis=1)

                new_weight = np.zeros(num_topic)
                for topic_index in range(num_topic):

                    new_weight += np.power(dist_to_all_cluster / (np.linalg.norm(centroids[topic_index] - doc_word_mat[doc_index])+np.finfo(float).eps),2.0/(self.m-1))
                    # new_weight += np.power(dist_to_all_cluster / (1-np.sum(centroids[topic_index] * doc_word_mat[doc_index])/ np.linalg.norm(centroids[topic_index]) / np.linalg.norm(doc_word_mat[doc_index])),2/(self.m-1))

                doc_topic_weight_mat[doc_index] = 1.0 / new_weight
           
        
        return doc_topic_weight_mat
    
    def get_cost(self, doc_word_mat, doc_topic_weight_mat, centroids):
        cost = 0
        num_doc = doc_topic_weight_mat.shape[0]
        num_topic = doc_topic_weight_mat.shape[1]
        
        for doc_index in range(num_doc):
            for topic_index in range(num_topic):
                cost += doc_topic_weight_mat[doc_index, topic_index]**self.m *self.weight_vector[doc_index]* np.linalg.norm(centroids[topic_index] - doc_word_mat[doc_index])
                # cost += doc_topic_weight_mat[doc_index, topic_index]**self.m * (1-np.sum(centroids[topic_index] * doc_word_mat[doc_index])/ np.linalg.norm(centroids[topic_index]) / np.linalg.norm(doc_word_mat[doc_index]))
        return cost

    def print_top_words(self,n_top_words, topics, feature_names):
        topic_idx = 0
        if self.word2vec:
            for topic in topics:
                topic_words = [x[0] for x in word2vec_model.wv.similar_by_vector(topic, topn=n_top_words)]
                # topic_words.sort()
                topic_idx += 1
                message = "Topic #%d: " % topic_idx
                message += " ".join(topic_words)
                print message
        else:
            topics = upsampling(topics)
            for topic in topics:
                topic_idx += 1
                message = "Topic #%d: " % topic_idx
                topic_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
                message += " ".join(topic_words)
                print message
    
    def get_top_words(self, level, n_top_words, topics, feature_names):
        children = []
        topic_idx = 0
        if self.word2vec:
            for topic in topics:
                topic_words = [x[0] for x in word2vec_model.wv.similar_by_vector(topic, topn=n_top_words)]
                topic_idx += 1
                children.append(TopicTree(level=level, index=topic_idx, keywords=topic_words, n_top_words=n_top_words))
                
        else:
            topics = upsampling(topics)
            for topic in topics:
                topic_idx += 1
                topic_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
                children.append(TopicTree(level=level, index=topic_idx, keywords=topic_words, n_top_words=n_top_words))
        return children
                

In [50]:
class Hierarchical_Fuzzy_Clustering:
    def __init__(self,dictionary, max_level=3,epochs=10, m=2, max_topic=5, n_top_words=10):
        self.dictionary = dictionary
        self.max_level = max_level
        self.m = m
        self.max_topic = max_topic
        self.epochs = epochs
        self.topic_tree = TopicTree(level=1, index=1)
        self.current_tree = self.topic_tree
        self.n_top_words = n_top_words
    
    def do_hierarchical_clustering(self,data, weight_vector, level=1,topic_index=1, word2vec=False):
        
        if level==1:
            self.build_root(data, word2vec)
            
        if level > self.max_level:
            return
        
        print "\nlevel %d topic %d" %(level, topic_index)
        model = FuzzyClustering(dictionary=self.dictionary, weight_vector=weight_vector, epochs=self.epochs, m=self.m, word2vec=word2vec)
        target_MPC = 0
        target_doc_topic_weight_matrix = np.array([])
        target_topics = np.array([])
        for num_topic in range(1,self.max_topic+1): 
            doc_topic_weight_matrix, topics = model.do_clustering(data, num_topic=num_topic)
            MPC = model.get_MPC()
            if MPC > target_MPC:
                target_MPC = MPC
                target_doc_topic_weight_matrix = doc_topic_weight_matrix
                target_topics = topics
            
        num_topics = target_doc_topic_weight_matrix.shape[1]
        print "Detect %d topics" %num_topics
        model.print_top_words(self.n_top_words, target_topics, self.dictionary)
        self.current_tree.children = model.get_top_words(level + 1, self.n_top_words, target_topics, self.dictionary)
        
        children = self.current_tree.children
        if num_topics > 1:
            for topic_index in range(num_topics):
                self.current_tree = children[topic_index]
                self.do_hierarchical_clustering(data, target_doc_topic_weight_matrix.T[topic_index], level=level+1, topic_index = topic_index+1, word2vec=word2vec)
    

    def build_root(self, data, word2vec=False):
        print "\nlevel 0"
        message = "#Topic 1: " 
        if word2vec:
            centroids = np.mean(data, axis=0)
            topic_words = [x[0] for x in word2vec_model.wv.similar_by_vector(centroids, topn=20)]
            # topic_words.sort()


        else:
            centroids = np.mean(data,axis=0)
            centroids = upsampling([centroids])
            feature_names = self.dictionary
            topic_words = [feature_names[i]
                                 for i in centroids[0].argsort()[:20]]

        self.current_tree.keywords = topic_words
        message += " ".join(topic_words)   
        print message

## Hierarchical Clustering on Autoencoder

In [112]:
auto_model = Hierarchical_Fuzzy_Clustering(dictionary=tfidf_vectorizer_auto.get_feature_names(), m=1.01,max_level=2, max_topic=10, n_top_words=20)
auto_model.do_hierarchical_clustering(data, weight_vector=np.zeros(data.shape[0])+1, word2vec = False)


level 0
#Topic 1: concept proposes example obtain focus iot comparison traditional respectively fault effectively mimo called lower dictionary benefit relay outperforms color antenna

level 1 topic 1
Detect 8 topics
Topic #1: optimization problem solution approach objective optimal function distributed analysis design vehicle parameter network dynamic strategy framework learning set process power
Topic #2: image feature learning camera approach quality human extraction object training classifier task sparse database information representation imaging framework different analysis
Topic #3: control power stability controller dynamic nonlinear state distributed analysis grid optimal vehicle time problem design output approach uncertainty attack process
Topic #4: information task user network social traffic framework analysis human real approach large cluster learning search video propose computing modeling mobile
Topic #5: high frequency circuit power bandwidth low measurement current mo

Detect 3 topics
Topic #1: network wireless energy node sensor communication routing transmission protocol topology scheme design mobile distributed throughput monitoring optimal receiver traffic problem
Topic #2: interference receiver scheme user access channel rate multiple wireless radio transmission network throughput communication signal spectrum power allocation layer ratio
Topic #3: d network user game cellular communication interference resource device cell framework allocation power energy multiple information strategy management wireless transmission


In [113]:
auto_model.topic_tree.get_whole_tree_dup_index()

0.37144097222222217

In [114]:
auto_model.topic_tree.get_whole_tree_association_index()

0.1625

## Hierarchical Clustering on Word2Vec

In [108]:
word2_model = Hierarchical_Fuzzy_Clustering(dictionary=tfidf_vectorizer.get_feature_names(), m=1.01,max_level=2, max_topic=10, n_top_words=20)
word2_model.do_hierarchical_clustering(data_word2vec, weight_vector=np.zeros(data_word2vec.shape[0])+1, word2vec = True)


level 0
#Topic 1: control power system network communication energy resource architecture noise wireless load estimation performance voltage optimization computing harvesting distributed algorithm processor

level 1 topic 1
Detect 8 topics
Topic #1: control nonlinear adaptive stability controller feedback system multiagent dynamic consensus disturbance fuzzy dynamical event-triggered uncertain time-varying synchronization optimal closed-loop distributed
Topic #2: image feature learning recognition method dictionary color visual sparse saliency classification face training descriptor segmentation representation multimodal object reconstruction discriminative
Topic #3: localization indoor sleep positioning monitoring fall wearable tracking health sensor navigation location activity malware healthcare tactile radar detection phone anomaly
Topic #4: mining big set partitioning data analysis leverage semantic view city semantics literature social privacy-preserving leading variety learning

Detect 10 topics
Topic #1: trust identity social risk secure negotiation data-driven infrastructure worker cloud malicious match providing pricing program security incentive effective service resource
Topic #2: smart software education emotion internet cloud service big computing application future architecture mobile crowdsourcing broadband data science iot technology engineering
Topic #3: adversary data effectively electricity measured request without theoretical injection manual eavesdropper utility packet thing jamming under malicious information tradeoff mobility
Topic #4: topic social data importance interaction issue business visual transformed blind understanding discovery activity people object biological identified perspective brain information
Topic #5: cell required array association histogram port approximated interface semantics road create percent validation at cellular scale term volume fabrication job
Topic #6: diagnosis fault motor disease patient driver extended micr

In [109]:
word2_model.topic_tree.get_whole_tree_dup_index()

0.07322916666666666

In [110]:
word2_model.topic_tree.get_whole_tree_association_index()

0.028125

In [43]:
word2_model.topic_tree.association_index

0.0

## Check Cluster Validity Index

In [138]:
model = FuzzyClustering(dictionary = tfidf_vectorizer.get_feature_names(), weight_vector=np.zeros(data_word2vec.shape[0])+1, epochs=50, m=1.04)
for i in range(1,5):
    print i
    a, topics_word2vec = model.do_clustering(data_word2vec,num_topic=i)
    print model.get_silhouette_coefficient()
    print model.get_MPC()

1
1.0
0.9
2
0.128502600246
0.971036468703
3
-0.0823058320283
0.952725751718
4
-0.395483622351
0.960163612987


## Word Association API

In [20]:
import requests
base_url = 'https://api.wordassociations.net/associations/v1.0/json/search?apikey=85618aa1-21c2-4382-9dc6-373c5b1424b8&text=%s&lang=en&limit=10'
print requests.get(base_url % 'shit').json()['response'][0]['items']

[{u'item': u'Fuck', u'pos': u'verb', u'weight': 100}, {u'item': u'Pant', u'pos': u'noun', u'weight': 87}, {u'item': u'Clete', u'pos': u'noun', u'weight': 67}, {u'item': u'Brick', u'pos': u'noun', u'weight': 66}, {u'item': u'Bastard', u'pos': u'noun', u'weight': 65}, {u'item': u'Bastard', u'pos': u'adjective', u'weight': 62}, {u'item': u'Ass', u'pos': u'noun', u'weight': 59}, {u'item': u'Scared', u'pos': u'adjective', u'weight': 58}, {u'item': u'Eating', u'pos': u'adjective', u'weight': 55}, {u'item': u'Sitter', u'pos': u'noun', u'weight': 52}]


In [None]:
import gensim
from nltk import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string
import pylab
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import os


class DimensionReductionData:
    def __init__(self,raw_data):
        self.raw_data = raw_data,
        self.word2vec_model = None,
        self.word2vec_data = None,
        self.auto_data = None,
        self.tfidf_vectorizer_word2vec = None,
        self.tfidf_data_word2vec = None,
        self.tfidf_vectorizer_auto = None,
        self.tfidf_data_auto = None,
        self.test_auto = None,
        self.train_auto = None,
        self.upsampling_auto = None,
    
    def build_word2vec(self, n_iter=1000, window_size=5, dimension=600):
        stops = set(stopwords.words('english'))
        stemmer = WordNetLemmatizer()
        # stemmer = PorterStemmer()
        sentences = []
        paper_list = []
        for paper in raw:
            word_list = []
            for sent in sent_tokenize(paper.decode('utf-8')):
                words = word_tokenize(sent)
                # sentences.append([stemmer.stem(word.lower()) for word in words if word not in string.punctuation])
                sentences.append([stemmer.lemmatize(word.lower()) for word in words if word not in string.punctuation and word not in stops and not word.isdigit()])
                word_list += words
            paper_list.append(word_list)
        self.word2vec_model = gensim.models.Word2Vec(sentences, size=400, window=6, min_count=20, workers=4, iter=1000)
        for i in range(len(paper_list)):
            new_list = []
            for word in paper_list[i]:
                if word.lower() not in stops:
                    new_list.append(word)
            paper_list[i] = new_list
        self.tfidf_vectorizer_word2vec = TfidfVectorizer(tokenizer = tokenize)
        tfidf_data_word2vec = tfidf_vectorizer_word2vec.fit_transform(raw)
        
        paper_vector = []
        paper_count = 0
        for paper in paper_list:
            vector = np.zeros(400)
            diviser = 0
            for word in paper:
                if word in word2vec_model and word in tfidf_vectorizer.vocabulary_:
                    tfidf_value = tfidf_data[paper_count,tfidf_vectorizer.vocabulary_[word]]
                    diviser += tfidf_value
                    vector += word2vec_model[word]*tfidf_value
            paper_vector.append(vector / diviser)
            paper_count += 1
        word2vec_data = np.array(paper_vector)
    
    def build_auto(self, n_iter=200, dimension=600, learning_rate=0.1, batch_size=16):
        self.tfidf_vectorizer_auto = TfidfVectorizer(stop_words='english', tokenizer=tokenize, max_df=500, max_features=dimension)
        self.tfidf_data_auto = self.tfidf_vectorizer_auto.fit_transform(raw)
        self.create_network(n_iter=n_iter)
    
        print('training dae1 ...')
        d = []
        for epoch in range(n_iter):
            # go through trainng set
            c = []
            for start, end in zip(range(0, self.tfidf_data_auto.shape[0], batch_size), range(batch_size, self.tfidf_data_auto.shape[0], batch_size)):
                c.append(self.train_auto(self.tfidf_data_auto.toarray()[start:end]))
            d.append(np.mean(c, dtype='float64') / batch_size)
            print(d[epoch])
        self.auto_data = test(self.tfidf_data_auto.toarray())
        
    def tokenize(self, text):

        #Create Stemmer
        #stemmer = PorterStemmer()
        stemmer = WordNetLemmatizer()

        #Remove irrelevant character
        text = re.sub(r"[^a-zA-Z]", ' ', text)

        #Tokenization
        tokens = word_tokenize(text)
        tokens = [i for i in tokens if i not in string.punctuation]

        #Stemming
        stems = self.stem_tokens(tokens, stemmer)
        return stems

    #Stemming Function
    def stem_tokens(self, t,s):
        stemmed=[]
        for item in t:
            # stemmed.append(s.stem(item))
            stemmed.append(s.lemmatize(item))
        return stemmed

    def init_weights(self, n_visible, n_hidden):
    initial_W = np.asarray(
        np.random.uniform(
            low=-4 * np.sqrt(6. / (n_hidden + n_visible)),
            high=4 * np.sqrt(6. / (n_hidden + n_visible)),
            size=(n_visible, n_hidden)),
        dtype=theano.config.floatX)
    return theano.shared(value=initial_W, name='W', borrow=True)

    def init_bias(self, n):
        return theano.shared(value=np.zeros(n,dtype=theano.config.floatX),borrow=True)
    
    def create_auto(self, n_iter):
        x = T.fmatrix('x')  
        d = T.fmatrix('d')

        rng = np.random.RandomState(123)
        theano_rng = RandomStreams(rng.randint(2 ** 30))

        training_epochs = n_iter
        learning_rate = 0.1
        batch_size = 16
        first_dimension = min(50,int(self.tfidf_data_auto.shape[1]*0.7))
        # second_dimension = 50

        W1 = init_weights(self.tfidf_data_auto.shape[1], first_dimension)
        b1 = init_bias(first_dimension)
        b1_prime = init_bias(self.tfidf_data_auto.shape[1])
        W1_prime = W1.transpose() 
        # W2 = init_weights(first_dimension, second_dimension)
        # b2 = init_bias(second_dimension)
        # W2_prime = W2.transpose()
        # b2_prime = init_bias(first_dimension)

        y1 = T.nnet.sigmoid(T.dot(x, W1) + b1)
        # y2 = T.nnet.sigmoid(T.dot(y1,W2) + b2)
        # z2 = T.nnet.sigmoid(T.dot(y2, W2_prime) + b2_prime)
        z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
        cost1 = T.sum((x-z1)**2)

        params1 = [W1, b1, b1_prime]
        grads1 = T.grad(cost1, params1)
        updates1 = [(param1, param1 - learning_rate * grad1)
                   for param1, grad1 in zip(params1, grads1)]
        self.train_auto = theano.function(inputs=[x], outputs = cost1, updates = updates1, allow_input_downcast = True)
        self.test_auto = theano.function(inputs=[x], outputs = y1, allow_input_downcast = True)
        self.upsampling_auto = theano.function(inputs=[y1], outputs = z1, allow_input_downcast = True)