In [1]:
from bs4 import BeautifulSoup
import bleach
import requests

# Wikipedia scraper from
# http://www.gyford.com/phil/writing/2015/03/25/wikipedia-parsing.php

class WikipediaFetcher(object):

    def fetch(self, page_name):
        """
        Passed a Wikipedia page's URL fragment, like
        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
        main contents, tidy the HTML, strip out any elements we don't want
        and return the final HTML string.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
        """
        result = self._get_html(page_name)
        
        if result['success']:
            result['content'] = self._tidy_html(result['content'])
            
        return result

    
    def _get_html(self, page_name):
        """
        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
        the HTML content (not the entire HTML page) and returns it.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
        """
        error_message = ''

        url = 'https:%s' % page_name

        try:
            response = requests.get(url, params={'action':'render'}, timeout=5)
        except requests.exceptions.ConnectionError as e:
            error_message = "Can't connect to domain."
        except requests.exceptions.Timeout as e:
            error_message = "Connection timed out."
        except requests.exceptions.TooManyRedirects as e:
            error_message = "Too many redirects."

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            # 4xx or 5xx errors:
            error_message = "HTTP Error: %s" % response.status_code
        except NameError:
            if error_message == '':
                error_message = "Something unusual went wrong."

        if error_message:
            return {'success': False, 'content': error_message} 
        else:
            return {'success': True, 'content': response.text}

    def _tidy_html(self, html):
        """
        Passed the raw Wikipedia HTML, this returns valid HTML, with all
        disallowed elements stripped out.
        """
        #html = self._bleach_html(html)
        #html = self._strip_html(html)
        return html

    def _bleach_html(self, html):
        """
        Ensures we have valid HTML; no unclosed or mis-nested tags.
        Removes any tags and attributes we don't want to let through.
        Doesn't remove the contents of any disallowed tags.

        Pass it an HTML string, it'll return the bleached HTML string.
        """

        # Pretty much most elements, but no forms or audio/video.
        allowed_tags = [
            'a', 'abbr', 'acronym', 'address', 'area', 'article',
            'b', 'blockquote', 'br',
            'caption', 'cite', 'code', 'col', 'colgroup',
            'dd', 'del', 'dfn', 'div', 'dl', 'dt',
            'em',
            'figcaption', 'figure', 'footer',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr',
            'i', 'img', 'ins',
            'kbd',
            'li',
            'map',
            'nav',
            'ol',
            'p', 'pre',
            'q',
            's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'sup',
            'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr',
            'ul',
            'var',
        ]

        # These attributes will be removed from any of the allowed tags.
        allowed_attributes = {
            '*':        ['class', 'id'],
            'a':        ['href', 'title'],
            'abbr':     ['title'],
            'acronym':  ['title'],
            'img':      ['alt', 'src', 'srcset'],
            # Ugh. Don't know why this page doesn't use .tright like others
            # http://127.0.0.1:8000/encyclopedia/5040/
            'table':    ['align'],
            'td':       ['colspan', 'rowspan'],
            'th':       ['colspan', 'rowspan', 'scope'],
        }

        return bleach.clean(html, tags=allowed_tags,
                                    attributes=allowed_attributes, strip=True)

    def _strip_html(self, html):
        """
        Takes out any tags, and their contents, that we don't want at all.
        And adds custom classes to existing tags (so we can apply CSS styles
        without having to multiply our CSS).

        Pass it an HTML string, it returns the stripped HTML string.
        """

        # CSS selectors. Strip these and their contents.
        selectors = [
            'div.hatnote',
            'div.navbar.mini', # Will also match div.mini.navbar
            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
            'div.topicon',
            'a.mw-headline-anchor',
        ]

        # Strip any element that has one of these classes.
        classes = [
            # "This article may be expanded with text translated from..."
            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
            'ambox-notice',
            'magnify',
            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
            'mediaContainer',
            'navbox',
            'noprint',
        ]

        # Any element has a class matching a key, it will have the classes
        # in the value added.
        add_classes = {
            # Give these tables standard Bootstrap styles.
            'infobox':   ['table', 'table-bordered'],
            'ambox':     ['table', 'table-bordered'],
            'wikitable': ['table', 'table-bordered'],
        } 

        soup = BeautifulSoup(html)

        for selector in selectors:
            [tag.decompose() for tag in soup.select(selector)]

        for clss in classes:
            [tag.decompose() for tag in soup.find_all(attrs={'class':clss})]

        for clss, new_classes in add_classes.iteritems():
            for tag in soup.find_all(attrs={'class':clss}):
                tag['class'] = tag.get('class', []) + new_classes

        # Depending on the HTML parser BeautifulSoup used, soup may have
        # surrounding <html><body></body></html> or just <body></body> tags.
        if soup.body:
            soup = soup.body
        elif soup.html:
            soup = soup.html.body

        # Put the content back into a string.
        html = ''.join(str(tag) for tag in soup.contents)

        return html

In [2]:
myScraper = WikipediaFetcher()

original_sources = ["//en.wikipedia.org/wiki/List_of_machine_learning_concepts"
                    ,"//en.wikipedia.org/wiki/Data_mining"
                    ,"//en.wikipedia.org/wiki/Machine_learning"]

links = []
for page in original_sources:
    scraped_article = myScraper.fetch(page_name=page)
    if scraped_article['success'] == True:
        soup = BeautifulSoup(scraped_article['content'], 'html.parser')

        for a in soup.find_all('a', href=True):
            links.append(a['href'])
    else:
        print("failed")

In [3]:
import re 

links = list(set(links))

good_links = []
for link in links:
    if not(re.search('#|Special|php', link) or not re.search('wikipedia', link)):
        good_links.append(link)
        print str(link)

//en.wikipedia.org/wiki/Theoretical_computer_science
//en.wikipedia.org/wiki/Operational_data_store
//en.wikipedia.org/wiki/Security_service_(telecommunication)
//en.wikipedia.org/wiki/DBSCAN
//en.wikipedia.org/wiki/AODE
//en.wikipedia.org/wiki/Deeplearning4j
//en.wikipedia.org/wiki/Analysis_of_algorithms
//en.wikipedia.org/wiki/Robert_Tibshirani
//en.wikipedia.org/wiki/Handwriting_recognition
//en.wikipedia.org/wiki/Natural_selection
//en.wikipedia.org/wiki/Software_configuration_management
//en.wikipedia.org/wiki/Cognitive_model
//en.wikipedia.org/wiki/Hidden_Markov_model
//en.wikipedia.org/wiki/Machine_ethics
//en.wikipedia.org/wiki/Tanagra_(machine_learning)
//en.wikipedia.org/wiki/Intrusion_detection_system
//en.wikipedia.org/wiki/Naive_Bayes_classifier
//en.wikipedia.org/wiki/Mutation_(genetic_algorithm)
//en.wikipedia.org/wiki/Information_extraction
//en.wikipedia.org/wiki/Business_intelligence
//en.wikipedia.org/wiki/Q-learning
//en.wikipedia.org/wiki/Nearest_neighbor_(pattern_

In [4]:
document_set = {}

# Scrape each article
for article in good_links:
    myScraper = WikipediaFetcher()
    scraped_article = myScraper.fetch(page_name=article)
    
    # If succesfully scraped
    if scraped_article['success'] == True:
        
        # Use beautiful soup to parse HTML
        soup = BeautifulSoup(scraped_article['content'], 'html.parser')
        
        # FInd all p tags which seem to contain the body of the articles
        get_tags = soup.find_all(['p'])        
        
        document_text = ""        
        # For each tag found, strip extra characters and append to a document text string
        for tag in get_tags:    
            document_text = document_text + " " + (''.join(tag.findAll(text=True))).strip()                
        
        # If the document is of sufficient size
        if len(document_text) > 1000:                
            document_set[article]  = document_text
        else:
            print "----------------------------------------------------------------"
            print article + " -- was too short: "
            print ""
            print document_text
            print "----------------------------------------------------------------"
            print ""
            
    else:
            print "----------------------------------------------------------------"
            print article + " -- failed to scrape"
            print "----------------------------------------------------------------"
            print ""

----------------------------------------------------------------
//en.wikipedia.org/wiki/Journal_of_Machine_Learning_Research -- was too short: 

 The Journal of Machine Learning Research (usually abbreviated JMLR), is a scientific journal focusing on machine learning, a subfield of artificial intelligence. It was founded in 2000. The journal was founded as an open-access alternative to the journal Machine Learning. In 2001, forty editors of Machine Learning resigned in order to support JMLR, saying that in the era of the internet, it was detrimental for researchers to continue publishing their papers in expensive journals with pay-access archives. Instead, they wrote, they supported the model of JMLR, in which authors retained copyright over their papers and archives were freely available on the internet.[1] Print editions of JMLR were published by MIT Press until 2004, and by Microtome Publishing thereafter. Since Summer 2007 JMLR is also publishing Machine Learning Open Source Softw

In [5]:
import collections
from random import shuffle

dataset = document_set.values()

# Check for and remove duplicates 
dupes = [item for item, count in collections.Counter(dataset).items() if count > 1]

# Shuffle order for model training
if len(dupes) > 0:
    print "Duplicates found."
    dataset = list(set(dataset))
    shuffle(dataset)
else:
    shuffle(dataset)

Duplicates found.


In [6]:
### Tokenizer For Clustering ###

import nltk
import re

# Porter stemmer is a pre-trained stemmer that finds the most likely stem for an english word
from gensim.parsing import PorterStemmer
global_stemmer = PorterStemmer()

# Make a helper that will return the original form of the stem in future methods
class StemmingHelper(object):
    """
    Class to aid the stemming process - from word to stemmed form,
    and vice versa.
    The 'original' form of a stemmed word will be returned as the
    form in which its been used the most number of times in the text.
    """
 
    #This reverse lookup will remember the original forms of the stemmed words
    word_lookup = {}
 
    @classmethod
    def stem(cls, word):
        """
        Stems a word and updates the reverse lookup.
        """
 
        #Stem the word
        stemmed = global_stemmer.stem(word)
 
        #Update the word lookup
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
 
        return stemmed
 
    @classmethod
    def original_form(cls, word):
        """
        Returns original form of a word given the stemmed version,
        as stored in the word lookup.
        """
 
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word

# Remove unicode, normalizing it to ascii
import unicodedata

# Custom tokenizer which produces sentence-wise tokens
def sentence_tokenize(doc):
        doc = unicodedata.normalize('NFKD', doc).encode('ascii','ignore')
        doc = doc.lower()
        doc = re.sub('[-—]', ' ', doc)
        doc = re.sub('[\(\[].*?[\)\]]', ' ', doc)
        doc = re.sub(' +',' ', doc)
                
        sentences = nltk.sent_tokenize(doc) 
        
        sentence_token_list = []
        for sentence in sentences:
            # Filter sentences with less than 5 words since they won't provide much context
            if len(sentence) > 4:                
                sentence_token_list.append([
                        StemmingHelper.stem(word) 
                        for word in nltk.word_tokenize(sentence)
                        # Filter words with no characters since they usually are numeric gibberish
                        if len(re.sub('[^a-z]', '', word)) > 0
                    ])       

        return sentence_token_list

In [7]:
# doc = document_set['//en.wikipedia.org/wiki/Natural_language_processing']

# import unicodedata
# import nltk

# doc = unicodedata.normalize('NFKD', doc).encode('ascii','ignore')
# doc = doc.lower()
# doc = re.sub('[-—]', ' ', doc)
# doc = re.sub('[\(\[].*?[\)\]]', ' ', doc)
# doc = re.sub(' +',' ', doc)

# sentences = nltk.sent_tokenize(doc) 

# for sentence in sentences:
#     print sentence
#     print ""

In [8]:
# Build massive list of  sentences that are tokenized
corpus = []
for text in dataset:
    corpus.extend(sentence_tokenize(text))

In [9]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import Word2Vec

In [10]:
dimension = 250
model = Word2Vec(corpus, size=dimension, window=4, min_count=5, sg=1, sample=0.001, workers=4, iter=1000)

In [11]:
vocab = list(model.vocab.keys())
print len(vocab)

6829


In [12]:
test = "vector cluster statistic model deep neural network train supervise unsupervise"

for stem in (StemmingHelper.stem(word) for word in test.split()):
    print "Similar to: " + StemmingHelper.original_form(stem)
    for similar in model.most_similar(stem)[0:5]:
        print StemmingHelper.original_form(similar[0])
    print "------------------------------------------"    
print ""
print ""

Similar to: vector
hyperplane
matrix
nearest
weights
dimensional
------------------------------------------
Similar to: clustering
subspace
centroid
linkage
nearest
hierarchical
------------------------------------------
Similar to: statistical
mathematical
bayesian
proposition
multivariate
kolmogorov
------------------------------------------
Similar to: model
predictive
approach
methods
fit
markov
------------------------------------------
Similar to: deep
feedforward
learning
neural
supervised
backpropagation
------------------------------------------
Similar to: neural
network
recurrent
feedforward
artificial
deep
------------------------------------------
Similar to: network
neural
connected
layer
routers
internetworking
------------------------------------------
Similar to: training
learning
unseen
input
reward
rnn
------------------------------------------
Similar to: supervised
unsupervised
learning
semi
transduction
deep
------------------------------------------
Similar to: u

In [13]:
import tensorflow as tf
import numpy as np

class SOM(object):
    """
    2-D Self-Organizing Map
    - Euclidean metric
    - Gaussian Neighbourhood function
    - Linearly decreasing learning rate
    """
 

    #To check if the SOM has been trained
    _trained = False
 
    def __init__(self, m, n, dim, n_iterations=100, alpha=None, sigma=None):
        """
        Initializes all components of the TensorFlow Graph.
        - m X n are the dimensions of the SOM
        - 'n_iterations' is an integer denoting # iterations for training
        - 'dim' is the dimensionality of the training inputs
        - 'alpha' is a number denoting the initial learning rate
            >> default = 0.3
        - 'sigma' is the the initial neighbourhood value
            >> the radius of influence of the BMU while training
            >> default = max(m, n) / 2.0
        """
 
        self._m = m
        self._n = n
        self._n_iterations = abs(int(n_iterations))

        if alpha is None:
            alpha = 0.3
        else:
            alpha = float(alpha)
            
        if sigma is None:
            sigma = max(m, n) / 2.0
        else:
            sigma = float(sigma)
            
        # Normalizing constant for decay of alpha
        time_constant = self._n_iterations/np.log(sigma)
            
 
        #Initialize Graph
        self._graph = tf.Graph()
 
        #Populate graph
        with self._graph.as_default():
            ##Placeholders to be fed in during training 
            
            #The training vector
            self._vect_input = tf.placeholder("float", [dim])
            #Iteration number
            self._iter_input = tf.placeholder("float")
 
            #Randomly initialized weights for neurons
            #Stored as a matrix variable of shape [m*n, dim]
            self._weightage_vects = tf.Variable(tf.random_normal(
                [m*n, dim]))
 
            #Grid locations of neurons
            self._location_vects = tf.constant(np.array(
                list(self._neuron_locations(m, n))))
 

            #Compute the Best Matching Unit given a vector
            #argmin of Euclidean distance b/w weights and given input
            bmu_index = tf.argmin(tf.sqrt(tf.reduce_sum(
                tf.pow(tf.sub(self._weightage_vects, tf.pack(
                    [self._vect_input for i in range(m*n)])), 2), 1)),
                                  0)
 
            #Extract locations
            slice_input = tf.pad(tf.reshape(bmu_index, [1]),
                                 np.array([[0, 1]]))
            bmu_loc = tf.reshape(tf.slice(self._location_vects, slice_input,
                                          tf.constant(np.array([1, 2]))),
                                 [2])
 
            #Adjust alpha and sigma values based on iteration
            learning_rate_op = tf.exp(tf.neg(tf.div(self._iter_input,
                                                  time_constant))) 
        
            _alpha_op = tf.mul(alpha, learning_rate_op)
            _sigma_op = tf.mul(sigma, learning_rate_op)
 
            #Generate learning rate for all neurons
            #Based on iteration number and location wrt BMU
            bmu_distance_squares = tf.reduce_sum(tf.pow(tf.sub(
                self._location_vects, tf.pack(
                    [bmu_loc for i in range(m*n)])), 2), 1)
            
            neighbourhood_func = tf.exp(tf.neg(tf.div(tf.cast(
                bmu_distance_squares, "float32"), tf.pow(_sigma_op, 2))))
            
            learning_rate_op = tf.mul(_alpha_op, neighbourhood_func)
 
            #Update weight vectors of all neurons
            learning_rate_multiplier = tf.pack([tf.tile(tf.slice(
                learning_rate_op, np.array([i]), np.array([1])), [dim])
                                               for i in range(m*n)])
            weightage_delta = tf.mul(
                learning_rate_multiplier,
                tf.sub(tf.pack([self._vect_input for i in range(m*n)]),
                       self._weightage_vects))                                         
            
            new_weightages_op = tf.add(self._weightage_vects,
                                       weightage_delta)
            
            self._training_op = tf.assign(self._weightage_vects,
                                          new_weightages_op)                                       
 
            #Initialize session and variables
            self._sess = tf.Session() 
            init_op = tf.initialize_all_variables()
            self._sess.run(init_op)
 
    def _neuron_locations(self, m, n):
        """
        Yields one by one the 2-D locations of the individual neurons
        in the SOM.
        """
        for i in range(m):
            for j in range(n):
                yield np.array([i, j])
 
    def train(self, input_vects):
        """
        Trains the SOM.
        'input_vects' should be an iterable of 1-D NumPy arrays with
        dimensionality as provided during initialization of this SOM
        """
 
        #Training iterations
        for iter_no in range(self._n_iterations):
            #Train with each vector one by one
            for input_vect in input_vects:
                self._sess.run(self._training_op,
                               feed_dict={self._vect_input: input_vect,
                                          self._iter_input: iter_no})
 
        #Store a centroid grid for easy retrieval later on
        centroid_grid = [[] for i in range(self._m)]
        self._weightages = list(self._sess.run(self._weightage_vects))
        self._locations = list(self._sess.run(self._location_vects))
        for i, loc in enumerate(self._locations):
            centroid_grid[loc[0]].append(self._weightages[i])
        self._centroid_grid = centroid_grid
 
        self._trained = True
 
    def get_centroids(self):
        """
        Returns a list of 'm' lists, with each inner list containing
        the 'n' corresponding centroid locations as 1-D NumPy arrays.
        """
        if not self._trained:
            raise ValueError("SOM not trained yet")
        return self._centroid_grid
 
    def map_vects(self, input_vects):
        """
        Maps each input vector to neuron in the SOM grid.
               
        Returns a list of 1-D NumPy arrays containing (x, y) for each input.
        """
 
        if not self._trained:
            raise ValueError("SOM not trained yet")
 
        to_return = []
        for vect in input_vects:
            min_index = min([i for i in range(len(self._weightages))],
                            key=lambda x: np.linalg.norm(vect-
                                                         self._weightages[x]))
            to_return.append(self._locations[min_index])
 
        return to_return

In [14]:
#For plotting the images
from matplotlib import pyplot as plt

In [15]:
from nltk.corpus import stopwords
myStop = stopwords.words('english')

filtered_vocab = list(set(vocab)-set([global_stemmer.stem(x) for x in myStop]))

In [16]:
words = []
for sentence in corpus:
    words.extend([x for x in sentence if x in filtered_vocab])

from collections import Counter
word_counts = Counter(words)

In [17]:
short_list = word_counts.most_common(200)
for wordCount in short_list:
    print StemmingHelper.original_form(wordCount[0])

used
data
computer
system
model
's
also
information
learning
algorithm
may
program
time
general
include
function
example
network
process
set
development
problem
software
based
language
methods
different
many
research
machine
number
operating
two
states
first
value
applications
provide
new
user
design
called
variables
distribution
analysis
work
results
required
make
probability
term
allows
form
would
access
type
often
technology
statistical
case
means
training
performance
however
given
specific
related
like
features
point
following
communication
approach
code
defined
database
product
united
control
possible
measure
created
well
large
theory
object
intelligence
market
way
structure
field
company
business
input
source
order
known
predictive
human
science
support
standard
need
test
services
represent
techniques
since
group
level
engineering
published
internet
implementation
described
class
change
complex
security
open
management
space
study
organization
digital
search
web
rules
linear
refe

In [18]:
# short_list = [StemmingHelper.stem(x) for x in 'algorithm activation approximation architecture artificial assumptions bayesian class classification classifier clustering conditional connected converge convolutional data decision deep density dependent dimensional distance distribution error estimate expected features fit function gene gradient hidden independent information iterative latent layer learning likelihood linear local logistic machine map markov matrix maximum means measure memory minimize model network neural neurons nodes normal optimal output parameters pattern perceptron prediction probability random recognition regression represent representation rule space squared standard state statistical structure supervised support test training tree variables variance vector weights'
#               .split()]

short_list = list(set(short_list))
print len(short_list)

85


In [19]:
vector_list = []

for x in short_list:
    vector_list.append(model[x])

In [20]:
vectors = np.vstack(vector_list)

In [33]:
#Training inputs

grid_size = 8

som = SOM(m=grid_size, n=int(1.2*grid_size), dim=dimension, n_iterations=500)
som.train(vector_list)

In [45]:
#Map to their closest neurons
mapped = som.map_vects(vector_list)

mapped_flat = {}
for i, m in enumerate(mapped):
    if not (m[0] in mapped_flat):
        mapped_flat[m[0]] = {}
    if not (m[1] in mapped_flat[m[0]]):
        mapped_flat[m[0]][m[1]] = StemmingHelper.original_form(short_list[i])
    else:
        mapped_flat[m[0]][m[1]] = mapped_flat[m[0]][m[1]] + "; " + StemmingHelper.original_form(short_list[i])
        print "overlap found"

overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found
overlap found


In [54]:
#Get output grid
grid = som.get_centroids()

centroids = []
for m_list in grid:
    for centroid in m_list:
        centroids.append(centroid)

indexes = []
for x in mapped_flat:
    for y in mapped_flat[x]:
        indexes.append(y*x)
indexes = list(set(indexes))

mapped_exist = {}
for index in range(0, len(centroids)):
    if index in indexes:
        if not index in mapped_exist:
            mapped_exist[index] = centroids[index]

to_reduce = []
for x in mapped_exist:
    to_reduce.append(mapped_exist[x])

In [55]:
""" Deep Auto-Encoder implementation
	
	An auto-encoder works as follows:

	Data of dimension k is reduced to a lower dimension j using a matrix multiplication:
	softmax(W*x + b)  = x'
	
	where W is matrix from R^k --> R^j

	A reconstruction matrix W' maps back from R^j --> R^k

	Our reconstruction function is softmax'(W' * x' + b') 

	Now the point of the auto-encoder is to create a reduction matrix (values for W, b) 
	that is "good" at reconstructing  the original data. 

	Thus we want to minimize  ||softmax'(W' * (softmax(W *x+ b)) + b')  - x||

	A deep auto-encoder is nothing more than stacking successive layers of these reductions.
"""
import math
import random

def create(x, layer_sizes):

	# Build the encoding layers
	next_layer_input = x

	encoding_matrices = []
	for dim in layer_sizes:
		input_dim = int(next_layer_input.get_shape()[1])

		# Initialize W using random values in interval [-1/sqrt(n) , 1/sqrt(n)]
		W = tf.Variable(tf.random_uniform([input_dim, dim], -1.0 / math.sqrt(input_dim), 1.0 / math.sqrt(input_dim)))

		# Initialize b to zero
		b = tf.Variable(tf.zeros([dim]))

		# We are going to use tied-weights so store the W matrix for later reference.
		encoding_matrices.append(W)

		output = tf.nn.tanh(tf.matmul(next_layer_input,W) + b)

		# the input into the next layer is the output of this layer
		next_layer_input = output

	# The fully encoded x value is now stored in the next_layer_input
	encoded_x = next_layer_input

	# build the reconstruction layers by reversing the reductions
	layer_sizes.reverse()
	encoding_matrices.reverse()


	for i, dim in enumerate(layer_sizes[1:] + [ int(x.get_shape()[1])]) :
		# we are using tied weights, so just lookup the encoding matrix for this step and transpose it
		W = tf.transpose(encoding_matrices[i])
		b = tf.Variable(tf.zeros([dim]))
		output = tf.nn.tanh(tf.matmul(next_layer_input,W) + b)
		next_layer_input = output

	# the fully encoded and reconstructed value of x is here:
	reconstructed_x = next_layer_input

	return {
		'encoded': encoded_x,
		'decoded': reconstructed_x,
		'cost' : tf.sqrt(tf.reduce_mean(tf.square(x-reconstructed_x)))
	}

In [70]:
sess = tf.Session()

start_dim = dimension
x = tf.placeholder("float", [None, start_dim])

autoencoder = create(x, [100, 60, 30, 15, 8, 3, 2])

init = tf.initialize_all_variables()
sess.run(init)

train_step = tf.train.GradientDescentOptimizer(0.5).minimize(autoencoder['cost'])

for i in range(25000):
	sess.run(train_step, feed_dict={x: np.array(to_reduce)})
    
encoded = sess.run(autoencoder['encoded'], feed_dict={x: np.array(to_reduce)})

In [71]:
reduced = {}

loop = 0
for point in mapped_exist:
    reduced[point] = encoded[loop]
    loop = loop + 1
    
# print reduced

min_x = 0
max_x = 0
min_y = 0
max_y = 0
# min_z = 0
# max_z = 0

for point in reduced:
    coordinate = reduced[point]
    
    if coordinate[0] < min_x:
        min_x = coordinate[0]
    elif coordinate[0] > max_x:
        max_x = coordinate[0]
        
    if coordinate[1] < min_y:
        min_y = coordinate[1]
    elif coordinate[1] > max_y:
        max_y = coordinate[1]
        
#     if coordinate[2] < min_z:
#         min_z = coordinate[2]
#     elif coordinate[2] > max_z:
#         max_z = coordinate[2]

In [91]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as pyplot
import pylab

fig = pylab.figure(figsize=(16, 20))
ax = Axes3D(fig)

spread = 0
for x in mapped_flat:
    for y in mapped_flat[x]:
        coordinates = reduced[x*y]
        ax.text(coordinates[0], coordinates[1], spread,
                mapped_flat[x][y], size=6)
        spread = spread + 0.01

pyplot.xlim([min_x, 1.1*max_x])
pyplot.ylim([min_y, max_y])
ax.set_zlim(0, spread)

ax.set_xticks([])                               
ax.set_yticks([])                               
ax.set_zticks([])
pyplot.savefig('SOM2.pdf')

In [88]:
plt.show()