In [None]:
# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): mode 2 is documented as "reload all modules before executing
# code", so edits to local modules such as nyata2017 should be picked up
# without restarting the kernel -- confirm against the IPython version in use.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm as progressbar
from collections import defaultdict

from dask.distributed import Client
from dask.distributed import progress
# Create the Dask distributed client that the later cells use via
# client.compute(), client.gather() and client.close().
# NOTE(review): no scheduler address is passed, so this presumably starts a
# local cluster on this machine -- confirm for the presentation environment.
client = Client()

import nyata2017

# Big Data testközelből


<i>Fülöp András - Data Scientist @ Lensa | Gitential</i>


<b>NYATA2017</b>, 2017. július 22.

# Python

> *A programozás a legközelebbi dolog a szupererőhöz.*  
> Drew Houston (Dropbox)

In [None]:
# Greet the audience. Written as a call with a single argument so the cell
# prints the same text on Python 2 and Python 3; the bare print statement
# used previously is a SyntaxError on Python 3.
print("Hello NYATA!")

# Hogyan dolgozunk fel adatokat?

<img src="pics/meklogo.gif" source="http://mek.oszk.hu"/>

In [None]:
# Ask the project helper for the book files under ./docs/. The elements are
# later passed to open(), so they are file paths.
bookpaths = nyata2017.list_books('./docs/')
# Last expression of the cell: the notebook displays the number of entries.
len(bookpaths)

In [None]:
# Peek at the first book: print the first 140 characters of its first line.
# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the print statement used previously only parses on Python 2.
with open(bookpaths[0]) as bookfile:
    print(bookfile.readline()[:140])

In [None]:
def _first_line(path):
    """Return the first line of the file at *path*, closing the file afterwards."""
    with open(path) as handle:
        return handle.readline()


# First line of each of the first 1000 book files, in path order; the tqdm
# wrapper only adds a progress bar around the iteration.
books = [_first_line(path) for path in progressbar(bookpaths[:1000])]

In [None]:
# Pass the loaded `books` list to the project's mean helper; as the last
# expression of the cell, its return value is displayed by the notebook.
# NOTE(review): presumably the average text length -- confirm in nyata2017.mean.
nyata2017.mean(books)

In [None]:
# Hand the loaded `books` list to the project's histogram helper.
# NOTE(review): presumably plots the distribution of text lengths inline
# (matplotlib inline mode is enabled above) -- confirm in nyata2017.plot_histogram.
nyata2017.plot_histogram(books)

## Melyek a MEK leggyakoribb szavai?

In [None]:
# Case-insensitive word frequencies over all loaded books.
# Missing keys start at 0, so every token can be incremented directly.
wordcount = defaultdict(int)

for text in progressbar(books):
    # Lowercase once per book, then split on whitespace into tokens.
    for token in text.lower().split():
        wordcount[token] += 1

# Report the most frequent words through the project helper.
nyata2017.print_most_common(wordcount)

# Mit tehetünk, ha nincs elég lóerő?

Átlagszámítás egy `20.000 x 20.000` mátrixon (400 millió adatpont, ~4 GB adat).

In [None]:
%%time
# Baseline timing: %%time reports how long the whole cell takes.
# The helper is called without the `distribute` flag used in the later cell.
# NOTE(review): presumably builds the full 20000 x 20000 matrix in local
# memory -- confirm in nyata2017.generate_random_matrix.
x = nyata2017.generate_random_matrix(size=(20000, 20000))
y = nyata2017.mean(x)

## Párhuzamosítsunk!

<img src="pics/embarrassing.gif" source="https://github.com/dask/dask-tutorial" width="800px"/>

In [None]:
%%time
# Same size and same mean helper as the baseline cell, but with
# distribute=True passed to the matrix generator.
# NOTE(review): the result is presumably a lazy Dask object, which is why
# .compute() is called explicitly to obtain the value -- confirm.
x = nyata2017.generate_random_matrix(size=(20000, 20000), distribute=True)
y = nyata2017.mean(x)
y.compute()

# Skálázás

<img src="pics/verticalvshorizontal.png" source="http://www.pc-freak.net/blog/vertical-horizontal-server-services-scaling-vertical-horizontal-hardware-scaling/" />

# Hogyan dolgozzunk fel Big Data-t?

## A MapReduce paradigma

<img src="pics/mapreduce.png" source="https://wikis.nyu.edu/display/NYUHPC/Big+Data+Tutorial+1%3A+MapReduce" width="750px"/>

In [None]:
# Word count expressed as a map/reduce style pipeline, one step per line.
# NOTE(review): the .str / .flatten / .frequencies / .topk calls suggest that
# load_books returns a dask.bag -- confirm in nyata2017.load_books.
books = nyata2017.load_books('./docs/00*.txt')

# Split: lowercase every text, break it into words, merge the word lists.
lowered = books.str.lower()
tokenized = lowered.str.split()
splitted = tokenized.flatten()

# Map: pair each distinct word with its number of occurrences.
mapped = splitted.frequencies()

# Reduce: keep the 10 pairs with the largest count (second tuple element).
reduced = mapped.topk(10, lambda pair: pair[1])

In [None]:
# Submit the `reduced` pipeline to the client; the handle it returns is kept
# in `wordcount` and fetched with client.gather() in the next cell.
wordcount = client.compute(reduced)
# Show a progress indicator for the submitted work.
progress(wordcount)

In [None]:
# Fetch the finished result of the submitted computation from the client.
results = client.gather(wordcount)
# Print the most frequent words with the same helper as the local word count.
nyata2017.print_most_common(results)
# Close the client: no further client.* calls work after this cell unless a
# new Client is created.
client.close()

## Alternatívák

<img src="pics/solutions.jpg" source1="http://www.tomsitpro.com/articles/mesos-mesosphere-data-center-open-source-apache,1-2001.html" source2="https://www.slideshare.net/PowerPoint-Templates/computers-and-servers-powerpoint-presentation-slides-ppt-templates" width="600px"/>


# Milyen lehetőségek nyílnak meg a Big Data-n keresztül?

# Kitérő: Neurális hálózatok

## Perceptron modell
<img src="pics/neuron.png" width=400 align="left"/>

## Többrétegű hálózat
<img src="pics/mlp.png" width=400 align="left" source="https://github.com/nikolaypavlov/MLPNeuralNet">

# Mi az a mélytanulás?

<img src="pics/mgc.gif" align="middle"/>

<img src="pics/deepdream.jpg" source="https://artofericwayne.com/2015/07/08/google-deep-dream-getting-too-good/"/>

<img src="pics/deeprebrandt.jpg" source="http://www.wired.co.uk/article/new-rembrandt-painting-computer-3d-printed"/>

<img src="pics/deeptransform.jpg" source="https://deepart.io/"/>

> *PANDARUS:*  
> *Alas, I think he shall be come approached and the day*  
> *When little srain would be attain'd into being never fed,*  
> *And who is but a chain and subjects of his death,*  
> *I should not sleep.*  
> ...  
>   
> *Clown:*  
> *Come, sir, I will make did behold your worship.*  
>   
> *VIOLA:*  
> *I'll drink it.*  

\- William RNNspear

<img src="pics/deeppaper.jpg" source="http://karpathy.github.io/2015/05/21/rnn-effectiveness/">

<img src="pics/w2v-context-words.png" source="https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/">

<img src="pics/w2v-king-queen-vectors.png" source="https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/" align="left" width="400px">

<img src="pics/w2v-king-queen-composition.png" source="https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/" align="right" width="400px">

<img src="pics/gan_cats.gif" source="https://github.com/AlexiaJM/Deep-learning-with-cats">

<img src="pics/inspiroquotes.jpg" source="http://inspirobot.me/">

# Mik azok az ajánló rendszerek?

<img src="pics/logos.png">

# Collaborative filtering

<img src="pics/cfinteract.png" />

<img src="pics/cfmatrix.png" />

# Tartalom alapú módszerek 

<img src="pics/cbinteract.png" />

# A nagy testvér figyel?

# Köszönöm megtisztelő figyelmüket!

<img src="pics/twitter.png" align='left' style='margin-right: 5px; margin-left: 15px' /><b>twitter.com/fulibacsi</b><br>

<img src="pics/github.png" align='left' style='margin-right: 5px; margin-left: 15px' /><b>github.com/fulibacsi</b>

<img src="pics/qr.jpg" align='right' width="200px">

**Források:**

- Dokumentumok:
    - mek.oszk.hu


- Adatfeldolgozás képek:
    - párhuzamos feldolgozás: https://github.com/dask/dask-tutorial
    - vertikális vs horizontális skálázás: http://www.pc-freak.net/blog/vertical-horizontal-server-services-scaling-vertical-horizontal-hardware-scaling/
    - mapreduce paradigma: https://wikis.nyu.edu/display/NYUHPC/Big+Data+Tutorial+1%3A+MapReduce
    - commodity hardverek: http://www.tomsitpro.com/articles/mesos-mesosphere-data-center-open-source-apache,1-2001.html
    - szuperszámítógép: https://www.slideshare.net/PowerPoint-Templates/computers-and-servers-powerpoint-presentation-slides-ppt-templates


- Deep learning képek:
    - deep dream: https://artofericwayne.com/2015/07/08/google-deep-dream-getting-too-good/
    - Rembrandt 3d nyomtatás: http://www.wired.co.uk/article/new-rembrandt-painting-computer-3d-printed
    - Képek átalakítása festői stílusokban: https://deepart.io/
    - Generatív nyelvek: http://karpathy.github.io/2015/05/21/rnn-effectiveness/
    - word2vec: https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/
    - Deep Learning cats: https://github.com/AlexiaJM/Deep-learning-with-cats
    - Inspirobot: http://inspirobot.me/

**Részletekről bővebben:**

- Python nyelv kezdőknek: http://mek.oszk.hu/08400/08435/08435.pdf
- Dask tutorial: https://github.com/dask/dask-tutorial
- Word2Vec kipróbálható online alkalmazással: https://rare-technologies.com/word2vec-tutorial/
- Word2Vec tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/