In [1]:
# n-gram models
import re 
from bs4 import BeautifulSoup
import requests
import data_analysis_tools as da

def fix_unicode(text):
    return text.replace(u"\u2019", "'")

url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
html = requests.get(url).text

sp = BeautifulSoup(html, 'html5lib')

content = sp.find('div', 'entry-content')
regex = r"[\w']+|[\.]"

document = []

for p in content('p'):
    words = re.findall(regex, fix_unicode(p.text))
    document.extend(words)

# continue on pg 338

In [2]:
# word pairs
from collections import defaultdict
bigrams = zip(document, document[1:])
transitions = defaultdict(list)

for prev, current in bigrams:
    transitions[prev].append(current)

print(transitions['government'])
"""this means 'data' and 'you' are the 2 words which follows 'government' in this article."""

['data', 'your']


"this means 'data' and 'you' are the 2 words which follows 'government' in this article."

In [3]:
# generating sentences
def generate_using_bigrams(transitions):
    current = "." # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]
        current = da.random.sample(next_word_candidates)
        result.append(current)
        if current == ".":
            return " ".join(result)

print('5 Random Bullshit Sentences ->')
print('\n   -', generate_using_bigrams(transitions))
print('\n   -', generate_using_bigrams(transitions))
print('\n   -', generate_using_bigrams(transitions))
print('\n   -', generate_using_bigrams(transitions))
print('\n   -', generate_using_bigrams(transitions))

5 Random Bullshit Sentences ->

   - 01 each .

   - Whether you have is try plotting it goes .

   - Whether that person's identity using Google is one of a sense of the open source and enormous datasets which are several it's revolutionary CDDB realized that people and made yourself you simply ignore the classic for the data platforms or studying the job only costs 100 .

   - We've all heard a large collection tools for an allure but it easier to parse the companies using Yahoo to understanding the Cassandra jobs and Python design a lot about data science .

   - We're discussing here .


In [4]:
trigrams = zip(document, document[1:], document[2:])
trigram_transitions = defaultdict(list)
starts = []

for prev, current, next in trigrams:
    if prev == ".":
        starts.append(current)
    trigram_transitions[(prev, current)].append(next)

trigram_transitions[('successful', 'businesses')]

['will']

In [5]:
def generate_using_trigrams(transitions, starts):

    current = da.random.sample(starts)
    prev = "."
    result = [current]

    while True:
        next_word_candidates = transitions[(prev, current)]
        next_word = da.random.sample(next_word_candidates)
        
        prev, current = current, next_word
        result.append(current)

        if current == ".":
            return " ".join(result)

print('5 Random Sentences ->')
print('\n   -', generate_using_trigrams(trigram_transitions, starts))
print('\n   -', generate_using_trigrams(trigram_transitions, starts))
print('\n   -', generate_using_trigrams(trigram_transitions, starts))
print('\n   -', generate_using_trigrams(trigram_transitions, starts))
print('\n   -', generate_using_trigrams(trigram_transitions, starts))

"""They seem much more realistic compare to biagrams tho."""

5 Random Sentences ->

   - The result was a valuable data product that analyzed a huge mountain of data science .

   - ly is generating and find out just how bad your data sort the distractions from the data were insufficient .

   - If you've ever used iTunes to rip a CD you've taken advantage of this data would be useless if we couldn't store it .

   - Scripting languages such as Perl and Python are essential .

   - Data is indeed the new Intel Inside .


'They seem much more realistic compare to biagrams tho.'

In [6]:
# grammars
grammar = {
    "_S": ["_NP _VP"],
    "_NP": ["_N", "_A _NP _P _A _N"],
    "_VP": ["_V", "_V _NP"],
    "_N": ["data science", "Python", "regression"],
    "_A": ["big", "linear", "logistic"],
    "_P": ["about", "near"],
    "_V": ["learns", "trains", "tests", "is"]
}

# a sentence is created like this ->
* ['_S']
* ['_NP','_VP']
* ['_N','_VP']
* ['Python','_VP']
* ['Python','_V','_NP']
* ['Python','trains','_NP']
* ['Python','trains','_A','_NP','_P','_A','_N']
* ['Python','trains','logistic','_NP','_P','_A','_N']
* ['Python','trains','logistic','_N','_P','_A','_N']
* ['Python','trains','logistic','data science','_P','_A','_N']
* ['Python','trains','logistic','data science','about','_A', '_N']
* ['Python','trains','logistic','data science','about','logistic','_N']
* ['Python','trains','logistic','data science','about','logistic','Python']

In [7]:
def is_terminal(token):
    return token[0] != "_"

def expand(grammar, tokens):
    
    for i, token in enumerate(tokens):

        # skip over terminals
        if is_terminal(token):
            continue

        # if we get here, we found a non-terminal token
        # so we need to choose a replacement randomly

        replacement = da.random.sample(grammar[token])

        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        
        # call expand on the new list of tokens
        return expand(grammar, tokens)
        # if we get here we had all terminals and nothing more to do.
    return tokens

def generate_sentence(grammar):
    return expand(grammar, ["_S"])

In [8]:
" ".join(generate_sentence(grammar))

'data science tests data science'

In [9]:
# gibbs sampling

def roll_dice():
    return da.random.rand_index(6) + 1

def direct_sample():
    d1 = roll_dice()
    d2 = roll_dice()
    return d1, d1 + d2

def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ... , x + 6"""
    return x + roll_dice()

# other direction is more complicated
# For example, if you know that y is 2, then necessarily x is 1 
# since the only way two dice can sum to 2 is if both of them are 1

def random_x_given_y(y):
    if y <= 7:
        # if total is 7 or less, the first dice equally likely to be
        # 1, 2, 3, .. total - 1
        return da.random.rand_index(y - 1) + 1
    else:
        return da.random.sample(range(y - 6, 7))
        # y = 8 -> x >= 2
        # y = 9 -> x >= 3
        # ...

In [10]:
random_x_given_y(7)

5

In [11]:
def gibbs_sample(num_iters=100):
    x, y = 1, 2 # doesnt matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

# ???

gibbs_sample(100)

(5, 6)

In [12]:
def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts
compare_distributions(10)

defaultdict(<function __main__.compare_distributions.<locals>.<lambda>()>,
            {(3, 5): [1, 0],
             (6, 12): [0, 1],
             (3, 9): [1, 0],
             (5, 10): [0, 1],
             (4, 10): [1, 1],
             (4, 6): [0, 3],
             (5, 9): [1, 0],
             (1, 6): [2, 1],
             (6, 7): [0, 1],
             (6, 10): [1, 0],
             (1, 2): [1, 0],
             (4, 5): [1, 1],
             (1, 3): [0, 1],
             (3, 8): [1, 0]})

In [13]:
from collections import Counter
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * da.random.random()
    for i, w in enumerate(weights):
        rnd -= w
        if rnd <= 0: return i

sample_array = [1, 1, 3]
sample_size = 10000

rands = Counter([sample_from(sample_array) for _ in range(sample_size)])
probs = defaultdict(list)
probs = [(y, x / sample_size) for y, x in rands.items()]
print(probs)

[(2, 0.5932), (0, 0.2068), (1, 0.2)]


In [26]:
import data_analysis_tools as da
from collections import Counter, defaultdict
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [27]:
K = 4

# a list of Counters, one for each document
document_topic_counts = [Counter() for _ in documents]

# a list of Counters, one for each topic
topic_word_counts = [Counter() for _ in range(K)]

# a list of numbers, one for each topic
topic_counts = [0 for _ in range(K)]

# a list of numbers, one for each document
document_lengths = list(map(len, documents))

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

In [28]:
W

36

In [29]:
"""
    For example, once we populate these, we can find, for example, the number of words in
    documents[3] associated with topic 1 as
"""
document_topic_counts[3][1]

"""
    And we can find the number of times nlp is associated with topic 2 as:
"""
topic_word_counts[2]["nlp"]

0

In [30]:
def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document d that are assigned to topic (plus some smoothing)"""
    return (
        (document_topic_counts[d][topic] + alpha) / (document_lengths[d] + K * alpha)
    )

def p_word_given_topic(word, topic, beta=0.1):
    # belirli bir topigin icerisindeki belirli word oranı
    return (
        (topic_word_counts[topic][word] + beta) / (topic_counts[topic] + W * beta)
    )

def topic_weight(d, word, k):
    """given a document and a word in that document, return the weight for the kth topic"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k) for k in range(K)])


In [31]:
da.r.seed(0)
document_topics = [[da.random.sample(range(K)) for word in document] for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])):

            # remove this word/topic from the counts
            # so that it doesnt influence the weights
            if topic is None:
                continue

            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            if new_topic is None:
                continue

            # add +1 again
            document_topic_counts[d][topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [33]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0: print(k, word, count)

0 Java 2
0 Python 2
0 probability 2
0 Hadoop 1
0 Haskell 1
0 Mahout 1
0 Spark 1
0 scikit-learn 1
0 scipy 1
0 programming languages 1
0 libsvm 1
1 C++ 2
1 Big Data 2
1 deep learning 2
1 artificial intelligence 2
1 mathematics 1
1 theory 1
2 machine learning 2
2 regression 2
2 statsmodels 2
2 Postgres 1
2 libsvm 1
2 decision trees 1
2 Python 1
3 R 3
3 neural networks 2
3 statistics 2
3 pandas 2
3 Storm 1
3 Cassandra 1
3 scikit-learn 1
3 databases 1
3 HBase 1
3 MySQL 1
3 MongoDB 1
3 support vector machines 1
3 numpy 1
