In [2]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from collections import OrderedDict
import numpy as np
import argparse
import csv
import urllib
import urllib2
import unicodedata
import string
import nltk
import sys

In [3]:
"""All lexical representations are to be used is the stemmed representation by the Porter stemmer"""
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [4]:
"""Tokenizes and performs stemming on the tokens."""
def tokenize(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [5]:
"""Convert unicode to ascii, removes accents."""
def to_ascii(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')            

In [6]:
"""Prepares for centroid calculation: 
- Calculate L2 norm for each row
- Normalize the row by L2 norm"""
def tfidf_normalize(row):
    # Calculate L2 norm value.
    l2norm = np.linalg.norm(row, 2)
    
    # Normalize row by L2 norm.
    return row/l2norm

In [7]:
"""Takes in the sparse matrix"""
def get_query_expansion_vector(vec):
    # Calculate the vector.
    normalized = np.apply_along_axis(tfidf_normalize, axis=1, arr=vec)
    centroid = np.sum(normalized, axis=0)
    qe = centroid/np.linalg.norm(centroid, 2)
    return qe

In [19]:
"""Retrieves html result for a query pushed into the Google search engine."""
def get_query_html(query, limit):
    address = "http://www.google.com/search?q=%s&num=100&hl=en&start=0" % (urllib.quote_plus(query))
    request = urllib2.Request(address, None, {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    
    # Determine the amount of time needed to sleep
    # before we yield control.
    sleep_time = 3600/limit
    sleep(randint(sleep_time, sleep_time+5))
    return page

In [9]:
"""Retrives the tf-idf matrix for a query and candidate."""
def get_tfidf_matrices(es_q, es_c):
    lq = len(es_q)
    lc = len(es_c)
    
    # Combine the document sets for tf-idf calculation.
    combined = es_q
    combined.extend(es_c)
    
    # We want to get a new vectorizer for every string.
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    tfs = tfidf.fit_transform(combined)
    
    vectors = tfs.toarray()
    return vectors[0:lq-1, :], vectors[lq:lq+lc-1, :]

In [10]:
"""Calculates the kernel function value from two expansion sets rather
than raw short string candidates, makes caching easier."""
def kval_es(es_q, es_c):
    vq, vc = get_tfidf_matrices(es_q, es_c)
    qe_q = get_query_expansion_vector(vq)
    qe_c = get_query_expansion_vector(vc)
    return np.inner(qe_q, qe_c)

In [18]:
"""Finds the value of the kernel function between 
query string q and candidate c, betther not use this,
cause this is costly. Use the extended set version when
handling bulk. Use this for checking or debugging."""
def kval(q, c):
    q_page = get_query_html(q, 1200)
    c_page = get_query_html(c, 1200)
    es_q = google_expanded_docs(q_page)
    es_c = google_expanded_docs(c_page)
    return kval_es(es_q, es_c)

In [12]:
"""Retrieves the related searches for a Google query string,
given the html page of the google search page."""
def google_related_searches(page):    
    rs = list()        
    if page is not None:
        soup = BeautifulSoup(page, 'lxml')        
        # Strip the related search portion using beautifulsoup.    
        rsdiv = soup.find("div", { "id" : "brs" })
        for d in rsdiv.findAll('div', {'class':'brs_col'}):
            for p in d.findAll('p', {'class':'_e4b'}):
                rs.append(to_ascii(p.getText()).translate(None, string.punctuation))
        
    return rs

"""Retrieves the first 100 expanded sets for a query string,
given the html page of the google search page.Expanded set 
consists of the result header and the summary text
strip for each of the search results in the returned page."""
def google_expanded_docs(page):
    es = list()
    if page is not None:
        soup = BeautifulSoup(page, 'lxml')
        # Strip the extended set using beautifulsoup.
        esdivs = soup.findAll("div", { "class" : "g" })    
        for esdiv in esdivs:
            for d in esdiv.findAll('div', {'class':'rc'}):
                doc = ""
                for hr in d.findAll('h3', {'class':'r'}):
                    doc += hr.getText()
                    doc += ' '
                for ds in d.findAll('div', {'class':'s'}):
                    for s in ds.findAll('span', {'class' : 'st'}):
                        doc += s.getText()                    
                es.append(to_ascii(doc).translate(None, string.punctuation))
            
    return es

In [14]:
def test_kval(q, c):
    k = kval(q, c)
    print k

In [97]:
%%time
q = 'delete cookies'
c = 'cookies recipe'    
test_kval(q, c)

0.235908682104
CPU times: user 769 ms, sys: 43.5 ms, total: 812 ms
Wall time: 3.84 s


In [98]:
%%time
q = 'delete cookies'
c = 'chocolate cookies'    
test_kval(q, c)

0.214376190264
CPU times: user 881 ms, sys: 88.5 ms, total: 969 ms
Wall time: 3.92 s


In [99]:
%%time
q = 'delete cookies'
c = 'how to delete cookies on chrome'    
test_kval(q, c)

0.787332940935
CPU times: user 731 ms, sys: 27.9 ms, total: 758 ms
Wall time: 3.45 s


In [101]:
def get_rel_queries(l):
    q = list()
    for s in l:
        r = google_related_searches(s)
        q.extend(r)
    return q

In [102]:
q = 'anonymous web proxy'
first = google_related_searches(q)
second = get_rel_queries(first)
third = get_rel_queries(second)

In [104]:
print 'Seed: ' + q
print '1st iteration'
print '============='
for c in first:
    print c.ljust(35) + str(kval(q, c))
    sleep(randint(3, 5))

Seed: anonymous web proxy
1st iteration
unblocked proxy sites              0.625885275915
proxy meaning                      0.341629813043
best proxy server                  0.543056710652
proxy sites list                   0.571541581749
proxy sites for youtube            0.580685721987
free proxy list                    0.49021895171
skull proxy                        0.445939834306
proxy sites for school             0.546793161541


In [125]:
print 'Seed: ' + q
print '2nd iteration'
print '============='
for c in second:
    print c.ljust(35) + str(kval(q, c))
    sleep(randint(3, 5))

Seed: anonymous web proxy
2nd iteration
unblocked proxy sites for school   0.520873715037
unblocked proxy sites 2016         0.552267777783
new proxy sites                    0.613555317019
unblock blocked websites           0.30676562345
unblocked proxies for school 2016  0.46395945235
unblocked proxy list               0.639060012854
fresh unblocked proxy sites        0.605553770321
unblock websites                   0.41211288729
proxy war meaning                  0.162943518505
proxy meaning in hindi             0.25767848335
proxy synonym                      0.241219822191
proxy meaning in telugu            0.218214585157
proxy meaning in tamil             0.234457407832
proxy in a sentence                0.20749551871
proxy definition creepypasta       0.196357555054
proxy definition science           0.245467175434
skull proxy                        0.449638036921
best proxy server software         0.422672614541
skullproxy                         0.438836239905
top proxy sites

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.


In [127]:
print 'Seed: ' + q
print '3rd iteration'
print '============='
for c in third:
    print c.ljust(35) + str(kval(q, c))
    sleep(randint(3, 5))

Seed: anonymous web proxy
3rd iteration
Error retrieving google search results: HTTP Error 503: Service Unavailable
Error retrieving google search results: HTTP Error 503: Service Unavailable
Google throttling, wait a couple of minutes and try again.


SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.
