# Rule Based Machine Learning Algorithm

In [1]:
filename = 'user-ct-test-collection-01.txt'

In [2]:
!rm -rf user-ct-test-collection-01.txt

## Get Dataset

In [3]:
import os
import gzip
import ssl
import shutil
import urllib2

if os.path.isfile(filename):
    print 'Dataset found'
else:
    print 'Downloading dataset'
    f = urllib2.urlopen('https://github.com/dimosr/Big_Data/raw/master/resources/user-ct-test-collection-01.txt.gz', 
                        context=ssl.SSLContext(ssl.PROTOCOL_TLS))
    with open(os.path.basename(filename + '.gz'), 'wb') as local_file:
        local_file.write(f.read())
    print 'Download complete, unzipping'
    with gzip.open(filename + '.gz', 'rb') as f_in:
        with open(filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(filename + '.gz')        
    print 'Unzip complete'

Downloading dataset
Download complete, unzipping
Unzip complete


## Preparing the Dataset

In [22]:
from pyspark.sql.functions import *

# reading the data
rdd = sc.textFile(filename).map(lambda x: x.split('\t'))
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)

# preprocessing
df = rdd.toDF(header)
df = df.withColumn('Query', regexp_replace('Query', 'www.|.html|.com|.gov|.org|.co|.net|.edu|.mil', ''))
df = df.withColumn('Query', trim(col('Query')))
df = df.filter(length('Query') > 1)
df.createOrReplaceTempView('df')
df = spark.sql('SELECT AnonID, Query FROM df GROUP BY 1, 2')

# reordering the data
rdd = df.rdd.map(list).map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
rdd.collect()

[(u'8883165',
  [u'zora banks pics',
   u'tennis shoes',
   u'precious in his sight',
   u'mortgage calculator',
   u'precious-in-his-site',
   u'wellsfar',
   u'preciousinhissite',
   u'wellsfargo',
   u'nursey dration',
   u'precious in his site',
   u'xnxx',
   u'counterstools',
   u'pier',
   u'weargo',
   u'worldmarket',
   u'womens shoes',
   u'baby dolls with glasses',
   u'air jordan shoes',
   u'preciousinhissight',
   u'american girl dolls',
   u'pierone',
   u'clip art',
   u'tennie shoes']),
 (u'11351870',
  [u'qvc blooper',
   u'arcade in lake ge village',
   u'grubs',
   u'stores in lake ge village',
   u'ncis',
   u'lorelei resturant keys',
   u'lorelei resturant',
   u'florida teacher in the news',
   u'lake ge village',
   u'lorelei restaurant',
   u'debra lafave',
   u'metal-sludge.tv',
   u'wizards in lake ge ny',
   u'kathleenhannarealestate',
   u'theradiochick',
   u'debbielafave',
   u't3zb 2f 2fhqzom 3d5923-20060326-093119-3433',
   u'jeff benton',
   u'get rib 

In [5]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import count, col

# minutes = lambda i: i * 60 

# w = (Window()
#    .partitionBy(col('AnonID'))
#    .orderBy(col('QueryTime').cast('timestamp').cast('long'))
#    .rangeBetween(-minutes(10), 0))

# df.select(col('*'), count('Query').over(w).alias('mean')).show()