# Rule Based Machine Learning Algorithm

In [1]:
dataset_filename = 'user-ct-test-collection-01.txt'
rules_filename = 'rules.csv'

## Hyperparameters

In [2]:
hyperparameters = {
    'strings_to_remove': 'myspace|google|yahoo|www.|.html|.com|.gov|.org|.co|.net|.biz|.edu|.mil',
    'min_length': 1,
    'min_support': 5,
    'sample_percentage': -1,
    'is_training': False,
}

## Get Dataset

In [3]:
if hyperparameters['is_training']:
    import os
    import gzip
    import ssl
    import shutil
    import urllib2

    if os.path.isfile(dataset_filename):
        print 'Dataset found'
    else:
        print 'Downloading dataset'
        f = urllib2.urlopen('https://github.com/dimosr/Big_Data/raw/master/resources/user-ct-test-collection-01.txt.gz', 
                            context=ssl.SSLContext(ssl.PROTOCOL_TLS))
        with open(os.path.basename(dataset_filename + '.gz'), 'wb') as local_file:
            local_file.write(f.read())
        print 'Download complete, unzipping'
        with gzip.open(dataset_filename + '.gz', 'rb') as f_in:
            with open(dataset_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(dataset_filename + '.gz')        
        print 'Unzip complete'

## Preparing the Dataset
##### Delete too generic queries such as "google", "yahoo", etc.
##### Delete too short queries.
##### create our main rdd: each user has it's queries list, with the number 1 attached to the query, as a preperation for the reduce stage.

In [4]:
if hyperparameters['is_training']:
    from pyspark.sql.functions import *

    # reading the data
    rdd = sc.textFile(dataset_filename).map(lambda x: x.split('\t'))
    header = rdd.first()
    rdd = rdd.filter(lambda line: line != header)
    # preprocessing
    df = rdd.toDF(header)
    if hyperparameters['sample_percentage'] > 0:
        df = df.sample(False, hyperparameters['sample_percentage'], 42)
    # clean out some queries
    df = df.withColumn('Query', regexp_replace('Query', hyperparameters['strings_to_remove'], ''))
    df = df.withColumn('Query', trim(col('Query')))
    df = df.filter(length('Query') > hyperparameters['min_length'])
    # reordering the data to the format - (AnonId, [(Query i, 1), (Query i+1, 1), ...])
    rdd = df.rdd.map(list).map(
        lambda x: (x[0], [x[1]])).reduceByKey(
        lambda x, y: x + y).map(
        lambda x: (x[0], list({(y, 1) for y in x[1]})))

## Building the Rules Set

### Create Queries Frequencies
##### here we create a seperate rdd that contain all of the queries from the dataset and their frequencies.
##### we filter out queries with frequency that is lower than min support

In [5]:
if hyperparameters['is_training']:
    # creating an RDD with all queries and their frequencies in the dataset - (Query, Frequency)
    query_frequency = rdd.flatMap(
        lambda x: x[1]).reduceByKey(
        lambda x, y: x + y).filter(
        # filtering out queries that are below the min support threshold
        lambda x: x[1] >= hyperparameters['min_support'])

### Create Query Pairs Frequencies
##### we create all of the possible pairs of queries and their frequencies.

In [6]:
if hyperparameters['is_training']:
    import itertools

    SEPARATOR = '####'
    rdd = rdd.map(
        # get (AnonId, [Query i, Query i+1, ...])
        lambda x: (x[0], [y[0] for y in x[1]])).map(
        # get all combinations of pairs of queries in the format [(Query i####Query j, 1), ...]
        lambda x: [(pair[0] + SEPARATOR + pair[1], 1) for pair in itertools.combinations(x[1], 2)]).flatMap(
        lambda x: x).reduceByKey(
        # reduce count the pairs frequencies resulting in [(Query i####Query i+1, Frequency), ...]
        lambda x, y: x + y).map(
        # remove the separator so format is now [(Query i, Query i+1, Frequency), ...]
        lambda x: (x[0].split(SEPARATOR)[0], x[0].split(SEPARATOR)[1], x[1])).map(
        # handle (QueryI, QueryJ, Frequency) vs. (QueryJ, QueryI, Frequency) which is the same by joining
        lambda x: (((x[0], (x[1], x[2]))), (x[1], (x[0], x[2])))).flatMap(
        # finally return (Query i, (Query i+1, Frequency))
        lambda x: x)

### Join Query Pairs Frequencies with Queries Frequencies
##### we join the main rdd with the query frequencirs rdd, in order to get all of the query pairs, the frequency of the queries as they appear together, and the frequency of the first query from the pair

In [7]:
if hyperparameters['is_training']:
     # join the two rdds to get results in the format of (X, (Y, Frequency X and Y), Frequency X)
    rdd = rdd.join(query_frequency).map(
        # flatten to the format [X, Y, Frequency X and Y, Frequency X]
        lambda x: [x[0]] + list(x[1][0]) + [x[1][1]])

### Calculate Confidence and Write to CSV
##### In order to get the confidence, we divide the query pair frequency by the frequency of the first query from the pair.

In [8]:
if hyperparameters['is_training']:
    import os
    import shutil
    
    if os.path.isdir(rules_filename):
        shutil.rmtree(rules_filename)
    rdd.map(
        # calculate confidence and output the format [X, Y, confidence X=>Y]
        lambda x: [x[0], x[1], float(x[2]) / x[3]]).filter(
        # filter queries with confidence < 0.6 since we only care about >= 0.6 for the exercise
        lambda x: x[2] >= 0.6).map(
        # save the results to a csv in the format of Sx, Sy, Confidence Sx=>Sy
        lambda x: ','.join(unicode(y) for y in x)).saveAsTextFile(rules_filename)

## Loading the Rules Set

In [9]:
if os.path.isdir(rules_filename):
    from pyspark.sql.types import *

    # read rules set from csv files to dataframe
    df = sqlContext.read.format('csv').schema(StructType(
        [StructField('Sx', StringType(), True), 
         StructField('Sy', StringType(), True), 
         StructField('Confidence', FloatType(), True)])).load(rules_filename)
else:
    print 'Rules set not found, run training first to create it'

### Confidence = 0.6

In [10]:
df.orderBy('Confidence').show(1000)

+--------------------+--------------------+----------+
|                  Sx|                  Sy|Confidence|
+--------------------+--------------------+----------+
|           padre pio|                ebay|       0.6|
|       cute  layouts|             layouts|       0.6|
|        auto locator|                ebay|       0.6|
|            hot guys|           askjeeves|       0.6|
|           lierotica|          literotica|       0.6|
|       maronda homes|            mapquest|       0.6|
|              porche|             hotmail|       0.6|
|    layout generator|             layouts|       0.6|
|     bmw motorcycles| triumph motorcycles|       0.6|
|           wow cable|             walmart|       0.6|
|       cute  layouts|                  ns|       0.6|
|          sc lottery|                ebay|       0.6|
|        sofa express|             walmart|       0.6|
|       soverign bank|      sovereign bank|       0.6|
| goodmorning america|                 abc|       0.6|
|         

#### Interesting results with confidence 0.6:

1. goodmorning america => abc:
Good Morning America" is the Emmy-winning morning news program featuring anchors George Stephanopoulos, Robin Roberts, Lara Spencer, Amy Robach and Ginger Zee.
The program is broadcast on abc channel.
For more information: https://abc.go.com/shows/good-morning-america

2. gojane => target:
target is an online shopping website: https://www.target.com/

gojane is a women's footware brand, whose products are sold in target stores: https://www.gojane.com/.

3. gta san andreas c... => game cheats:
gta san andreas is a video game for PC, PS4 and more consoles: http://www.rockstargames.com/sanandreas/.

in this game you can get access to some elemnts (guns, cars, planes, etc) by using cheats.





### Confidence = 0.8

In [11]:
df.filter('Confidence >= 0.8').orderBy('Confidence').show(1000)

+--------------------+--------------------+----------+
|                  Sx|                  Sy|Confidence|
+--------------------+--------------------+----------+
|         citizensban|        citizensbank|       0.8|
|           capdtimes|            mapquest|       0.8|
|american idol mes...|       american idol|       0.8|
|              eather|             weather|       0.8|
|            tattoo's|              tattoo|       0.8|
|               hotml|             hotmail|       0.8|
|        white castle|             walmart|       0.8|
|bakersfield calif...|                ebay|       0.8|
|         cicuit city|        circuit city|       0.8|
|          hootersair|            mapquest|       0.8|
|           caigslist|          craigslist|       0.8|
|             my spae|            my space|       0.8|
|       eelpresidente|      beelpresidente|       0.8|
|          nascarhttp|              nascar|       0.8|
|         live 20help|           live help|       0.8|
|         

#### Interesting results with confidence 0.8:
1. fireplace inserts => lowes:
lowes  is a company for home products, and they also sell fireplaces. 
https://www.lowes.com/search?searchTerm=fireplace

2. yugiohgx => cartoowork:
yu-gi-ho is a tv series that broadcast on cartoon network channel.

3. flavorflav => vh1:
flavorflav is the stage name on an american musician whose songs were played on vh1 music channel.

### Confidence = 0.9

In [12]:
df.filter('Confidence >= 0.9').orderBy('Confidence').show(1000)

+------------------+-------------------+----------+
|                Sx|                 Sy|Confidence|
+------------------+-------------------+----------+
|        bookstores|               ebay|       1.0|
|      amercan idol|      american idol|       1.0|
|     american idal|      american idol|       1.0|
|          ap quest|          map quest|       1.0|
|              uest|           mapquest|       1.0|
|           idelity|           fidelity|       1.0|
|     astrologyzone|          evo.qksrv|       1.0|
|william shakespear|william shakespeare|       1.0|
+------------------+-------------------+----------+



#### Interesting results with confidence 0.9:
bookstores => ebay:
ebay sell almost everything, including books.