In [22]:
# Minimum support: a query is kept only when its share of all transactions exceeds this value.
SUPPORT_TRASHOLD=0.005
import pyspark.sql.functions as F
from pyspark.sql import Row
import time
import datetime
import math
import re

def findTimeStampAndRoundIt(s):
    """Extract the first ``YYYY-MM-DD HH:MM:SS`` timestamp in ``s``, floored to 10 minutes.

    Args:
        s: Text containing a timestamp in that layout.

    Returns:
        A ``datetime.datetime`` whose minute is rounded down to a multiple of
        ten and whose seconds are zero.

    Raises:
        AttributeError: if ``s`` contains no such timestamp.
    """
    stamp_text = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', s).group()
    parsed = datetime.datetime.strptime(stamp_text, '%Y-%m-%d %H:%M:%S')
    # Integer division drops the units digit of the minute (e.g. 37 -> 30).
    return parsed.replace(minute=(parsed.minute // 10) * 10, second=0)

def removeTimestamp(row):
    """Return the part of ``row`` that precedes its timestamp.

    The cut is made at the first full ``YYYY-MM-DD HH:MM:SS`` timestamp, the
    same layout that findTimeStampAndRoundIt parses. Anchoring on the full
    timestamp rather than on a bare date keeps search text that merely
    contains a date (e.g. "concert 2006-05-01") intact.

    Args:
        row: One raw log line.

    Returns:
        ``row`` truncated just before the timestamp (trailing separators are
        left in place).

    Raises:
        AttributeError: if ``row`` contains no timestamp.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', row)
    return row[:match.start()]

def spliteToUserIdAndUserSearch(row,timestamp):
    """Turn a tab-separated "user id, search text" string into a keyed record.

    Args:
        row: Text of the form ``<user id><TAB><search text>``; trailing tabs
            after the search text are stripped.
        timestamp: Object with ``strftime`` (e.g. ``datetime.datetime``).

    Returns:
        ``((user_id, "YYYY-MM-DD HH:MM:SS"), search_text)`` with ``user_id``
        converted to ``int``.
    """
    fields = row.split("\t", 1)
    search_text = fields[1].rstrip('\t')
    user_id = int(fields[0])
    bucket = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    return ((user_id, bucket), search_text)

def uniqueList(line):
    """Return ``[line[0], values]`` where ``values`` is ``line[1]`` without duplicates.

    The order of the de-duplicated values is whatever ``set`` iteration
    yields; it is not the input order.
    """
    distinct_values = list(set(line[1]))
    return [line[0], distinct_values]

log_txt = sc.textFile("user-searches-min.txt")
header = log_txt.first()

# Drop the header row (the first line of the file) so only data rows are parsed.
log_txt = log_txt.filter(lambda line: line != header)

# Each row becomes ((user id, 10-minute time bucket), search text); exact repeats are removed.
logSearch = (
    log_txt
    .map(lambda raw: spliteToUserIdAndUserSearch(removeTimestamp(raw), findTimeStampAndRoundIt(raw)))
    .distinct()
)

TypeError: No default accumulator param for type <class 'list'>

In [23]:
start = time.time()
# A transaction is one distinct (user id, time bucket) key of logSearch; count them.
totalOfTransactions = logSearch.keys().distinct().count()

In [24]:
# Show the number of distinct (user id, time bucket) keys counted above.
print(totalOfTransactions)

10


In [25]:
# Drop the (user id, time bucket) key and keep only the search text.
all_queries = logSearch.values()
print(all_queries.take(10))
# Support of a query = number of records carrying it / totalOfTransactions;
# only queries whose support exceeds SUPPORT_TRASHOLD are kept.
suportX = (
    all_queries.map(lambda query: (query, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda item: (item[0], item[1] / totalOfTransactions))
    .filter(lambda item: item[1] > SUPPORT_TRASHOLD)
)

# suportX holds the (query, support) pairs that passed the support threshold
suportX.take(10)

['a', 'b', 'b', 'a', 'a', 'c', 'c', 'myspace.com', 'dfdf', 'vaniqa.comh']


[('a', 0.3), ('b', 0.2), ('c', 0.2)]

In [41]:
# Queries that passed the support threshold (the keys of suportX), collected to the driver as a list.
validItems = suportX.keys().collect()


In [42]:
def include_queries_by_support_thrasholds(inValidList,validItems):
    """Return the items of ``inValidList`` that also occur in ``validItems``.

    The order and any duplicates of ``inValidList`` are preserved.
    """
    kept = []
    for query in inValidList:
        if query in validItems:
            kept.append(query)
    return kept


In [43]:
# Collect each user's searches into one list. logSearch is distinct per
# (user id, time bucket), so the same query can still appear once per bucket;
# dict.fromkeys drops those repeats while keeping first-seen order, so a user
# never contributes an (a, a) pair or the same pair more than once.
user_query = (
    logSearch.map(lambda x: (x[0][0], x[1]))
    .groupByKey()
    .mapValues(lambda queries: list(dict.fromkeys(queries)))
    .filter(lambda kv: len(kv[1]) > 1)  # a pair needs at least two different queries
)
# Keep only the queries that passed the support threshold.
user_query = user_query.map(lambda t: (t[0], include_queries_by_support_thrasholds(t[1], validItems)))
user_query.take(20)

[(1, ['a', 'b', 'c']), (2, ['b', 'a']), (3, ['a', 'c'])]

In [29]:
def get_all_pairs(arr):
    """Return every pair ``(arr[i], arr[j])`` with ``i < j``, in index order.

    Sequences with fewer than two elements yield an empty list.
    """
    size = len(arr)
    return [
        (arr[left], arr[right])
        for left in range(size)
        for right in range(left + 1, size)
    ]
    
# Drop the user id and expand each user's query list into all of its pairs.
all_queries_pairs_tuples = user_query.values().flatMap(get_all_pairs)
all_queries_pairs_tuples.take(50)
# Conceptually, one row per user with a 0/1 flag per query:
# userId | a, b, c, d
# 1      | 1, 1, 0, 0 -> (a, b)
# 2      | 0, 1, 1, 0 -> (b, c)

[('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'a'), ('a', 'c')]

In [30]:

def sort_small_list(arr):
    """Return the pair ``arr`` with its smaller element first.

    An already ordered ``arr`` is returned as-is (the same object); otherwise
    a new two-element list ``[arr[1], arr[0]]`` is returned.
    """
    first, second = arr[0], arr[1]
    return arr if first <= second else [second, first]
# Order each pair so (a, b) and (b, a) share one key: (a,b) (b,a) => (a,b) (a,b) => ((a,b), 2)
all_queries_tuples_sorted = all_queries_pairs_tuples.map(
    lambda pair: tuple(sort_small_list(list(pair)))
)

# Count each ordered pair, keep pairs seen more than once, and turn the count
# into a support value by dividing by the number of transactions.
all_queries_pairs_tuples_count = (
    all_queries_tuples_sorted.map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda item: item[1] > 1)
    .map(lambda item: (item[0], item[1] / totalOfTransactions))
)


all_queries_pairs_tuples_count.take(10)


[(('a', 'b'), 0.2), (('a', 'c'), 0.2)]

In [31]:
# Combine every ((x, y), pair support) record with every (query, support) record of suportX;
# the matching records are picked out by the filters applied to this RDD afterwards.
rdd_queries_tuples_cartesian = all_queries_pairs_tuples_count.cartesian(suportX)
rdd_queries_tuples_cartesian.take(3)

[((('a', 'b'), 0.2), ('a', 0.3)),
 ((('a', 'b'), 0.2), ('b', 0.2)),
 ((('a', 'b'), 0.2), ('c', 0.2))]

In [32]:
# Confidence of x -> y = support(x, y) / support(x).
# Each cartesian record is (((x, y), support(x, y)), (z, support(z))); keep those with z == x.
rdd_join_left = (
    rdd_queries_tuples_cartesian
    .filter(lambda rec: rec[0][0][0] == rec[1][0])
    .map(lambda rec: (rec[0][0][0], rec[0][0][1], float(rec[0][1]) / rec[1][1]))
)

print(rdd_join_left.take(5))
# Confidence of y -> x = support(x, y) / support(y): same records, but keep those with z == y.

rdd_join_right = (
    rdd_queries_tuples_cartesian
    .filter(lambda rec: rec[0][0][1] == rec[1][0])
    .map(lambda rec: (rec[0][0][1], rec[0][0][0], float(rec[0][1]) / rec[1][1]))
)
print(rdd_join_right.take(5))

# All (antecedent, consequent, confidence) rules in both directions.
rdd_query_conf = sc.union([rdd_join_left, rdd_join_right])

[('a', 'b', 0.6666666666666667), ('a', 'c', 0.6666666666666667)]
[('b', 'a', 1.0), ('c', 'a', 1.0)]


In [33]:
# Wall-clock time elapsed since `start` was recorded, formatted as HH:MM:SS.
end = time.time()
elapsed_time = time.strftime("%H:%M:%S", time.gmtime(end - start))
print("elapsed time: %s" % elapsed_time)

elapsed time: 00:00:06


In [None]:
#######################
####### TASK 2  #######
#######################
# Export the (X, Y, CONFIDENCE) rules as CSV with a header row; coalesce(1)
# brings the data into one partition before writing.
rddQueryConfDF = sqlContext.createDataFrame(rdd_query_conf, schema=["X", "Y","CONFIDENCE"])
(
    rddQueryConfDF.coalesce(1)
    .write.format('com.databricks.spark.csv')
    .option('header', 'true')
    .save('/home/kfir/Desktop/Ex2/Final/my.csv')
)

In [None]:
#########################
####### TASK 4.a ########
#########################

def filterConfidence(line,Conf):
    """Tell whether the confidence stored at ``line[2]`` is at least ``Conf``.

    Returns:
        ``True`` when ``line[2] >= Conf``, otherwise ``False``.
    """
    return True if line[2] >= Conf else False

# Each stricter threshold filters the previous result: a rule with
# confidence >= 0.8 has already passed the 0.6 filter, and so on.
x_y_conf06 = rdd_query_conf.filter(lambda rule: filterConfidence(rule, 0.6))
print('the amount of related queries for 0.6 confidence')
print(x_y_conf06.count())

x_y_conf08 = x_y_conf06.filter(lambda rule: filterConfidence(rule, 0.8))
print('the amount of related queries for 0.8 confidence')
print(x_y_conf08.count())

x_y_conf09 = x_y_conf08.filter(lambda rule: filterConfidence(rule, 0.9))
print('the amount of related queries for 0.9 confidence')
print(x_y_conf09.count())


In [113]:
# def getAllPairsWithMinConfidence(conf,obj):
#     return obj.filter(lambda x: x[2]>=conf).count()

In [115]:
# print(getAllPairsWithMinConfidence(0.2,rdd_query_conf))
# print(getAllPairsWithMinConfidence(0.4,rdd_query_conf))
# print(getAllPairsWithMinConfidence(0.6,rdd_query_conf))



0
0
0
