In [1]:
from pyspark import SparkContext
from itertools import combinations

In [2]:
sc = SparkContext(appName="Assignment1")

22/04/09 20:53:54 WARN Utils: Your hostname, jarvis resolves to a loopback address: 127.0.1.1; using 192.168.1.7 instead (on interface wlp4s0)
22/04/09 20:53:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/09 20:54:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Ex 1.1

### Import the Dataset 

In [3]:
data = sc.textFile('dataset/conditions.csv')            # reading the dataset
header = data.first()                                   # first line is the header
data = data.filter(lambda line: line != header)\
    .map(lambda line: line.split(","))\
    .map(lambda line: (line[2], {line[-2]}))\
    .reduceByKey(lambda code1, code2: code1 | code2)

    # take the header off the list of rows
    # split each row by the cells
    # get only the patient number and the {condition code}
    # group the conditions of every patient in a set 

                                                                                

In [4]:
data.take(2)

                                                                                

[('3826037f-19e0-4c7b-98e5-4e9578472f67',
  {'162864005', '24079001', '55822004', '65966004'}),
 ('8bfe7590-c750-405d-9699-07218f203475',
  {'198992004',
   '284549007',
   '43878008',
   '44465007',
   '444814009',
   '446096008',
   '53741008',
   '72892002'})]

### First Pass

In [5]:
counts = data\
        .flatMap(lambda patient: patient[1])\
        .map(lambda condition: (condition,1))\
        .reduceByKey(lambda count1, count2: count1 + count2)

        # get a list of condition codes
        # (condition, 1)
        # (condition, count)

In [6]:
counts.sortBy(lambda x:x[1], ascending=False).take(10)

                                                                                

[('444814009', 751940),
 ('195662009', 524692),
 ('10509002', 461495),
 ('162864005', 365567),
 ('271737000', 355372),
 ('15777000', 354315),
 ('59621000', 305134),
 ('40055000', 250239),
 ('72892002', 205390),
 ('19169002', 201894)]

### Frequent Items

In [7]:
support_threshold = 1000
frequent_items = counts\
                .filter(lambda condition: condition[1]  >= support_threshold)\
                .map(lambda condition: condition[0])

                # from (condition, count) only leave the ones with a count of at least 1000
                # get a list of the frequent condition codes 

In [8]:
f_items = frequent_items.collect()  # get a list out of the rdd above
f_items[1:10]

['363406005',
 '443165006',
 '368581000119106',
 '40275004',
 '239720000',
 '44465007',
 '283371005',
 '156073000',
 '230265002']

In [9]:
frequent_data = data\
            .map(lambda basket: (basket[0], {code for code in basket[1] if code in f_items}))   # list of (patient number, {frequent condition set})

### Second Pass

In [10]:
k = 2

In [11]:
candidate_pairs = frequent_data\
                .map(lambda basket: (basket[0], {i for i in combinations(basket[1],k)}))    # get a list of (patient number, {candidate frequent pairs})

In [12]:
candidate_pairs.first()

('3826037f-19e0-4c7b-98e5-4e9578472f67',
 {('162864005', '24079001'),
  ('55822004', '162864005'),
  ('55822004', '24079001'),
  ('65966004', '162864005'),
  ('65966004', '24079001'),
  ('65966004', '55822004')})

In [13]:
counts2 = candidate_pairs\
        .flatMap(lambda patient: patient[1])\
        .map(lambda condition: (condition,1))\
        .reduceByKey(lambda count1, count2: count1 + count2)
        
        # list of candidate frequent pairs
        # ((frequent pair), 1)
        # ((frequent pair), count)

In [14]:
counts2.sortBy(lambda tuplet: tuplet[1], ascending=False).map(lambda tuplet: tuplet[0]).take(10)

                                                                                

[('15777000', '271737000'),
 ('444814009', '195662009'),
 ('444814009', '162864005'),
 ('10509002', '444814009'),
 ('15777000', '444814009'),
 ('271737000', '444814009'),
 ('59621000', '444814009'),
 ('10509002', '195662009'),
 ('40055000', '444814009'),
 ('271737000', '195662009')]

### Third Pass

In [15]:
k = 3

In [16]:
candidate_triples = frequent_data\
                .map(lambda basket: (basket[0], {i for i in combinations(basket[1],k)}))    # get a list of (patient number, {candidate frequent triples})

In [17]:
candidate_triples.first()

('3826037f-19e0-4c7b-98e5-4e9578472f67',
 {('55822004', '162864005', '24079001'),
  ('65966004', '162864005', '24079001'),
  ('65966004', '55822004', '162864005'),
  ('65966004', '55822004', '24079001')})

In [18]:
counts3 = candidate_pairs\
        .flatMap(lambda patient: patient[1])\
        .map(lambda condition: (condition,1))\
        .reduceByKey(lambda count1, count2: count1 + count2)
        
        # list of candidate frequent triples
        # ((frequent triple), 1)
        # ((frequent triple), count)

In [19]:
counts3.sortBy(lambda triplet: triplet[1], ascending=False).map(lambda triplet: triplet[0]).take(10)

                                                                                

[('15777000', '271737000', '444814009'),
 ('15777000', '271737000', '195662009'),
 ('10509002', '444814009', '195662009'),
 ('15777000', '444814009', '195662009'),
 ('271737000', '444814009', '195662009'),
 ('444814009', '162864005', '195662009'),
 ('15777000', '10509002', '271737000'),
 ('10509002', '444814009', '162864005'),
 ('15777000', '10509002', '444814009'),
 ('59621000', '444814009', '195662009')]

In [20]:
max_k = 3
support_threshold = 1000

for k in range(1,max_k + 1):  
    
    if k == 1:
        counts = data.flatMap(lambda patient: patient[1]).map(lambda condition: (condition,1)).reduceByKey(lambda count1, count2: count1 + count2)
        frequent_items = counts.filter(lambda condition: condition[1]  >= support_threshold).map(lambda condition: condition[0]).collect()
        data = data.map(lambda basket: (basket[0], {code for code in basket[1] if code in f_items}))

    else:
        candidate_pairs = data.map(lambda basket: (basket[0], {i for i in combinations(basket[1],k)}))
        print(candidate_pairs.flatMap(lambda patient: patient[1]).map(lambda condition: (condition,1)).reduceByKey(lambda count1, count2: count1 + count2).sortBy(lambda triplet: triplet[1], ascending=False).map(lambda triplet: triplet[0]).take(10))

    k += 1

                                                                                

[('15777000', '271737000'), ('444814009', '195662009'), ('444814009', '162864005'), ('10509002', '444814009'), ('15777000', '444814009'), ('271737000', '444814009'), ('59621000', '444814009'), ('10509002', '195662009'), ('40055000', '444814009'), ('271737000', '195662009')]




[('15777000', '271737000', '444814009'), ('15777000', '271737000', '195662009'), ('10509002', '444814009', '195662009'), ('15777000', '444814009', '195662009'), ('271737000', '444814009', '195662009'), ('444814009', '162864005', '195662009'), ('15777000', '10509002', '271737000'), ('10509002', '444814009', '162864005'), ('15777000', '10509002', '444814009'), ('59621000', '444814009', '195662009')]


                                                                                

# Ex 1.2