# 9장 Recommendation Engines Using MapReduce

## Friendship connection

![](spark09_01.jpg)

### Input Data

![](spark09_02.jpg)

- friends2.txt (각 줄의 형식: `<사용자 ID><TAB><친구 ID>,<친구 ID>,...` — 사용자 ID와 친구 목록은 TAB 문자로 구분)
```
1 2,3,4,5,6,7,8
2 1,3,4,5,7
3 1,2
4 1,2,6
5 1,2
6 1,4
7 1,2
8 1
```

### Output 

- USER :  F(M: [I1, I2, I3, ...]), ...
    - F : USER에게 친구로 추천하는 사람의 ID
    - M : USER와 F가 공통으로 아는 친구의 수
    - I1, I2, I3, ... : USER와 F가 공통으로 아는 친구의 ID

```
4: 3 (2: [1, 2]),5 (2: [1, 2]),7 (2: [1, 2]),8 (1: [1]),
2: 6 (2: [1, 4]),8 (1: [1]),
6: 2 (2: [1, 4]),3 (1: [1]),5 (1: [1]),7 (1: [1]),8 (1: [1]),
8: 2 (1: [1]),3 (1: [1]),4 (1: [1]),5 (1: [1]),6 (1: [1]),7 (1: [1]),
3: 4 (2: [1, 2]),5 (2: [1, 2]),6 (1: [1]),7 (2: [1, 2]),8 (1: [1]),
1:
7: 3 (2: [1, 2]),4 (2: [1, 2]),5 (2: [1, 2]),6 (1: [1]),8 (1: [1]),
5: 3 (2: [1, 2]),4 (2: [1, 2]),6 (1: [1]),7 (2: [1, 2]),8 (1: [1]),
```

## Spark Implementation

![](spark09_03.jpg)

### Step 3: Create a Spark context object

In [2]:
from pyspark import SparkContext
# Create the Spark context used by every later cell (no explicit configuration is passed).
sc = SparkContext() 
# Bare expression: in a notebook cell this displays the context object's repr.
sc

<pyspark.context.SparkContext at 0x7f8e5db3da50>

### Step 4: Read the HDFS input file and create an RDD

In [36]:
# Load the adjacency-list file as an RDD of text lines (one record per line);
# the second argument is the minimum number of partitions requested.
records = sc.textFile("friends2.txt", 1)

In [38]:
# Dump every raw input line to check that the file was read as expected.
# collect() brings the whole RDD back to the driver, so this is only
# suitable for small sample inputs.
for t in records.collect():
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("debug0 record: %s" % (t,))

debug0 record: 1	2,3,4,5,6,7,8
debug0 record: 2	1,3,4,5,7
debug0 record: 3	1,2
debug0 record: 4	1,2,6
debug0 record: 5	1,2
debug0 record: 6	1,4
debug0 record: 7	1,2
debug0 record: 8	1


### Step 5: Implement the map() function

In [39]:
def make_pairs(record):
    """Expand one adjacency-list line into the mapper's key/value pairs.

    Args:
        record: A line of the form ``<person><TAB><friend1>,<friend2>,...``.

    Returns:
        A list of ``(user, (other, via))`` tuples:

        * ``(person, (friend, -1))`` for every direct friend; ``-1`` marks
          "these two are already friends".
        * For every two friends ``a`` and ``b`` of ``person``, both
          ``(a, (b, person))`` and ``(b, (a, person))``: ``person`` is a
          mutual friend through which ``a`` and ``b`` could be recommended
          to each other.

        A blank record, or a person with no friends listed, yields ``[]``.

    Raises:
        ValueError: If an ID is not a valid integer literal.
    """
    # `long` only exists on Python 2; on Python 3 plain `int` is unbounded.
    try:
        to_id = long
    except NameError:
        to_id = int

    if not record.strip():
        return []  # blank line: nothing to emit

    tokens = record.split("\t")
    person = to_id(tokens[0])

    # The friend list may be missing entirely (no TAB) or be empty.
    friends_field = tokens[1] if len(tokens) > 1 else ""
    friends = [to_id(tok) for tok in friends_field.split(",") if tok.strip()]

    # Direct friendships first, tagged with the -1 sentinel.
    already_friends = to_id(-1)
    mapper_output = [(person, (friend, already_friends)) for friend in friends]

    # Every unordered pair of this person's friends shares `person` as a
    # mutual friend; emit the pair in both directions.
    for i, first in enumerate(friends):
        for second in friends[i + 1:]:
            mapper_output.append((first, (second, person)))
            mapper_output.append((second, (first, person)))

    return mapper_output

In [40]:
# Apply make_pairs to every input line and flatten the per-line lists into a single RDD of pairs.
pairs = records.flatMap( make_pairs  )

In [41]:
# Dump every (key, value) pair produced by the map step.
debug2 = pairs.collect()
for key, value in debug2:
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("debug1 key={}\t value={}".format(key, value))

debug1 key=1	 value=(2L, -1L)
debug1 key=1	 value=(3L, -1L)
debug1 key=1	 value=(4L, -1L)
debug1 key=1	 value=(5L, -1L)
debug1 key=1	 value=(6L, -1L)
debug1 key=1	 value=(7L, -1L)
debug1 key=1	 value=(8L, -1L)
debug1 key=2	 value=(3L, 1L)
debug1 key=3	 value=(2L, 1L)
debug1 key=2	 value=(4L, 1L)
debug1 key=4	 value=(2L, 1L)
debug1 key=2	 value=(5L, 1L)
debug1 key=5	 value=(2L, 1L)
debug1 key=2	 value=(6L, 1L)
debug1 key=6	 value=(2L, 1L)
debug1 key=2	 value=(7L, 1L)
debug1 key=7	 value=(2L, 1L)
debug1 key=2	 value=(8L, 1L)
debug1 key=8	 value=(2L, 1L)
debug1 key=3	 value=(4L, 1L)
debug1 key=4	 value=(3L, 1L)
debug1 key=3	 value=(5L, 1L)
debug1 key=5	 value=(3L, 1L)
debug1 key=3	 value=(6L, 1L)
debug1 key=6	 value=(3L, 1L)
debug1 key=3	 value=(7L, 1L)
debug1 key=7	 value=(3L, 1L)
debug1 key=3	 value=(8L, 1L)
debug1 key=8	 value=(3L, 1L)
debug1 key=4	 value=(5L, 1L)
debug1 key=5	 value=(4L, 1L)
debug1 key=4	 value=(6L, 1L)
debug1 key=6	 value=(4L, 1L)
debug1 key=4	 value=(7L, 1L)
... (remaining debug1 output truncated)

### Step 6: Implement the reduce() function

In [42]:
# Collect all values emitted for the same user key into one group per user.
grouped = pairs.groupByKey()

In [43]:
# Dump each user together with the concatenated tuples grouped under it.
debug3 = grouped.collect()
for key, values in debug3:
    joined = "".join(str(v) for v in values)
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("debug3 key={}\t value={}".format(key, joined))

debug3 key=1	 value=(2L, -1L)(3L, -1L)(4L, -1L)(5L, -1L)(6L, -1L)(7L, -1L)(8L, -1L)(3L, 2L)(4L, 2L)(5L, 2L)(7L, 2L)(2L, 3L)(2L, 4L)(6L, 4L)(2L, 5L)(4L, 6L)(2L, 7L)
debug3 key=2	 value=(3L, 1L)(4L, 1L)(5L, 1L)(6L, 1L)(7L, 1L)(8L, 1L)(1L, -1L)(3L, -1L)(4L, -1L)(5L, -1L)(7L, -1L)(1L, 3L)(1L, 4L)(6L, 4L)(1L, 5L)(1L, 7L)
debug3 key=3	 value=(2L, 1L)(4L, 1L)(5L, 1L)(6L, 1L)(7L, 1L)(8L, 1L)(1L, 2L)(4L, 2L)(5L, 2L)(7L, 2L)(1L, -1L)(2L, -1L)
debug3 key=4	 value=(2L, 1L)(3L, 1L)(5L, 1L)(6L, 1L)(7L, 1L)(8L, 1L)(1L, 2L)(3L, 2L)(5L, 2L)(7L, 2L)(1L, -1L)(2L, -1L)(6L, -1L)(1L, 6L)
debug3 key=5	 value=(2L, 1L)(3L, 1L)(4L, 1L)(6L, 1L)(7L, 1L)(8L, 1L)(1L, 2L)(3L, 2L)(4L, 2L)(7L, 2L)(1L, -1L)(2L, -1L)
debug3 key=6	 value=(2L, 1L)(3L, 1L)(4L, 1L)(5L, 1L)(7L, 1L)(8L, 1L)(1L, 4L)(2L, 4L)(1L, -1L)(4L, -1L)
debug3 key=7	 value=(2L, 1L)(3L, 1L)(4L, 1L)(5L, 1L)(6L, 1L)(8L, 1L)(1L, 2L)(3L, 2L)(4L, 2L)(5L, 2L)(1L, -1L)(2L, -1L)
debug3 key=8	 value=(2L, 1L)(3L, 1L)(4L, 1L)(5L, 1L)(6L, 1L)(7L, 1L)(1L, -1L)


### Step 7: Generate final output

In [57]:
def buildRecommendations(mutualFriends):
    """Format a user's recommendation candidates as a single string.

    Args:
        mutualFriends: Dict mapping a candidate user ID to the list of
            mutual-friend IDs, or to ``None`` when the candidate must not
            be recommended.

    Returns:
        The concatenation of ``"<candidate>(<count>:<mutual friends>),"``
        for every candidate whose value is not ``None``, ordered by
        ascending candidate ID so the output is deterministic. Returns
        ``""`` when there is nothing to recommend.
    """
    parts = []
    for candidate in sorted(mutualFriends):
        shared = mutualFriends[candidate]
        if shared is None:
            # None marks a candidate that is excluded from the output.
            continue
        parts.append("%s(%d:%s)," % (candidate, len(shared), shared))
    return "".join(parts)

In [58]:
def make_recommend(values):
    """Reduce one user's grouped tuples to a recommendation string.

    Args:
        values: Iterable of ``(toUser, mutualFriend)`` tuples for a single
            user. ``mutualFriend == -1`` means the user and ``toUser`` are
            already direct friends.

    Returns:
        The string produced by ``buildRecommendations`` for the collected
        candidates. Candidates that are already direct friends are stored
        as ``None`` before formatting.
    """
    mutualFriends = {}  # candidate ID -> list of mutual friends, or None
    for toUser, mutualFriend in values:
        if mutualFriend == -1:
            # Already friends: mark as excluded, discarding anything
            # collected so far. The marker is never overwritten later.
            mutualFriends[toUser] = None
        elif toUser not in mutualFriends:
            mutualFriends[toUser] = [mutualFriend]
        elif mutualFriends[toUser] is not None:
            mutualFriends[toUser].append(mutualFriend)
        # else: candidate is already excluded; ignore this mutual friend.

    return buildRecommendations(mutualFriends)

In [59]:
# Replace each user's grouped tuples with the string returned by make_recommend; keys are left as they are.
recommendations = grouped.mapValues( make_recommend )

In [60]:
# Dump the final recommendation string for every user.
debug4 = recommendations.collect()
for key, value in debug4:
    # `value` is already the string returned by make_recommend, so it is
    # printed directly instead of being re-joined character by character.
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("debug4 key={}\t value={}".format(key, value))

debug4 key=1	 value=
debug4 key=2	 value=6(2:[1L, 4L]),8(1:[1L]),
debug4 key=3	 value=4(2:[1L, 2L]),5(2:[1L, 2L]),6(1:[1L]),7(2:[1L, 2L]),8(1:[1L]),
debug4 key=4	 value=3(2:[1L, 2L]),5(2:[1L, 2L]),7(2:[1L, 2L]),8(1:[1L]),
debug4 key=5	 value=3(2:[1L, 2L]),4(2:[1L, 2L]),6(1:[1L]),7(2:[1L, 2L]),8(1:[1L]),
debug4 key=6	 value=2(2:[1L, 4L]),3(1:[1L]),5(1:[1L]),7(1:[1L]),8(1:[1L]),
debug4 key=7	 value=3(2:[1L, 2L]),4(2:[1L, 2L]),5(2:[1L, 2L]),6(1:[1L]),8(1:[1L]),
debug4 key=8	 value=2(1:[1L]),3(1:[1L]),4(1:[1L]),5(1:[1L]),6(1:[1L]),7(1:[1L]),
