# Friends of Friends Data Example

In [2]:
import os
import csv

import StringIO
from os import path as filepath
from pyspark import SparkConf, SparkContext

In [3]:
# Build the HDFS location of the input file. The part after the scheme comes
# from the HDFS environment variable; a KeyError is raised if it is not set.
HDFS = "hdfs://{}".format(os.environ["HDFS"])
# Directory that holds the dataset, and the dataset file itself.
USER = filepath.join(HDFS, "user", "ec2-user")
FILE = filepath.join(USER, "fof.txt")

In [4]:
# Read the input file as an RDD of text lines and preview the first five.
# NOTE(review): `sc` is not created in the cells shown here -- presumably the
# kernel supplies a ready-made SparkContext; confirm before a fresh run.
fof = sc.textFile(FILE)
fof.take(5)

[u'noreply@comsoc.org\tbenjamin@bengfort.com',
 u'paul.amayo@linacre.ox.ac.uk\trhodes@maillist.ox.ac.uk',
 u'notifications@github.com\tpartisan-discourse@noreply.github.com',
 u'messaging-digest-noreply@linkedin.com\tbenjamin@bengfort.com',
 u'noreply@youtube.com\tbenjamin@bengfort.com']

In [5]:
import re

# Fields on each line are separated by one or more tab characters.
pattern = re.compile("\t+")

# Turn every line into (sender, [friend, ...]): the first field is kept as-is
# and the second field is broken apart on commas.
fof_split = (
    fof
    .map(lambda line: pattern.split(line))
    .map(lambda fields: (fields[0], fields[1].split(",")))
)
fof_split.take(10)

[(u'noreply@comsoc.org', [u'benjamin@bengfort.com']),
 (u'paul.amayo@linacre.ox.ac.uk', [u'rhodes@maillist.ox.ac.uk']),
 (u'notifications@github.com', [u'partisan-discourse@noreply.github.com']),
 (u'messaging-digest-noreply@linkedin.com', [u'benjamin@bengfort.com']),
 (u'noreply@youtube.com', [u'benjamin@bengfort.com']),
 (u'info@meetup.com', [u'benjamin@bengfort.com']),
 (u'benjamin@bengfort.com', [u'keleher@cs.umd.edu']),
 (u'keleher@cs.umd.edu', [u'benjamin@bengfort.com']),
 (u'benjamin@bengfort.com', [u'keleher@cs.umd.edu']),
 (u'keleher@cs.umd.edu', [u'benjamin@bengfort.com'])]

## Basic Operations

### Count the total number of emails

In [6]:
# Number of records in the parsed RDD (one record per input line).
fof_split.count()

141616

### Count the number of unique senders

In [7]:
# The sender is the first element (the key) of every (sender, friends) record.
senders = fof_split.keys()
# Collapse repeated senders, then count what is left.
unique_senders = senders.distinct()
unique_senders.count()

3395

### Number of emails that came from a certain person

In [8]:
# Bring every record sent by this address back to the driver and count them.
# NOTE(review): if only the number is needed, calling .count() on the filtered
# RDD would avoid shipping the records to the driver.
from_sender = (
    fof_split
    .filter(lambda record: record[0] == 'benjamin@bengfort.com')
    .collect()
)
len(from_sender)

58333

In [9]:
# Same count for a second sender: collect the matching records, then measure.
# NOTE(review): if only the number is needed, calling .count() on the filtered
# RDD would avoid shipping the records to the driver.
from_sender = (
    fof_split
    .filter(lambda record: record[0] == 'ojedatony1616@gmail.com')
    .collect()
)
len(from_sender)

1718

## Shared Friendship

In the shared friendship task, the goal is to analyze a social network to see which friend relationships users have in common. This is both the first step toward downstream analytics such as “you might know” recommendations and a critical part of social networks that might only want to allow you to share with friends and friends-of-friends.

In [29]:
def pair_friends((person, friends)):
    for friend in friends:
        pair = sorted([person, friend])
        return (tuple(pair), set(friends))

In [30]:
paired = fof_split.map(pair_friends)
paired.take(10)

[((u'benjamin@bengfort.com', u'noreply@comsoc.org'),
  {u'benjamin@bengfort.com'}),
 ((u'paul.amayo@linacre.ox.ac.uk', u'rhodes@maillist.ox.ac.uk'),
  {u'rhodes@maillist.ox.ac.uk'}),
 ((u'notifications@github.com', u'partisan-discourse@noreply.github.com'),
  {u'partisan-discourse@noreply.github.com'}),
 ((u'benjamin@bengfort.com', u'messaging-digest-noreply@linkedin.com'),
  {u'benjamin@bengfort.com'}),
 ((u'benjamin@bengfort.com', u'noreply@youtube.com'),
  {u'benjamin@bengfort.com'}),
 ((u'benjamin@bengfort.com', u'info@meetup.com'), {u'benjamin@bengfort.com'}),
 ((u'benjamin@bengfort.com', u'keleher@cs.umd.edu'), {u'keleher@cs.umd.edu'}),
 ((u'benjamin@bengfort.com', u'keleher@cs.umd.edu'),
  {u'benjamin@bengfort.com'}),
 ((u'benjamin@bengfort.com', u'keleher@cs.umd.edu'), {u'keleher@cs.umd.edu'}),
 ((u'benjamin@bengfort.com', u'keleher@cs.umd.edu'),
  {u'benjamin@bengfort.com'})]

In [37]:
def shared_friends(fla, flb):
    """Return the friends that appear in both ``fla`` and ``flb``.

    ``fla`` is a set; ``flb`` may be any iterable. A new set is returned and
    neither input is modified.
    """
    return fla.intersection(flb)

In [40]:
# Intersect the friend sets gathered for each pair, then keep only the pairs
# that still have at least one friend in common.
reduced = (
    paired
    .reduceByKey(shared_friends)
    .filter(lambda pair_and_shared: len(pair_and_shared[1]) > 0)
)
reduced.take(10)

[((u'ben@klemens.org', u'harlan.harris@gmail.com'),
  {u'ajschumacher@gmail.com',
   u'alexcengler@gmail.com',
   u'amgallow@gmail.com',
   u'ben@klemens.org',
   u'board@datacommunitydc.org',
   u'cathryn.rabinowitz@gmail.com',
   u'deborah.bey1@gmail.com',
   u'dmitri.adler@gmail.com',
   u'elbertventura@gmail.com',
   u'emeritus@datacommunitydc.org',
   u'goukas@gmail.com',
   u'heather_klemick@yahoo.com',
   u'j.hustwit@gmail.com',
   u'jk2k.net@gmail.com',
   u'joshsz@gmail.com',
   u'jt@occams.info',
   u'justgrimes@gmail.com',
   u'kate.mereand@gmail.com',
   u'm@winteram.com',
   u'marcus.louie@gmail.com',
   u'me.price.99@gmail.com',
   u'merav@datasociety.co',
   u'mnortrup@yahoo.com',
   u'nora.g.albert@gmail.com',
   u'parrymarc@yahoo.com',
   u'rahul@mereand-sinha.com',
   u'shaungitt@gmail.com'}),
 ((u'mount@cs.umd.edu', u'mount@cs.umd.edu'),
  {u'mount@cs.umd.edu', u'pagh@itu.dk'}),
 ((u'bilbro@gmail.com', u'georgetown-data-analytics@googlegroups.com'),
  {u'georgetown-d