In [1]:
# Aim: compute the average numOfFriends (last column) across all rows that share the same age (3rd column).
# This notebook uses the PySpark RDD functions "mapValues()" and "reduceByKey()".

In [2]:
from pyspark import SparkConf, SparkContext
# Configure Spark to run on the "local" master under the application name "FriendsByAge".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("FriendsByAge")
# getOrCreate returns an already-running SparkContext if there is one, otherwise creates it from conf.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Load the CSV file as an RDD of strings, one element per line of the file.
lines = sc.textFile(name="/FileStore/tables/fakefriends.csv")

In [4]:
# Peek at the first five raw lines of the file.
# take(5) brings only the rows that are printed to the driver, instead of
# collecting the entire RDD and then slicing it.
llist = lines.take(5)
for line in llist:
  print(line)

In [5]:
# Each record is one comma-separated line; only the 'age' and 'numFriends' columns are needed.
# parseLine() extracts those two columns from a single line.
def parseLine(line):
  """Return (age, numFriends) parsed from one comma-separated line.

  The field at index 2 is converted to int as the age and the field at
  index 3 is converted to int as the number of friends. A line with fewer
  than four fields raises IndexError; a non-numeric field raises ValueError.
  """
  columns = line.split(',')
  return (int(columns[2]), int(columns[3]))

In [6]:
# Build an RDD of (age, numFriends) pairs by applying parseLine to every line.
rdd = lines.map(parseLine)
# Preview the first five pairs; take(5) avoids collecting the whole RDD
# onto the driver just to print a handful of rows.
collection = rdd.take(5)
for i in collection:
  print (i)

In [7]:
# The above results are now in the form (key, value) = (age, numOfFriends).
# Next we total the numOfFriends for each age.

# 1. First, use 'mapValues' to turn each value numOfFriends into the tuple (numOfFriends, 1).
#    The trailing 1 is a per-row counter, so sums and counts can later be added together.
totalByAge = rdd.mapValues(lambda x:(x,1))
# Preview only a few rows: this intermediate RDD still has one record per input row,
# so printing all of it would collect and dump the whole dataset.
for i in totalByAge.take(5):
  print (i)

In [8]:
# 2. Next, add up the (numOfFriends, 1) tuples of rows with the same age using 'reduceByKey'.
# The mapValues step is repeated here so the whole pipeline is built in one expression.
# 'a' and 'b' are the value tuples of two rows that share the same age; the result
# for each age is (sum of numOfFriends, number of rows).
totalByAge = (
    rdd
    .mapValues(lambda friends: (friends, 1))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

In [9]:
# Print every (age, (sum of numOfFriends, row count)) record.
for ageTotals in totalByAge.collect():
  print(ageTotals)

In [10]:
# Average the numOfFriends for each age: divide the summed numOfFriends by the number of rows with that age.
# Each record of totalByAge is (age, (sum, count)); only the value tuple is needed for the division,
# so 'mapValues' is used and the age key is left untouched.

averageByAge = totalByAge.mapValues(lambda sumAndCount: sumAndCount[0] / sumAndCount[1])

In [11]:
# Collect the (age, average) records to the driver and show the first 20 of them.
results = averageByAge.collect()
for ageAverage in results[:20]:
  print (ageAverage)

In [12]:
# Print each age with its average converted to an integer.
# Note: int() drops the fractional part (it truncates, it does not round).
for age, average in results:
  print(age, int(average))