In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one named "demo".
spark = (
    SparkSession.builder
    .appName("demo")
    .getOrCreate()
)

In [None]:
# Load the Powerball results CSV; the first line of the file supplies the column names.
df1 = (
    spark.read.format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/shared_uploads/1297906913.RANLY.NEI/test_data_Lottery_Powerball_Winning_Numbers__Beginning_2010.csv")
)

In [None]:
# Print the column names and types of the loaded DataFrame.
# NOTE(review): the reader sets only the "header" option (no schema inference),
# so every column is presumably typed as string - confirm from the output.
df1.printSchema()

In [None]:
def map_numbers_from_data(row):
  """Map one record to ``(number, 1)`` pairs, one per winning number.

  Args:
    row: A record supporting ``row["Winning Numbers"]`` lookup. The field is
      expected to hold a whitespace-separated string of numbers, or ``None``
      when the value is missing.

  Returns:
    A list of ``(number, 1)`` tuples in the order the numbers appear in the
    field. ``number`` is the string token exactly as found in the field. The
    list is empty when the field is ``None`` or contains no tokens.
  """
  # A missing field arrives as None; treat it like an empty string so one bad
  # record yields no pairs instead of raising AttributeError on .split().
  numbers = (row["Winning Numbers"] or "").split()
  return [(number, 1) for number in numbers]

In [None]:
# Map step: run the mapper on every row; flatMap flattens the per-row lists
# into a single RDD of (number, 1) pairs.
rdd_map = df1.rdd.flatMap(map_numbers_from_data)
# collect() returns every pair to the driver so the cell can display them.
# NOTE(review): fine for a small demo file; consider take(n) if the input grows.
rdd_map.collect()

In [None]:
# Reduce step: add up the values of all pairs that share the same number key.
rdd_reduced_results_v2 = rdd_map.reduceByKey(
    lambda running_total, next_value: running_total + next_value
)
rdd_reduced_results_v2.collect()

In [None]:
# The MapReduce "map" operation, written as two separate Apache Spark commands.
# Command 1: split each row's "Winning Numbers" string into individual numbers.
rdd2 = df1.rdd.flatMap(
    lambda row: row["Winning Numbers"].split()
)
# Command 2: pair each number with a 1; the first element of each tuple
# plays the role of the key.
rdd_key_value_pairs = rdd2.map(
    lambda drawn_number: (drawn_number, 1)
)

In [None]:
# Reduce the key-value pairs by summing the values per key to get the final result.
rdd_reduced_results = rdd_key_value_pairs.reduceByKey(
    lambda running_total, next_value: running_total + next_value
)
rdd_reduced_results.collect()

In [None]:
# Peek at the data: head() with no argument returns only the first row.
df1.head()

In [None]:
# MapReduce example: count how often each winning number appears per month number.
# Map step: emit one (month, number) pair per winning number, where the month
# is taken as the first two characters of "Draw Date".
rdd_month_number = df1.rdd.flatMap(
    lambda row: [
        (row["Draw Date"][:2], drawn_number)
        for drawn_number in row["Winning Numbers"].split()
    ]
)
rdd_month_number.collect()

In [None]:
# Prepare for the reduction: wrap each value in a single-entry dict
# {number: 1} so that values sharing a key can later be merged.
rdd_keydate_value_pairs = rdd_month_number.map(
    lambda month_and_number: (month_and_number[0], {month_and_number[1]: 1})
)
rdd_keydate_value_pairs.collect()

In [None]:
def reduce_dicts(d1, d2):
  """Return a new dict that combines ``d1`` and ``d2``.

  A key present in both inputs maps to ``d1[key] + d2[key]``; a key present in
  only one input keeps that input's value. Neither argument is modified.
  """
  # Start from a shallow copy so the first input is left untouched.
  merged = dict(d1)

  for key, amount in d2.items():
    merged[key] = merged[key] + amount if key in merged else amount

  return merged

In [None]:
# Reduce step: reduce_dicts merges the dictionaries that share a key, adding
# together the values stored under the same dictionary entry.
rdd_reduced_datenumber_results = rdd_keydate_value_pairs.reduceByKey(reduce_dicts)
rdd_reduced_datenumber_results.collect()

In [None]:
def map_operation(record):
  """Map one record to ``(key, {number: 1})`` pairs, one per winning number.

  The key is the last four characters of the record's "Draw Date" field
  (the year, assuming the date string ends in a four-digit year).

  Args:
    record: A record supporting ``record["Winning Numbers"]`` and
      ``record["Draw Date"]`` lookup. "Winning Numbers" is expected to be a
      whitespace-separated string; either field may be ``None`` when missing.

  Returns:
    A list of ``(key, {number: 1})`` tuples in the order the numbers appear,
    each with its own dict. The list is empty when "Winning Numbers" is
    ``None`` or has no tokens, or when "Draw Date" is ``None``.
  """
  # Treat a missing field like an empty string so it yields no pairs instead
  # of raising AttributeError on .split().
  numbers = (record["Winning Numbers"] or "").split()
  if not numbers:
    return []

  draw_date = record["Draw Date"]
  # Without a date there is no key to group under, so skip the record.
  if draw_date is None:
    return []

  # The key is the same for every number in the record, so compute it once.
  key = draw_date[-4:]
  return [(key, {number: 1}) for number in numbers]

In [None]:
# Count the number of winning numbers by year.
# Map step: map_operation emits one (year, {number: 1}) pair per winning
# number, and flatMap flattens the per-row lists into a single RDD.
rdd_year_number = df1.rdd.flatMap(map_operation)
rdd_year_number.collect()

In [None]:
# Reduce step: merge the dictionaries that share a year key with reduce_dicts,
# adding together the values stored under the same number.
rdd_reduced_yearnumber_results = rdd_year_number.reduceByKey(reduce_dicts)
rdd_reduced_yearnumber_results.collect()