<a href="https://colab.research.google.com/github/caranugent/DE300/blob/main/Map_Reduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import seaborn as sns
# NOTE(review): `diam` itself is not used further in the visible cells; presumably
# this call is here so that, with cache=True, a copy of the CSV ends up under
# dataset/ for the Spark reader below to load — confirm.
diam = sns.load_dataset(
    'diamonds',
    cache=True,
    data_home='dataset/',
)

In [4]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session and keep a handle on its SparkContext
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Read the CSV with a header row and inferred column types, then cache the DataFrame
diamonds = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('dataset/diamonds.csv')
    .cache()
)

In [5]:
# Peek at the first five rows (returned as a list of Row objects)
diamonds.take(5)

[Row(carat=0.23, cut='Ideal', color='E', clarity='SI2', depth=61.5, table=55.0, price=326, x=3.95, y=3.98, z=2.43),
 Row(carat=0.21, cut='Premium', color='E', clarity='SI1', depth=59.8, table=61.0, price=326, x=3.89, y=3.84, z=2.31),
 Row(carat=0.23, cut='Good', color='E', clarity='VS1', depth=56.9, table=65.0, price=327, x=4.05, y=4.07, z=2.31),
 Row(carat=0.29, cut='Premium', color='I', clarity='VS2', depth=62.4, table=58.0, price=334, x=4.2, y=4.23, z=2.63),
 Row(carat=0.31, cut='Good', color='J', clarity='SI2', depth=63.3, table=58.0, price=335, x=4.34, y=4.35, z=2.75)]

In [7]:
# Keep only diamonds priced above 1000, restricted to the predictor and response columns
df = (
    diamonds
    .filter(diamonds['price'] > 1000)
    .select('cut', 'color', 'carat', 'clarity', 'price')
)

In [9]:
# Print the first five rows of the filtered, column-reduced frame as a text table
df.show(5)

+---------+-----+-----+-------+-----+
|      cut|color|carat|clarity|price|
+---------+-----+-----+-------+-----+
|    Ideal|    E|  0.7|    SI1| 2757|
|     Fair|    E| 0.86|    SI2| 2757|
|    Ideal|    G|  0.7|    VS2| 2757|
|Very Good|    E| 0.71|    VS2| 2759|
|Very Good|    G| 0.78|    SI2| 2759|
+---------+-----+-----+-------+-----+
only showing top 5 rows



In [10]:
# Columns that make up the design matrix, in the order the mappers iterate over them
predictors = ['cut', 'color', 'carat', 'clarity']
# Predictors that the mappers one-hot encode (key becomes "<column>_<level>", value 1.0)
categorical = {'cut', 'color', 'clarity'}

In [11]:
# Name of the response column; xty_map multiplies every predictor value by row[response]
response = 'price'

In [16]:
# x transpose x (by row): yields every entry of the outer product x_i x_i^T
#   x_i is the (one-hot encoded) predictor vector of a single row

def xtx_map(row):
  """Yield the entries of the outer product x_i x_i^T for one row.

  Categorical predictors are one-hot encoded on the fly: the key becomes
  "<predictor>_<level>" (e.g. "cut_Ideal") and the value is 1.0.  All other
  predictors keep their own name as key and contribute the row's value.

  Reads the module-level `predictors` and `categorical` collections.

  Args:
    row: an object exposing asDict() that maps column names to values.

  Yields:
    ((key_i, key_j), value_i * value_j) for every ordered pair of predictors.
  """
  row = row.asDict()

  # Encode each predictor once per row: the encoding of a column does not
  # depend on which other column it is paired with, so it is hoisted out of
  # the nested loop instead of being rebuilt for every (i, j) pair.
  encoded = [
      (name + "_" + row[name], 1.0) if name in categorical else (name, row[name])
      for name in predictors
  ]

  # Emit all ordered pairs, so both (a, b) and (b, a) are produced
  for (ki, vi) in encoded:
    for (kj, vj) in encoded:
      yield ((ki, kj), vi * vj)

In [17]:
# x transpose y (by row): yields the terms x_ij * y_i contributed by one row
#   x_i is the (one-hot encoded) predictor vector, y_i is the scalar response

def xty_map(row):
  """Yield (key, x_ij * y_i) for every predictor of one row.

  Categorical predictors are one-hot encoded: the key becomes
  "<predictor>_<level>" (e.g. "cut_Ideal") and the value is 1.0, so the
  emitted term is simply the response.  Other predictors keep their name
  as key and use the row's value.

  Reads the module-level `predictors`, `categorical` and `response` names.
  """
  fields = row.asDict()

  for name in predictors:
    if name in categorical:
      key, value = name + "_" + fields[name], 1.0   # one-hot: e.g. cut_Ideal -> 1.0
    else:
      key, value = name, fields[name]               # numeric predictor: use as is

    # scale the predictor value by this row's response y_i
    yield (key, value * fields[response])

In [18]:
# take(1) returns a one-element list, so `row` is a list and row[0] is the actual Row
row = df.take(1)
row[0]

Row(cut='Ideal', color='E', carat=0.7, clarity='SI1', price=2757)

In [19]:
# Materialise the generator to see the x_ij * y_i terms for the sample row
list(xty_map(row[0]))

[('cut_Ideal', 2757.0),
 ('color_E', 2757.0),
 ('carat', 1929.8999999999999),
 ('clarity_SI1', 2757.0)]

In [20]:
# Materialise the generator to see the outer-product entries for the sample row
list(xtx_map(row[0]))

[(('cut_Ideal', 'cut_Ideal'), 1.0),
 (('cut_Ideal', 'color_E'), 1.0),
 (('cut_Ideal', 'carat'), 0.7),
 (('cut_Ideal', 'clarity_SI1'), 1.0),
 (('color_E', 'cut_Ideal'), 1.0),
 (('color_E', 'color_E'), 1.0),
 (('color_E', 'carat'), 0.7),
 (('color_E', 'clarity_SI1'), 1.0),
 (('carat', 'cut_Ideal'), 0.7),
 (('carat', 'color_E'), 0.7),
 (('carat', 'carat'), 0.48999999999999994),
 (('carat', 'clarity_SI1'), 0.7),
 (('clarity_SI1', 'cut_Ideal'), 1.0),
 (('clarity_SI1', 'color_E'), 1.0),
 (('clarity_SI1', 'carat'), 0.7),
 (('clarity_SI1', 'clarity_SI1'), 1.0)]

**Now we have the per-row terms of both products; summing them by key gives the entries of $X^T y$ and $X^T X$.**

In [21]:
# Sum the per-row outer-product entries key by key to obtain the entries of X^T X
xtx_data = (
    df.rdd
    .flatMap(xtx_map)                             # every row emits several ((ki, kj), value) pairs
    .reduceByKey(lambda acc, term: acc + term)    # add up the values that share a key
    .collect()                                    # return the summed pairs as a Python list
)

In [22]:
# Sum the per-row x_ij * y_i terms key by key to obtain the entries of X^T y
xty_data = (
    df.rdd
    .flatMap(xty_map)                             # every row emits several (key, value) pairs
    .reduceByKey(lambda acc, term: acc + term)    # add up the values that share a key
    .collect()                                    # return the summed pairs as a Python list
)

In [23]:
# Inspect the summed (key, value) pairs: one entry per encoded predictor
xty_data

[('cut_Ideal', 69491685.0),
 ('color_E', 27913897.0),
 ('carat', 259765355.250002),
 ('clarity_SI1', 50141077.0),
 ('cut_Fair', 6931384.0),
 ('clarity_SI2', 45876510.0),
 ('color_G', 42841867.0),
 ('clarity_VS2', 45536589.0),
 ('cut_Very Good', 45996850.0),
 ('cut_Good', 18558296.0),
 ('color_F', 33724244.0),
 ('clarity_VS1', 29642262.0),
 ('cut_Premium', 60868420.0),
 ('color_H', 35873113.0),
 ('color_D', 19990260.0),
 ('color_I', 26838093.0),
 ('clarity_VVS2', 15250346.0),
 ('clarity_VVS1', 8043137.0),
 ('color_J', 14665161.0),
 ('clarity_I1', 2860076.0),
 ('clarity_IF', 4496638.0)]

In [24]:
## Lay the collected (key, value) pairs back out as dense matrices.
   # `index` maps each encoded predictor name to its row/column position,
   # following the order in which the keys appear in xty_data.
import numpy as np

index = {entry[0]: position for position, entry in enumerate(xty_data)}
p = len(index)

# X^T y as a (p, 1) column vector
XTY = np.zeros((p, 1))
for name, total in xty_data:
  XTY[index[name], 0] = total

# X^T X as a (p, p) matrix; each key pair addresses one cell
XTX = np.zeros((p, p))
for (row_name, col_name), total in xtx_data:
  XTX[index[row_name], index[col_name]] = total

In [27]:
# Sanity check: XTX was allocated as (p, p) and XTY as (p, 1)
XTX.shape, XTY.shape

((21, 21), (21, 1))

In [None]:
# Solve the normal equations (X^T X) beta = X^T y with numpy.
#
# Every level of cut, color and clarity gets its own indicator column and no
# reference level is dropped, so the indicator columns of each categorical
# sum to the same all-ones vector.  X^T X is therefore rank-deficient and
# np.linalg.solve either raises "Singular matrix" or returns numerically
# meaningless values.  lstsq handles the rank deficiency and returns the
# minimum-norm solution; for a full-rank system it agrees with solve.
beta = np.linalg.lstsq(XTX, XTY, rcond=None)[0]