In [23]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

import csv

import datetime as dt
import re

In [2]:
# Reuse the active SparkContext when this cell is re-executed; calling
# SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [90]:
file = 'segmentid_fiscalyear_dummy.csv'

# The first line of the file is the column header; capture it so it can be
# filtered out of the records below.
data = sc.textFile(file)
header = data.first()

# Count rows per (first column, year parsed from the second column),
# sorted ascending by that key in a single partition, and pull the result
# back to the driver.
res = (
    data
    .filter(lambda line: line != header)
    .mapPartitions(lambda lines: csv.reader(lines))
    .map(lambda row: ((row[0], dt.datetime.strptime(row[1], '%m/%d/%Y').year), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(True, 1)
    .collect()
)

In [91]:
# Display the collected ((first column, year), count) pairs.
res

[(('1000', 2018), 2),
 (('1000', 2019), 3),
 (('2000', 2016), 1),
 (('2000', 2017), 2),
 (('3000', 2016), 1),
 (('3000', 2017), 2),
 (('3000', 2018), 3),
 (('4000', 2017), 3),
 (('4000', 2018), 2),
 (('4000', 2019), 1)]

In [85]:
def ols(data):
    """Return the ordinary-least-squares slope of y regressed on x.

    Parameters
    ----------
    data : iterable of (x, y) pairs
        ``[(x1, y1), ..., (xi, yi), ..., (xN, yN)]``. Only elements 0 and 1
        of each item are read.

    Returns
    -------
    float
        ``sum((x - x_bar) * (y - y_bar)) / sum((x - x_bar) ** 2)``.
        ``nan`` is returned when the slope is undefined: no points, a single
        point, or every x identical. Those inputs previously raised
        ``ZeroDivisionError``.
    """
    points = list(data)
    if not points:
        return float("nan")

    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    # With no spread in x the denominator is zero and no slope exists.
    if min(xs) == max(xs):
        return float("nan")

    count = len(points)
    x_bar = sum(xs) / count
    y_bar = sum(ys) / count
    numerator = sum((x - x_bar) * (y - y_bar) for x, y in zip(xs, ys))
    denominator = sum((x - x_bar) ** 2 for x in xs)
    return numerator / denominator

In [88]:
# Sanity-check ols() on two perfectly linear series (slopes +100 and -100).
# Only a cell's last expression is displayed, so the first result is asserted
# rather than left as a bare call whose value would be silently discarded.
# Distinct names are used so `data`, which names an RDD in other cells, is
# not rebound to a list here.
rising_series = [(2015, 100), (2016, 200), (2017, 300), (2018, 400), (2019, 500)]
falling_series = [(2015, 500), (2016, 400), (2017, 300), (2018, 200), (2019, 100)]
assert ols(rising_series) == 100.0
ols(falling_series)

-100.0

In [84]:
# Re-key the per-(first column, year) counts as (first column, (year, count)).
# No coefficient is computed in this cell.
# Relies on `file` and `header` already being defined by another cell.
(
    sc.textFile(file)
    .filter(lambda line: line != header)
    .mapPartitions(lambda lines: csv.reader(lines))
    .map(lambda row: ((row[0], dt.datetime.strptime(row[1], '%m/%d/%Y').year), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(True, 1)
    .map(lambda pair: (pair[0][0], (pair[0][1], pair[1])))
    .collect()
)

[('1000', (2018, 2)),
 ('1000', (2019, 3)),
 ('2000', (2016, 1)),
 ('2000', (2017, 2)),
 ('3000', (2016, 1)),
 ('3000', (2017, 2)),
 ('3000', (2018, 3)),
 ('4000', (2017, 3)),
 ('4000', (2018, 2)),
 ('4000', (2019, 1))]

In [92]:
file = 'segmentid_fiscalyear_dummy.csv'

# Capture the header line so it can be excluded from the records.
data = sc.textFile(file)
header = data.first()

# For each first-column key, gather its (year, count) pairs into one list and
# reduce that list to the OLS slope of count against year.
(
    data
    .filter(lambda line: line != header)
    .mapPartitions(lambda lines: csv.reader(lines))
    .map(lambda row: ((row[0], dt.datetime.strptime(row[1], '%m/%d/%Y').year), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(True, 1)
    .map(lambda pair: (pair[0][0], [(pair[0][1], pair[1])]))
    .reduceByKey(lambda left, right: left + right)
    .mapValues(ols)
    .collect()
)

[('1000', 1.0), ('2000', 1.0), ('3000', 1.0), ('4000', -1.0)]

In [94]:
file = 'segmentid_fiscalyear_dummy.csv'

# Capture the header line so it can be excluded from the records.
data = sc.textFile(file)
header = data.first()

# For each first-column key, keep its (year, count) pairs and append an
# ('OLS_COEF', slope) entry computed from those same pairs.
(
    data
    .filter(lambda line: line != header)
    .mapPartitions(lambda lines: csv.reader(lines))
    .map(lambda row: ((row[0], dt.datetime.strptime(row[1], '%m/%d/%Y').year), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(True, 1)
    .map(lambda pair: (pair[0][0], [(pair[0][1], pair[1])]))
    .reduceByKey(lambda left, right: left + right)
    .mapValues(lambda points: points + [('OLS_COEF', ols(points))])
    .collect()
)

[('1000', [(2018, 2), (2019, 3), ('OLS_COEF', 1.0)]),
 ('2000', [(2016, 1), (2017, 2), ('OLS_COEF', 1.0)]),
 ('3000', [(2016, 1), (2017, 2), (2018, 3), ('OLS_COEF', 1.0)]),
 ('4000', [(2017, 3), (2018, 2), (2019, 1), ('OLS_COEF', -1.0)])]

In [98]:
def fill_zer0(row, years=range(2015, 2020)):
    """Expand sparse (year, count) pairs into one entry per expected year.

    Parameters
    ----------
    row : iterable of (year, count) pairs
        Only elements 0 and 1 of each item are read. A year may appear more
        than once; its counts are summed.
    years : iterable of years, optional
        The years that must appear in the result, in output order. Defaults
        to 2015 through 2019 inclusive.

    Returns
    -------
    list of (year, count) tuples
        One tuple per entry of ``years``, in that order, with 0 for years
        that do not occur in ``row``.

    Raises
    ------
    KeyError
        If ``row`` contains a year that is not in ``years``.
    """
    totals = {year: 0 for year in years}
    for entry in row:
        year, count = entry[0], entry[1]
        if year not in totals:
            # Fail with a message that names the offending year and the
            # accepted range instead of a bare KeyError on the dict lookup.
            raise KeyError(
                "year %r is outside the expected years %r" % (year, list(totals))
            )
        totals[year] += count
    return list(totals.items())

In [99]:
# Spot-check fill_zer0() on a sparse sample. A distinct name is used so that
# `data`, which names an RDD in other cells, is not rebound to a list here.
sparse_counts = [(2018, 2), (2019, 3)]
fill_zer0(sparse_counts)

[(2015, 0), (2016, 0), (2017, 0), (2018, 2), (2019, 3)]

In [100]:
file = 'segmentid_fiscalyear_dummy.csv'

# Capture the header line so it can be excluded from the records.
data = sc.textFile(file)
header = data.first()

# For each first-column key, emit the zero-filled (year, count) series
# followed by an ('OLS_COEF', slope) entry.
# NOTE(review): the slope is computed from the pairs actually observed, not
# from the zero-filled series shown beside it. Confirm that is intended.
(
    data
    .filter(lambda line: line != header)
    .mapPartitions(lambda lines: csv.reader(lines))
    .map(lambda row: ((row[0], dt.datetime.strptime(row[1], '%m/%d/%Y').year), 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(True, 1)
    .map(lambda pair: (pair[0][0], [(pair[0][1], pair[1])]))
    .reduceByKey(lambda left, right: left + right)
    .mapValues(lambda points: fill_zer0(points) + [('OLS_COEF', ols(points))])
    .collect()
)

[('1000',
  [(2015, 0), (2016, 0), (2017, 0), (2018, 2), (2019, 3), ('OLS_COEF', 1.0)]),
 ('2000',
  [(2015, 0), (2016, 1), (2017, 2), (2018, 0), (2019, 0), ('OLS_COEF', 1.0)]),
 ('3000',
  [(2015, 0), (2016, 1), (2017, 2), (2018, 3), (2019, 0), ('OLS_COEF', 1.0)]),
 ('4000',
  [(2015, 0), (2016, 0), (2017, 3), (2018, 2), (2019, 1), ('OLS_COEF', -1.0)])]