# Traballando con ficheiro salesdata.txt

## Load data

Load the data in `sales/salesdata.txt` into an RDD:

In [1]:
# Read the sales file as an RDD with one string per line.
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext provided by the PySpark shell/kernel -- confirm.
sales = sc.textFile('sales/salesdata.txt')

In [2]:
# Peek at the first 5 raw lines to check the record layout.
sales.take(5)

[u'2023-12-28 10:15\tMondonhedo\tModa hombre\t185.82\tmetalico',
 u'2022-02-26 19:30\tA Corunha\tFarmacia\t113.53\tmastercard',
 u'2022-10-15 19:15\tA Corunha\tModa hombre\t341.95\tvisa',
 u'2021-07-15 12:47\tMonforte\tAutomocion\t239.83\tvisa',
 u'2022-03-09 20:11\tOurense\tCalzado\t52.98\tefectivo']

Remove every line that does not have exactly 5 tab-separated fields:

In [3]:
# Keep only well-formed records (exactly 5 tab-separated fields). The result
# is cached because several independent jobs below start from it; without
# cache() each of them would re-read and re-filter the input file.
salesclean = sales.filter(lambda line: len(line.split('\t')) == 5).cache()

## Count the number of sales per Location

In [4]:
# Emit a (location, 1) pair for every sale; the location is the 2nd field.
locationsales = salesclean \
    .map(lambda record: record.split('\t')[1]) \
    .map(lambda location: (location, 1))

In [5]:
# Peek at the first 5 (location, 1) pairs.
locationsales.take(5)

[(u'Mondonhedo', 1),
 (u'A Corunha', 1),
 (u'A Corunha', 1),
 (u'Monforte', 1),
 (u'Ourense', 1)]

In [6]:
# Count how many pairs exist for each location. countByKey() is an action:
# the counts come back to the driver as a dictionary keyed by location.
locationsales.countByKey()

defaultdict(int,
            {u'A Corunha': 1942720,
             u'Arteixo': 1296216,
             u'Burela': 907610,
             u'Cangas': 1035190,
             u'Carballo': 1296115,
             u'Chantada': 776119,
             u'Ferrol': 1296384,
             u'Foz': 646397,
             u'Lugo': 1553603,
             u'Marin': 1038150,
             u'Mondonhedo': 649464,
             u'Monforte': 778220,
             u'Naron': 1295931,
             u'Oleiros': 1295422,
             u'Ourense': 1556454,
             u'Pontevedra': 1554396,
             u'Ribeira': 1295822,
             u'Santiago': 1684775,
             u'Sarria': 906730,
             u'Vigo': 1944029})

## Sum the total of sales per Category

In [7]:
# Build (category, amount) pairs. Each record is split once; the category is
# the 3rd field and the amount (4th field) is converted to float.
categorysales = salesclean \
    .map(lambda record: record.split('\t')) \
    .map(lambda fields: (fields[2], float(fields[3])))

In [8]:
# Peek at the first 5 (category, amount) pairs.
categorysales.take(5)

[(u'Moda hombre', 185.82),
 (u'Farmacia', 113.53),
 (u'Moda hombre', 341.95),
 (u'Automocion', 239.83),
 (u'Calzado', 52.98)]

In [9]:
# Add up the amounts of each category and bring the totals to the driver.
categorysales.reduceByKey(lambda subtotal, amount: subtotal + amount).collect()

[(u'Farmacia', 51089890.33999985),
 (u'Moda mujer', 235457019.49999973),
 (u'Juguetes', 84637108.94000024),
 (u'Hogar', 242631841.30000138),
 (u'Alimentacion', 20365369.760000084),
 (u'Calzado', 161829245.08000097),
 (u'Electronica', 1254085342.4099998),
 (u'Deportes', 211918284.97000006),
 (u'Ropa', 168507741.40000033),
 (u'Automocion', 406772224.5799967),
 (u'Jardineria', 99885835.28000014),
 (u'Cine', 12021610.04000001),
 (u'Moda hombre', 196381534.21999964),
 (u'Bebes', 166370972.57999957),
 (u'Musica', 55255247.21999988),
 (u'Cosmetica', 123703986.04999928),
 (u'Ferreteria', 138187345.73000014),
 (u'Libros', 37971892.610000126),
 (u'Complementos', 104204879.70999944),
 (u'Mascotas', 75325389.70999962)]

## Find the minimum sale per date (ordered by date)

In [10]:
# Build (date, amount) pairs. The 1st field looks like "YYYY-MM-DD HH:MM", so
# splitting it on whitespace and keeping the first token drops the time.
daysales = salesclean \
    .map(lambda record: record.split('\t')) \
    .map(lambda fields: (fields[0].split()[0], float(fields[3])))

In [11]:
# Peek at the first 5 (date, amount) pairs.
daysales.take(5)

[(u'2023-12-28', 185.82),
 (u'2022-02-26', 113.53),
 (u'2022-10-15', 341.95),
 (u'2021-07-15', 239.83),
 (u'2022-03-09', 52.98)]

In [12]:
# Keep the smallest amount seen for each date. The built-in min is used as
# the two-argument reducer instead of a hand-written comparison.
mindaysales = daysales.reduceByKey(min)

In [13]:
# Sort the per-day minimums by their key (the date string; the YYYY-MM-DD
# format sorts chronologically as text) and show the first 10 days.
mindaysales.sortByKey().take(10)

[(u'2020-01-01', 1.0),
 (u'2020-01-02', 1.17),
 (u'2020-01-03', 1.01),
 (u'2020-01-04', 1.01),
 (u'2020-01-05', 1.04),
 (u'2020-01-06', 1.0),
 (u'2020-01-07', 1.0),
 (u'2020-01-08', 1.05),
 (u'2020-01-09', 0.96),
 (u'2020-01-10', 1.04)]

## Calculate the average sale per category when the payment method is cash

In [14]:
# Keep only cash payments. The payment method is the 5th (last) field, so the
# test looks at that field alone; searching the whole line would also match
# these words inside a location or category name.
cashsales = salesclean.filter(
    lambda line: line.split('\t')[4].strip() in ('efectivo', 'metalico', 'cash'))

In [15]:
# Build (category, amount) pairs for the cash sales only. Each record is
# split once; the amount (4th field) is converted to float.
pairsales = cashsales \
    .map(lambda record: record.split('\t')) \
    .map(lambda fields: (fields[2], float(fields[3])))

In [16]:
# Peek at the first 5 (category, amount) pairs of the cash sales.
pairsales.take(5)

[(u'Moda hombre', 185.82),
 (u'Calzado', 52.98),
 (u'Ropa', 104.84),
 (u'Calzado', 98.23),
 (u'Complementos', 158.07)]

In [17]:
def sum_pairs(a, b):
    """Add two pairs position by position.

    Only the first two positions of ``a`` and ``b`` are read, and the result
    is always a 2-tuple: ``(a[0] + b[0], a[1] + b[1])``.
    """
    first = a[0] + b[0]
    second = a[1] + b[1]
    return (first, second)

In [18]:
# Average sale per category: pair every amount with a count of 1, add the
# pairs up per category, then divide the summed amount by the summed count.
# Index access is used instead of tuple-parameter lambdas, which are a
# SyntaxError on Python 3; this form runs on both Python 2 and 3.
avgsales = pairsales.mapValues(lambda pay: (pay, 1)) \
    .reduceByKey(sum_pairs) \
    .mapValues(lambda total_count: total_count[0] / total_count[1])


In [19]:
# Peek at the first 5 (category, average amount) pairs.
avgsales.take(5)

[(u'Farmacia', 34.39604048377517),
 (u'Moda mujer', 116.72272775525201),
 (u'Juguetes', 56.128337856866),
 (u'Hogar', 106.83818122230839),
 (u'Alimentacion', 16.383200331587886)]