In [1]:
import mapreduce
import optimizers
import pandas as pd
from timeit import timeit

In [2]:
# Relative path to the posts XML file that every task in this notebook reads.
xml_file_path = 'raw_data/posts.xml'
# Load the file once through the project helper and keep the result in
# `xml_root`, which the cells below pass as their `xml_root` argument.
# NOTE(review): presumably this is a parsed XML root element — confirm
# against mapreduce.get_single_xml.
xml_root = mapreduce.get_single_xml(xml_file_path)

In [3]:
# Task card no. 1
# Top 10 post types without accepted answers (tag - AcceptedAnswerId - AnswerCount)
t1 = mapreduce.mapreduce_tasker_from_root(
    xml_root=xml_root,
    map_root=mapreduce.mapper_task_1,
    reduce=mapreduce.reducer_task_1,
)
# Show the result as a two-column table: the mapping keys become the 'Tag'
# column and the mapped values the 'Count' column.
(
    pd.DataFrame.from_dict(t1, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'Tag'})
)

Unnamed: 0,Tag,Count
0,discussion,2916
1,feature-request,2815
2,bug,1396
3,support,1261
4,stackoverflow,848
5,status-completed,647
6,tags,524
7,reputation,427
8,area51,372
9,questions,354


In [4]:
# Task card no. 2
# Relationship between a post's number of answers and its score.
t2 = mapreduce.mapreduce_tasker_from_root(
    xml_root=xml_root,
    map_root=mapreduce.mapper_task_2,
    reduce=mapreduce.reducer_task_2,
)
# Bare last expression so the notebook displays the computed value.
t2

1.254095319555108

In [5]:
# Task card no. 3
# Top 10 questions that had the longest activity time.
t3 = mapreduce.mapreduce_tasker_from_root(
    xml_root=xml_root,
    map_root=mapreduce.mapper_task_3,
    reduce=mapreduce.reducer_task_3,
)
# Render the result as a table with one row per post.
pd.DataFrame(data=t3, columns=['Post Id', 'Days'])

Unnamed: 0,Post Id,Days
0,9508,806
1,7931,802
2,12362,785
3,7046,779
4,11602,774
5,14656,770
6,19470,769
7,10582,763
8,17853,762
9,8211,760


In [6]:
# Chunked ("optimized") variants of the same tasks
# Task card 1, run through optimizers.chunkerizer_from_root with N = 20.
c1 = optimizers.chunkerizer_from_root(
    xml_root=xml_root,
    mapper_task=mapreduce._mapper_task_1,
    reducer_task=mapreduce.reducer_task_1,
    N=20,
)
# Same tabular presentation as the non-chunked task 1 result, so the two
# outputs can be compared side by side.
(
    pd.DataFrame.from_dict(c1, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'Tag'})
)


Unnamed: 0,Tag,Count
0,discussion,2916
1,feature-request,2815
2,bug,1396
3,support,1261
4,stackoverflow,848
5,status-completed,647
6,tags,524
7,reputation,427
8,area51,372
9,questions,354


In [7]:
# Task card 2, chunked variant (N = 20).
c2 = optimizers.chunkerizer_from_root(
    xml_root=xml_root,
    mapper_task=mapreduce._mapper_task_2,
    reducer_task=mapreduce.reducer_task_2,
    N=20,
)
# Bare last expression so the notebook displays the computed value.
c2

1.254095319555108

In [8]:
# Task card 3, chunked variant (N = 20).
c3 = optimizers.chunkerizer_from_root(
    xml_root=xml_root,
    mapper_task=mapreduce._mapper_task_3,
    reducer_task=mapreduce.reducer_task_3,
    N=20,
)
# Render the result as a table with one row per post.
pd.DataFrame(data=c3, columns=['Post Id', 'Days'])

Unnamed: 0,Post Id,Days
0,9508,806
1,7931,802
2,12362,785
3,7046,779
4,11602,774
5,14656,770
6,19470,769
7,10582,763
8,17853,762
9,8211,760


In [26]:
# Benchmark: chunked vs. regular implementation of a single task.
# `number` is how many times timeit executes each statement; `task` selects
# which mapper/reducer pair (1, 2 or 3) gets substituted into the templates.
number = 10
task = 3

# Statement templates; `{task}` is filled in with str.format below.
# NOTE(review): the chunked call is timed with N = 10 while the demo cells
# above use N = 20 — confirm this difference is intentional.
optimized = (
    'optimizers.chunkerizer_from_root('
    'xml_root = xml_root, '
    'mapper_task = mapreduce._mapper_task_{task}, '
    'reducer_task = mapreduce.reducer_task_{task}, '
    'N = 10)'
)
regular = (
    'mapreduce.mapreduce_tasker_from_root('
    'xml_root = xml_root, '
    'map_root = mapreduce.mapper_task_{task}, '
    'reduce = mapreduce.reducer_task_{task})'
)

# timeit returns the total seconds for all `number` executions (not a
# per-run average); globals() exposes the notebook namespace (xml_root,
# mapreduce, optimizers) to the timed statements.
optimized1 = timeit(optimized.format(task=task), number=number, globals=globals())
regular1 = timeit(regular.format(task=task), number=number, globals=globals())

# Ratio is chunked / regular, so a value above 1 means the chunked run was slower.
print(
    f'Chunked: {round(optimized1, 2)} sec.\n'
    f'Regular: {round(regular1, 2)} sec.\n'
    f'Ratio: {round(optimized1 / regular1, 2)}'
)

Chunked: 0.43 sec.
Regular: 0.22 sec.
Ratio: 1.95
