In [145]:
''' OT301-105
COMO: Analista de datos
QUIERO: Utilizar MapReduce
PARA: Analizar los datos de StackOverflow

Criterios de aceptación: 

Top 10 fechas con mayor cantidad de post creados

Relación entre cantidad de respuestas y sus visitas.

 Del ranking de los primeros 0-100 por score, tomar el tiempo de respuesta promedio e informar un único valor.'''

In [None]:
from functools import reduce
from typing import Counter
import xml.etree.ElementTree as ET
import re
from multiprocessing import Pool
import os
from os import remove
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
import operator
import pandas as pd
import csv

In [146]:
# global variables and functions for the notebook
root_path = os.path.abspath('').replace('notebooks','datasets')
outputs_path = os.path.abspath('').replace('notebooks','outputs')
file_xml = os.path.join(root_path, 'posts.xml')
tree = ET.parse(file_xml)
root = tree.getroot()

# splits into iterable subsets to process separately
def chunckify(iterable, len_of_chunk):
    for i in range(0, len(iterable), len_of_chunk):
        yield iterable[i:i + len_of_chunk]

# unifies many lists of tuples into only one
def unify_list(data1,data2):
    for data in data2:
        data1.append(data)
    return data1

# Function that generates all external files generated by the project
def outputs(data, name,title):
    file_name = os.path.join(outputs_path, name)
    try:
        remove(file_name)
    except:
        pass
    if name[-4:] == '.csv':
        with open(file_name, 'w') as file:
            writer = csv.writer(file)
            writer.writerow(title)
            if type(data) == list:
                writer.writerows(data)
            elif type(data) == dict:
                for key,value in data.items():
                    writer.writerow([key,value])
    elif name[-4:] == '.txt':
        with open(file_name, 'w') as file:
            file.write(title)
            file.write(str(data))

In [147]:
# functions corresponding only to the first analysis
# returns the information needed for analysis
def get_date_and_type(data):
    if data.attrib['PostTypeId'] == '1':
        return data.attrib['CreationDate'][0:10]
    else:
        return

# function create a map that stores the creation dates and the creation dates for each batch of fragments
def mapper(data):
    type_date = list(map(get_date_and_type, data))
    type_date = list(filter(None,type_date))
    counter_dates = dict(Counter(type_date))
    return dict(counter_dates)

# accumulates the frequency of occurrence of the creation date and merge in one list
def date_acumulator(data1, data2):
    for key, value in data2.items():
        if key in data1.keys():
            data1[key] += value
        else:
            data1[key] = value
    return data1

In [148]:
# Top 10 dates with the highest number of posts created, I assume that the post is generated with the questions
def top_10_creation_post_dates():
    data_chunks = chunckify(root,50)
    mapped = list(map(mapper, data_chunks))
    reduced_mapped = reduce(date_acumulator, mapped)
    outputs(reduced_mapped, 'GE_E01_creation_date_frecuency.csv',['creation_date', 'frequency'])
    top_10 = Counter(reduced_mapped).most_common(10)
    outputs(top_10, 'GE_E01_top10_creation_date.csv',['creation_date', 'frequency'])

# top_10_creation_post_dates()


In [149]:
# functions corresponding only to the second analysis
# returns the information needed for analysis
def get_answer_visit(data):
    try:
        return int(data.attrib['ViewCount']), int(data.attrib['AnswerCount'])
    except:
        return

# accumulates the number of visits and responses for each batch of chunk
def mapper_answer_visit(data):
    answer_visit_data = list(map(get_answer_visit,data))
    answer_visit_data = list(filter(None,answer_visit_data))
    return answer_visit_data

# plots on the cartesian plane the values ​​of the number of visits (x-axis) versus the number of responses to a question
# the function allows parameterizing a visit limit
def print_cartesian_plane(mapped_list, top_visits = 100000, save=False):
    if top_visits == 100000:
        plt.title("Answers and Visits Relationship")
    else:
        plt.title(f"Answers and Visits Relationship, visit limit {top_visits}")
    plt.xlabel("Visits")
    plt.ylabel("Answers")
    for point in mapped_list:
        if point[0]< top_visits:
            plt.plot(point[0],point[1],marker=".", color="red")
    if save:
        if top_visits == 100000:
            file_name = f'{outputs_path}/GE_E02_av_relationship.png'
        else:
            file_name = f'{outputs_path}/GE_E02_av_relationship_top{top_visits}.png'
        try:
            remove(file_name)
        except:
            pass
        plt.savefig(file_name)
    plt.show()
    plt.close()

In [150]:
# Analize the answer vs visit relationship
def answer_visit_relationship():
    data_chunks = chunckify(root,50)
    mapped = list(map(mapper_answer_visit, data_chunks))
    mapped_list = reduce(unify_list,mapped)
    outputs(mapped_list, 'GE_E02_visits_answers_quantity.csv',['visits', 'answers'])
    print_cartesian_plane(mapped_list,1000, save=True)
    
# answer_visit_relationship()

In [151]:
# functions corresponding only to the third analysis
# returns the questions information needed for analysis
def get_question_data(data):
    try:
        return int(data.attrib['Score']), data.attrib['Id'], data.attrib['AcceptedAnswerId'], data.attrib['CreationDate']
    except:
        return

# function create a map that stores the questions information for each batch of fragments
def get_questions_map(data):
    questions_map = map(get_question_data,data)
    return list(filter(None,questions_map))

# returns the answers information needed for analysis
def get_answer_data(data):
    try:
        dato = data.attrib['Id'], data.attrib['ParentId'],data.attrib['CreationDate']
        if dato[0] in answer_list:
            return dato
        else:
            return
    except:
        return

# function create a map that stores the answers information for each batch of fragments
def get_answers_map(data):
    answers_map = map(get_answer_data,data)
    return list(filter(None,answers_map))

# merge the question and answer lists into a single list
def merged(question,answer):
    id_a = operator.itemgetter(0)
    answ_info = {id_a(post_id): post_id[1:] for post_id in answer}
    id_q = operator.itemgetter(2)
    # ques_info = {id_q(post_id): (post_id[0],post_id[1],post_id[3]) for post_id in question}
    merged = [quest_id + answ_info[id_q(quest_id)] for quest_id in question if id_q(quest_id) in answ_info]
    return merged

In [152]:
# Del ranking de los primeros 0-100 por score, tomar el tiempo de respuesta promedio e informar un único valor.
# Returns the necessary information for the analysis of the questions between two chosen scores (by default it is from 0 to 100)
def list_top100_scored_questions(begin=0,end=100):
    data_chunks = chunckify(root,50)
    questions_map = list(map(get_questions_map, data_chunks))
    questions_list = reduce(unify_list, questions_map)
    questions_list = sorted(questions_list, key=operator.itemgetter(0), reverse=True)
    outputs(questions_list, 'GE_E03_top100_scored_questions.csv',['Score','QuestionId','AcceptedAnswerId','CreationDate'])
    return questions_list[begin:end]

# returns the average response according to the selected data
def average_response_time():
    data_chunks = chunckify(root,50)
    answer_data_list = list(map(get_answers_map, data_chunks))
    answer_data_list = reduce(unify_list, answer_data_list)
    merged_list = merged(top100_scored_questions,answer_data_list)

    date_format='%Y-%m-%dT%H:%M:%S.%f'
    questions_dates_string=[x[3] for x in merged_list]
    questions_dates = [datetime.strptime(date, date_format) for date in questions_dates_string]

    answers_dates_string=[x[5] for x in merged_list]
    answers_dates=[datetime.strptime(date, date_format) for date in answers_dates_string]
    
    operator_dif=list(map(operator.sub,answers_dates,questions_dates))

    total = sum(operator_dif, timedelta())
    total_add_time = (total.days + total.seconds / 3600)
    average_response_time_in_days = total_add_time / 100
    outputs(average_response_time_in_days, 'GE_E03_average_response_time_in_days.txt','Average response time in days: ')
    print(average_response_time_in_days)

# top100_scored_questions = list_top100_scored_questions()
# answer_list = [x for w, v, x, y in top100_scored_questions]
# average_response_time()

34.2491
