# In [None]:
#Performance analysis script
#Author: Alan Leung
#Date: 2018.06.25
#Purpose: Parses JSON data, inserts into dataframes, and uses a reduce function to merge the dataframes based on the timestamp.
#Uses: Generates data for performance analysis with respect to an independent variable (CPU, response, time).

import argparse
import json
import os
import random
import subprocess
from datetime import datetime
from functools import reduce
from subprocess import call
from urllib.request import urlopen

import pandas as pd
import requests

#The node list should contain a mix of nodes from different racks, cages, and architecture types.
#subprocess.call() only returns the command's exit status, so the output is captured with
#check_output() instead.
#NOTE(review): assumes the command prints node names separated by whitespace/newlines -- confirm.
all_nodes = subprocess.check_output(["command-to-generate-nodes"]).decode().split()
#Sample at most 50 nodes; random.sample() raises ValueError when asked for more items than exist.
node_list = random.sample(all_nodes, min(50, len(all_nodes)))

#Command-line interface: year, month, day and hour are all mandatory and kept as strings.
parser = argparse.ArgumentParser()
for _flags, _help in (
    (("-y", "--year"), "year format in 'YYYY'"),
    (("-m", "--month"), "month format in 'MM'"),
    (("-d", "--day"), "day format in 'DD'"),
    (("-r", "--hour"), "hour format in 'HH'"),
):
    parser.add_argument(*_flags, help=_help, type=str, required=True)
args = parser.parse_args()

#Expose the raw argument strings under the from_* names.
from_year = args.year
from_month = args.month
from_day = args.day
from_hour = args.hour

#Epoch seconds for the requested hour (minute 0). The datetime is naive, so
#.timestamp() interprets it in the machine's local timezone.
from_time_convert = datetime(
    int(from_year),
    int(from_month),
    int(from_day),
    int(from_hour),
    0,
).timestamp()

#Grab data from three days before the release date (3 days * 24 h * 60 min * 60 s = 259200 s).
time_convert = str(int(from_time_convert) - 3 * 24 * 60 * 60)

#Main function for grabbing the data and munging it.
def get_data():
    """Fetch every metric for each node, merge them on timestamp, and write one CSV per node.

    Iterates over the module-level ``node_list``. For each node, every URL in
    the metric table is downloaded, the first series' ``datapoints`` are loaded
    into a two-column dataframe (metric value, ``timestamp``), rows with
    missing values are dropped, and the frames are inner-merged on
    ``timestamp``. A ``total-cpu`` column is added as the sum of ``cpu_user``,
    ``cpu_wait`` and ``cpu_sys``, and the result is written to
    ``~/metrics/server-purpose_<node>_<time_convert>.csv``.

    Failures are handled per node: the error is printed and processing
    continues with the next node. Returns ``None``.
    """
    #Dict of datasource URLs in graphite, keyed by metric name. The key becomes the dataframe
    #column name, so it must match the names used for the total-cpu calculation below.
    #NOTE(review): the URLs carry no scheme (urlopen needs e.g. 'http://') and do not vary by
    #node or time_convert -- presumably placeholders for the real graphite queries; confirm.
    metric_urls = {
        'response_time': 'www.example.com/response_time.json',
        'cpu_time': 'www.example.com/cpu_time.json',
        'processing_time': 'www.example.com/processing_time.json',
        'connection_time': 'www.example.com/connection_time.json',
        'request_load': 'www.example.com/request_load.json',
        'cpu_wait': 'www.example.com/cpu_wait.json',
        'cpu_sys': 'www.example.com/cpu_sys.json',
        'cpu_user': 'www.example.com/cpu_user.json',
    }

    for node in node_list:
        try:
            #Create a list of dataframes for a lambda function to iterate through and merge based on the timestamp.
            df_list = []

            #Iterate through the dict and grab the json data for each metric.
            for metric, url in metric_urls.items():
                #Close the HTTP response as soon as the body has been read.
                with urlopen(url) as http_response:
                    payload = json.loads(http_response.read().decode())
                datapoints = payload[0]['datapoints']
                #Naming the columns in the constructor also works when datapoints is empty,
                #where assigning .columns afterwards would raise a length mismatch.
                df_metric = pd.DataFrame(datapoints, columns=[metric, 'timestamp']).dropna()
                df_list.append(df_metric)

            #Inner-join all metric frames on their shared timestamp column.
            df_merged = reduce(lambda left, right: pd.merge(left, right, on='timestamp'), df_list)

            #Calculates the total cpu from the above metrics.
            df_merged['total-cpu'] = df_merged['cpu_user'] + df_merged['cpu_wait'] + df_merged['cpu_sys']

            df_merged.to_csv(path_or_buf='~/metrics/server-purpose_' + node + '_' + time_convert + '.csv')
            print('Report for ' + node + ' completed.')

        except Exception as e:
            #Best effort per node: report which node failed and why, then carry on with the rest.
            print('Report for {} failed: {!r}'.format(node, e))

#Run the collection immediately when the script is executed (there is no __main__ guard).
get_data()