# Exploring API Umbrella Elasticsearch analytics
This notebook shows some basic techniques to explore analytics data from an API Umbrella Elasticsearch instance.

## Imports
This notebook relies on several libraries. They are imported, and configured where possible, below.

In [None]:
import os
from pprint import pprint

# Data import
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Data analysis/exploration
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

# Visualization
import matplotlib.pyplot as plt
plt.figure(figsize=(17,7))

import seaborn as sns
sns.set(style="white", color_codes=True)

%matplotlib inline

## Elasticsearch initialization and query
The data for this notebook originates from an Elasticsearch server. The following code initializes the Elasticsearch client and requests the analytics data.

In [None]:
# Read the Elasticsearch URL from the ELASTICSEARCH_URL environment variable
elasticsearch_url = os.environ['ELASTICSEARCH_URL']

# Build an Elasticsearch client that uses that URL as its only host
client = Elasticsearch(hosts=[elasticsearch_url])

# Request the server info to check that the client can connect
client.info()

In [None]:
# Start from a search bound to the Elasticsearch client
search = Search(using=client)

# Ask how many results the search matches in total
total = search.count()

# Slice the search so that it covers every result
search = search[:total]

# Run the search and keep the response
results = search.execute()

## Pandas DataFrame
In order to explore the data, we want to load it into a Pandas DataFrame.

In [None]:
# Flatten the raw search hits into a Pandas DataFrame
results_df = json_normalize(results.hits.hits)

# Print the dataframe column names, one per line
# (nothing is printed when there are no columns)
column_names = [str(column) for column in results_df.columns]
if column_names:
    print('\n'.join(column_names))

In [None]:
# Count the non-null values in each column, most populated columns first.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# both yield (column name, count) pairs.
results_count_sorted = results_df.count().sort_values(ascending=False).items()

# Print each column name together with its count
for key, value in results_count_sorted:
    print(key, value)

In [None]:
# Parse the request_at values (interpreted as milliseconds) into datetimes
# and store them in a new request_date column
request_at_ms = results_df['_source.request_at']
results_df['request_date'] = pd.to_datetime(request_at_ms, unit='ms')

In [None]:
# Make request_date the dataframe index (the column itself is dropped)
results_df = results_df.set_index('request_date')

## Derived column(s)
We may want to compute some values based on existing columns. For example, we can classify each request as a success or a failure based on its status code.


*TODO*: add computed field to classify responses based on status class 
- 2xx: 'success'
- 3xx: 'warn'
- 4xx: 'client fail'
- 5xx: 'server fail'

In [None]:
def is_success_function(row):
    """
    Classify a row (request) as successful or failed.

    A request is successful when its '_source.response_status' value is a
    2xx status, i.e. at least 200 and below 300.

    row: mapping-like object (e.g. a DataFrame row) that contains the
        '_source.response_status' key

    return 'Success' if the status is 2xx, 'Failure' otherwise
    (a missing/NaN status compares false and is therefore 'Failure')
    """
    status = row['_source.response_status']

    return 'Success' if 200 <= status < 300 else 'Failure'

In [None]:
# Add an 'Outcome' column to the API logs by classifying every row (request)
# with is_success_function: 'Success' for a 2xx status, 'Failure' otherwise
results_df['Outcome'] = results_df.apply(is_success_function, axis=1)

# Visualize

## Status codes
The status code indicates, broadly, whether a request succeeded or failed.

- 2xx: success
- 3xx: warning
- 4xx: fail (client error)
- 5xx: fail (server error)

In [None]:
# Tally how many times each response status code occurs
response_statuses = results_df['_source.response_status']
status_code_value_counts = response_statuses.value_counts()

status_code_value_counts

In [None]:
# Bar chart of the response status code counts, in value_counts() order.
# The plot kind is passed by keyword: pandas >= 1.0 rejects positional
# arguments to Series.plot() with a TypeError.
api_response_code_distributions_plot = status_code_value_counts.plot(
    kind='bar',
    title='Response code counts',
    figsize=(17, 7))

In [None]:
# Open a new, wider and taller figure for the plot
plt.figure(figsize=(17, 7))

# Draw a seaborn countplot with one bar per response status code
status_code_counts_plot = sns.countplot(
    data=results_df,
    x='_source.response_status',
    saturation=0.7,
    color='teal',
)

## Success or failure
Show the proportion of calls that are successes (2xx) or failures (3xx, 4xx, 5xx).

## Proxy overhead
Proxy overhead is a measurement of how much latency the proxy adds to a request.

In [None]:
# Histogram of the proxy overhead values, grouped into 15 bins
results_df.hist(column='_source.proxy_overhead', bins=15, figsize=(17, 7))

In [None]:
# Median proxy overhead for each day
proxy_overhead = results_df['_source.proxy_overhead']
daily_median_proxy_overhead = proxy_overhead.resample('D').median()

# Days without a median value are shown as zero
daily_median_proxy_overhead_filled = daily_median_proxy_overhead.fillna(value=0)

# Draw the chart and keep the returned plot object in a variable
daily_median_proxy_overhead_chart = daily_median_proxy_overhead_filled.plot(
    title='Daily median proxy overhead',
    figsize=(17, 7),
)

# Analysis
Look for patterns and correlations in the data.

In [None]:
# Find correlation between columns in the analytics DataFrame.
# Only numeric and boolean columns are used: the frame also holds text
# columns (e.g. 'Outcome'), and DataFrame.corr() in pandas >= 2.0 raises on
# those instead of silently skipping them as older releases did.
numeric_results_df = results_df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_results_df.corr()

# Create a boolean mask that hides the upper right half of the heatmap
# (the diagonal is included, so only the lower triangle is drawn)
mask = np.zeros(correlation_matrix.shape, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Make the plot wider and taller
plt.figure(figsize=(17, 7))

# Show the unmasked correlations as a heatmap with square cells
correlation_heatmap = sns.heatmap(correlation_matrix, mask=mask, square=True)

In [None]:
# Show relationship between proxy overhead and response content length
# as a joint plot with a regression fit.
# The figure size is given as `height`: seaborn 0.9 renamed the `size`
# parameter, and later releases no longer accept `size` here.
proxy_overhead_response_content_length_plot = sns.jointplot(
    data=results_df,
    x='_source.proxy_overhead',
    y='_source.response_content_length',
    kind="reg",
    height=10,
).set_axis_labels("Proxy overhead", "Response content length")

In [None]:
# Show relationship between proxy overhead and response time
# as a joint plot with a regression fit.
# The figure size is given as `height`: seaborn 0.9 renamed the `size`
# parameter, and later releases no longer accept `size` here.
proxy_overhead_response_time_plot = sns.jointplot(
    data=results_df,
    x='_source.proxy_overhead',
    y='_source.response_time',
    kind="reg",
    height=10,
).set_axis_labels("Proxy overhead", "Response time")