# CloudFront 性能指标和分布统计演示

该Python代码主要使用了CloudFront日志 / pandas / scipy / matplotlib 进行数据收集 / 整理 / 计算 / 展现，提供了TM99 / PR99 等指标，以及直方图展现

### 引入相关库，访问参数设置

In [None]:
#! /bin/sh

! pip install pandasql
! pip install configparser

import boto3
import time
import sys                                        
import math                                       
import json
import pandas as pd
import pandasql as ps
from configparser import ConfigParser
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter


In [2]:

# AWS credential placeholders consumed by the boto3 clients created later in
# this notebook. Replace both values locally; never commit real keys.
ak, sk = '<access key>', '<security key>'


### 通过AWS SDK -- S3/Athena API 收集CloudFront日志

In [None]:

Session_list = []

# Athena SQL selecting the timing columns and the detailed edge result type
# from the CloudFront log table for a single hour and a single host.
# The pieces are adjacent string literals, so they concatenate into one
# single-line statement.
sql_query_traffic = (
    'select time_taken,time_to_first_byte,x_edge_detailed_result_type '
    '    from "my_cloudfront_logs" '
    "    where year = 2023 and month = 10 and day = 17 and hour = 11 AND host = '<domain name>' "
    ';'
)

# Athena client in us-east-1, authenticated with the explicit key pair
# defined earlier in the notebook.
client_athena = boto3.client(
    'athena',
    aws_access_key_id=ak,
    aws_secret_access_key=sk,
    region_name='us-east-1',
)

# Submit the query; the call returns right after submission, and Athena writes
# the result under the configured S3 output location.
response = client_athena.start_query_execution(
    QueryString=sql_query_traffic,
    WorkGroup='primary',
    QueryExecutionContext={'Database': 'default'},
    ResultConfiguration={'OutputLocation': 's3://<athena-query-result-folder>/'},
)

# Keep the execution id: the download step uses it as the result file name.
QEid = response['QueryExecutionId']
print(QEid)

  


 

In [22]:
# Wait for the Athena query submitted earlier (client_athena / QEid) to reach
# a terminal state before fetching its result file. Polling replaces a fixed
# 240 s sleep, which wasted time on fast queries and still attempted the
# download when the query was slow or had failed.
_POLL_INTERVAL_S = 5      # pause between status checks
_MAX_WAIT_S = 15 * 60     # give up after this long

_deadline = time.monotonic() + _MAX_WAIT_S
while True:
    _status = client_athena.get_query_execution(
        QueryExecutionId=QEid
    )['QueryExecution']['Status']
    _state = _status['State']
    if _state == 'SUCCEEDED':
        break
    if _state in ('FAILED', 'CANCELLED'):
        # Surface Athena's own explanation rather than failing later with a
        # confusing "object not found" error from S3.
        raise RuntimeError(
            'Athena query %s ended in state %s: %s'
            % (QEid, _state, _status.get('StateChangeReason', ''))
        )
    if time.monotonic() >= _deadline:
        raise TimeoutError(
            'Athena query %s still %s after %d seconds'
            % (QEid, _state, _MAX_WAIT_S)
        )
    time.sleep(_POLL_INTERVAL_S)

# S3 client using the same explicit key pair as the Athena client.
client_s3 = boto3.client(
        's3',
        aws_access_key_id = ak,
        aws_secret_access_key = sk,
    )

# The result object is requested as "<QueryExecutionId>.csv" from the result
# bucket and saved to the local path read by the next step.
client_s3.download_file('<athena-query-result-folder>', '%s.csv'%(QEid), '/<local-folder>/perf_1.csv')
  

### 将数据放入dataframe数据结构中

In [None]:
frames = []

# Lift pandas' display limits so long or wide frames are shown without
# truncation in the notebook output.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 5000),
    ('display.max_colwidth', None),
):
    pd.set_option(_option, _value)

# Load the downloaded query result and preview its first ten rows.
result_raw = pd.read_csv('/<local-folder>/perf_1.csv')
result_raw.head(10)

### 通过scipy进行指标计算

In [65]:
# Summary statistics for the time_to_first_byte column (presumably in seconds,
# given the 1-second percentile-rank threshold below -- confirm against the
# log schema).
result = result_raw
_ttfb = result['time_to_first_byte']

# Number of non-null time_to_first_byte values.
n = result.count()['time_to_first_byte']

# Scores at the 10th / 50th / 90th / 99th percentiles.
df_p10, df_p50, df_p90, df_p99 = (
    stats.scoreatpercentile(_ttfb, _pct) for _pct in (10, 50, 90, 99)
)

# Untrimmed mean, and the mean with the upper 1% of samples trimmed (TM99),
# both rounded to three decimals.
df_avg = stats.mstats.trimmed_mean(_ttfb, limits=(0, 0)).round(3)
df_tm99 = stats.mstats.trimmed_mean(_ttfb, limits=(0, 0.01)).round(3)

# Percentage of samples strictly below 1.
df_pr1 = stats.percentileofscore(_ttfb, 1, kind='strict')

# One-row table of all metrics, labelled "metric".
dfs = {
    'p10': df_p10,
    'p50': df_p50,
    'p90': df_p90,
    'p99': df_p99,
    'avg': df_avg,
    'tm99': df_tm99,
    'PR(:1s)': df_pr1,
    'count': n,
}
df = pd.DataFrame(dfs, index=['metric'])
print(df)

### 通过matplotlib绘制直方图

In [None]:
# Histogram of time_to_first_byte for all requests, scaled by 1000
# (presumably seconds -> milliseconds; confirm against the log schema).
# NaNs are dropped up front so the weights array always matches the plotted
# values in length (pandas drops NaNs itself before plotting, which would
# otherwise leave the weights one size too large).
data = (result['time_to_first_byte'] * 1000).dropna()
data.hist(
    bins=25,
    grid=False,
    figsize=(20, 10),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
    range=[0, 500],
    xlabelsize=15,
    ylabelsize=15,
    legend=True,  # boolean flag; the string 'true' only worked by being truthy
    # Equal weights summing to 1, so bar heights are fractions of all samples.
    weights=np.ones(len(data)) / len(data),
)
plt.xticks(range(0, 500, 20))
plt.title('fbl distribution')
plt.show()

In [None]:
# Histogram of time_to_first_byte restricted to rows whose detailed edge
# result type is 'Miss', scaled by 1000 (presumably seconds -> milliseconds;
# confirm against the log schema).
# NaNs are dropped before the weights are sized so both arrays always have
# the same length (pandas drops NaNs itself before plotting).
_is_miss = result_raw['x_edge_detailed_result_type'] == 'Miss'
data_miss = (result_raw.loc[_is_miss, 'time_to_first_byte'] * 1000).dropna()
data_miss.hist(
    bins=25,
    grid=False,
    figsize=(20, 10),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
    range=[0, 1000],
    xlabelsize=15,
    ylabelsize=15,
    legend=True,  # boolean flag; the string 'true' only worked by being truthy
    # Equal weights summing to 1, so bar heights are fractions of Miss samples.
    weights=np.ones(len(data_miss)) / len(data_miss),
)
plt.xticks(range(0, 1000, 50))
plt.title('fbl_Miss distribution')
plt.show()