In [1]:
import json, sys, socket, mimetypes
import pandas as pd
import numpy as np

In [2]:
################# Helper Methods ################
def get_domain(request):
    """Return the host that a HAR request entry was addressed to.

    The value of the request's ``Host`` header is used when exactly one such
    header is present. Otherwise the host is extracted from the request URL.

    Parameters
    ----------
    request : dict
        HAR ``request`` object. Must provide ``headers`` (a list of
        ``{'name': ..., 'value': ...}`` dicts) and ``url`` (a string).

    Returns
    -------
    str
        The ``Host`` header value, or the authority part of the URL
        (including a port, if the URL carries one).
    """
    # Header field names are case-insensitive, so 'host' must match as well.
    hosts = [header['value'] for header in request['headers']
             if header['name'].lower() == 'host']
    if len(hosts) == 1:
        return hosts[0]

    parts = request['url'].split('/')
    # 'scheme://authority/path' splits into ['scheme:', '', 'authority', ...].
    # Checking for that exact shape (instead of "'http' in parts[0]") keeps
    # scheme-less hosts such as 'httpbin.org/get' from being misread.
    if len(parts) >= 3 and parts[0].endswith(':') and parts[1] == '':
        return parts[2]
    return parts[0]
#################################################

In [3]:
################# Helper Methods ################
def make_request_str(request_dict):
    """Serialise a HAR request entry into raw HTTP request bytes.

    Only ``GET`` requests are supported. Any ``Accept-Encoding`` header is
    dropped from the serialised request.

    Parameters
    ----------
    request_dict : dict
        HAR ``request`` object with ``method``, ``url``, ``httpVersion`` and
        ``headers`` (a list of ``{'name': ..., 'value': ...}`` dicts).

    Returns
    -------
    bytes or str
        The UTF-8 encoded request (request line, headers, blank line), or the
        string ``'Invalid'`` when the method is not ``GET``.
    """
    method = request_dict['method']
    if method != "GET":
        print("Non-GET method - ",method)
        return 'Invalid'

    # Header names are case-insensitive: 'accept-encoding' must be filtered
    # exactly like 'Accept-Encoding'.
    header_lines = [
        header['name'] + ": " + header['value'] + "\r\n"
        for header in request_dict['headers']
        if header['name'].lower() != 'accept-encoding'
    ]
    request_line = method + " " + request_dict['url'] + " " + request_dict['httpVersion'] + "\r\n"
    return (request_line + ''.join(header_lines) + "\r\n").encode('utf-8')

def parse_response_header(response_header):
    """Parse a raw HTTP response header block into a ``{name: value}`` dict.

    Parameters
    ----------
    response_header : str
        Header text with lines separated by ``\\r\\n``. Lines that are not
        ``name: value`` fields (the status line, blank lines) are ignored.

    Returns
    -------
    dict
        Header names mapped to their values with surrounding whitespace
        stripped. When a name occurs more than once the last value wins.
    """
    response_dict = {}
    for line in response_header.split('\r\n'):
        # Split on the FIRST colon only, so values that themselves contain
        # ': ' are kept whole and 'Name:value' (no space) is still parsed.
        name, separator, value = line.partition(':')
        # Field names never contain spaces; this also skips a status line
        # whose reason phrase happens to contain a colon.
        if not separator or not name or ' ' in name:
            continue
        response_dict[name] = value.strip()
    return response_dict

def get_file_extension(mime_type):
    """Choose a file extension for a ``Content-Type`` header value.

    Parameters
    ----------
    mime_type : str
        A media type, optionally followed by parameters
        (for example ``'text/html; charset=utf-8'``).

    Returns
    -------
    str
        The extension, including the leading dot. When ``mimetypes`` does not
        know the type, ``image``/``audio``/``video`` types fall back to their
        subtype and everything else falls back to ``'.txt'``.
    """
    # Parameters such as '; charset=utf-8' are not part of the media type;
    # left in place they make guess_extension() miss known types and would
    # leak into the fallback extension.
    media_type = mime_type.split(';', 1)[0].strip()
    ext = mimetypes.guess_extension(media_type)
    if ext is None:
        parts = media_type.split('/')
        ext = '.' + (parts[-1] if parts[0] in ('image', 'audio', 'video') else 'txt')
    return ext
#################################################

In [4]:
################ HAR Data Parsing ###############
def har_parser(json_filepath):
    """Load a HAR file and tabulate its entries.

    Parameters
    ----------
    json_filepath : str
        Path to a HAR (JSON) file.

    Returns
    -------
    tuple
        ``(data, essential_df, complete_df)`` where ``data`` is the decoded
        JSON document, ``essential_df`` holds one row per entry with the
        columns ``connection_id``, ``domain``, ``total_time``,
        ``requestSize``, ``responseStatus``, ``contentSize`` and
        ``request_dict``, and ``complete_df`` is ``essential_df`` with the
        entry's ``timings`` fields appended as extra columns.
    """
    # 'utf-8-sig' reads plain UTF-8 and also tolerates a leading BOM, instead
    # of depending on the platform's default encoding.
    with open(json_filepath, encoding='utf-8-sig') as json_data:
        data = json.load(json_data)

    essential_df_col = ['connection_id','domain','total_time','requestSize','responseStatus','contentSize','request_dict']

    # Collect plain records and build each frame once: DataFrame.append and
    # reindex_axis no longer exist in current pandas, and growing a frame
    # row by row is quadratic.
    essential_rows = []
    timing_rows = []
    for entry in data['log']['entries']:
        timings = entry['timings']
        request = entry['request']
        response = entry['response']
        # Sum every non-negative numeric timing; negative values and
        # non-numeric fields are left out of the total.
        total_time = sum(
            value for value in timings.values()
            if isinstance(value, (int, float)) and value >= 0
        )
        essential_rows.append([
            entry.get('connection', np.nan),
            get_domain(request),
            total_time,
            request['headersSize'] + request['bodySize'],
            response['status'],
            response['content']['size'],
            request,
        ])
        timing_rows.append(timings)

    essential_df = pd.DataFrame(essential_rows, columns=essential_df_col)
    timing_df = pd.DataFrame(timing_rows)
    complete_df = pd.concat([essential_df, timing_df], axis=1)
    return data, essential_df, complete_df
#################################################

In [5]:
################ Domain Analysis ################
def domain_wise_analyser(data,complete_df):
    """Summarise downloaded objects per domain and per connection.

    Parameters
    ----------
    data : object
        Not used by this function; kept so existing callers keep working.
    complete_df : pandas.DataFrame
        One row per downloaded object. The columns ``domain``,
        ``connection_id``, ``contentSize`` and ``dns`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per domain with the columns ``Domain``, ``# Objects``,
        ``# Non-Zero Objects``, ``Content Size``, ``# Connections`` and
        ``Connection Analysis``. The last column holds a nested DataFrame
        with one row per connection used for that domain.
    """
    analysis_df_col =['Domain','# Objects','# Non-Zero Objects','Content Size','# Connections',"Connection Analysis"]
    connection_df_col = ['Connection ID','# Objects','# Non-Zero Objects','Content Size','DNS']
    analysis_df = pd.DataFrame(columns=analysis_df_col)
    for idx, (domain_name, domain_df) in enumerate(complete_df.groupby('domain')):
        connection_analysis_df = pd.DataFrame(columns=connection_df_col)
        for idy, (connection_id, connection_df) in enumerate(domain_df.groupby('connection_id')):
            connection_analysis_df.loc[idy] = [
                connection_id,
                len(connection_df),
                (connection_df['contentSize'] > 0).sum(),
                connection_df['contentSize'].sum(),
                list(connection_df['dns']),
            ]
        # '# Connections' is the number of distinct connections, i.e. the
        # number of rows in the per-connection table, not the object count.
        connection_count = len(connection_analysis_df)
        analysis_df.loc[idx] = [
            domain_name,
            len(domain_df),
            (domain_df['contentSize'] > 0).sum(),
            domain_df['contentSize'].sum(),
            connection_count,
            connection_analysis_df,
        ]
    return analysis_df

def print_domain_wise_analysis(analysis_df, data=None):
    """Print the per-domain analysis and its totals.

    Prompts on standard input; the per-domain details are printed only when
    the answer is ``'y'``. The totals are always printed.

    Parameters
    ----------
    analysis_df : pandas.DataFrame
        Frame with the columns ``Domain``, ``# Objects``,
        ``# Non-Zero Objects``, ``Content Size``, ``# Connections`` and
        ``Connection Analysis`` (a nested DataFrame), in that order.
    data : dict, optional
        Decoded HAR document used for the page load time. When omitted, the
        module-level ``data`` variable is used if it exists, which is what
        this function always read; when neither is available the page load
        time line is left out.
    """
    analysis_df_col = analysis_df.columns
    rule = '-------------------------------------------------------'
    print_domain_analysis = input("\33[93mEnter 'y' to see Domain-wise analysis: \33[0m")
    if print_domain_analysis == 'y':
        # Separator printed between label and value for columns 1..4.
        separators = {1: "\t-\t", 2: "\t-\t", 3: "-\t", 4: "-\t"}
        print(rule)
        for x in range(analysis_df.shape[0]):
            # Use .iloc on the row too: integer keys on a label-indexed
            # Series are deprecated as positional lookups.
            row = analysis_df.iloc[x]
            print('\33[91m',analysis_df_col[0],'\33[0m',"\t-\t",'\33[95m',row.iloc[0],'\33[0m')
            for k in range(1, 5):
                print('\33[92m',analysis_df_col[k],'\33[0m',separators[k],'\33[94m',row.iloc[k],'\33[0m')
            print('\33[93m',analysis_df_col[5],'\33[0m')
            print(row.iloc[5].to_string(index=False))
            print(rule)

    print('\33[92mTotal Number of Domains\33[0m - \33[95m',analysis_df.shape[0],'\33[0m')
    print('\33[92mTotal Number of Objects Downloaded\33[0m - \33[95m',analysis_df['# Objects'].sum(),'\33[0m')
    print('\33[92mTotal Number of Non-Zero Objects Downloaded\33[0m - \33[95m',analysis_df['# Non-Zero Objects'].sum(),'\33[0m')
    print('\33[92mTotal Content Downloaded\33[0m - \33[95m{0:.2f} MB\33[0m'.format(analysis_df['Content Size'].sum()/1024**2))

    if data is None:
        # Backward compatibility for callers that rely on the global.
        data = globals().get('data')
    if data is not None:
        print('\33[92mPage Load Time\33[0m - \33[95m{0:.3f} s\33[0m'.format(data['log']['pages'][0]['pageTimings']['onLoad']/1000))
#################################################

In [6]:
############ TCP Connection Analysis ############
def _goodput(content_size, receive_time):
    """Return size/time; 0 when nothing was received, inf when it took no time."""
    if content_size == 0:
        return 0
    if receive_time == 0:
        return float('inf')
    return content_size/receive_time


def connection_wise_analyser(data,complete_df):
    """Summarise timing and goodput per connection.

    Parameters
    ----------
    data : object
        Not used by this function; kept so existing callers keep working.
    complete_df : pandas.DataFrame
        One row per downloaded object. The columns ``connection_id``,
        ``connect``, ``wait``, ``receive`` and ``contentSize`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per connection with the columns ``Connection ID``,
        ``Connect Time``, ``Average Wait Time``, ``Total Receive Time``,
        ``Total Content Size``, ``Average Goodput`` and ``Maximum Goodput``.
        ``Maximum Goodput`` is the goodput of the connection's largest
        object. Goodput values are content size divided by receive time, in
        the units of the input columns.
    """
    tcp_analysis_df_col = ['Connection ID','Connect Time','Average Wait Time','Total Receive Time','Total Content Size','Average Goodput','Maximum Goodput']
    rows = []
    for connection_id, group in complete_df.groupby('connection_id'):
        sizes = group['contentSize'].to_numpy(dtype=float)
        receive_times = group['receive'].to_numpy(dtype=float)

        connect_time = np.max(group['connect'])
        avg_wait_time = group['wait'].sum()/len(group)
        total_receive_time = group['receive'].sum()
        total_content_size = group['contentSize'].sum()
        avg_goodput = _goodput(total_content_size, total_receive_time)

        # np.argmax yields a POSITION within the group; index the positional
        # arrays with it rather than using it as an index label, which
        # raises KeyError or silently picks the wrong row.
        largest = int(np.argmax(sizes))
        max_goodput = _goodput(sizes[largest], receive_times[largest])

        rows.append([connection_id,connect_time,avg_wait_time,total_receive_time,total_content_size,avg_goodput,max_goodput])
    return pd.DataFrame(rows, columns=tcp_analysis_df_col)

def print_connection_wise_analysis(tcp_analysis_df):
    """Print the per-connection analysis and network-wide totals.

    Prompts on standard input; the per-connection details are printed only
    when the answer is ``'y'``. The totals are always printed.

    Parameters
    ----------
    tcp_analysis_df : pandas.DataFrame
        Frame with the columns ``Connection ID``, ``Connect Time``,
        ``Average Wait Time``, ``Total Receive Time``,
        ``Total Content Size``, ``Average Goodput`` and ``Maximum Goodput``,
        in that order.
    """
    tcp_analysis_df_col = tcp_analysis_df.columns
    rule = '-------------------------------------------------------'
    print_tcp_analysis = input("\33[93mEnter 'y' to see TCP Connection-wise analysis: \33[0m")
    if print_tcp_analysis == 'y':
        print(rule)
        for x in range(tcp_analysis_df.shape[0]):
            # Use .iloc on the row too: integer keys on a label-indexed
            # Series are deprecated as positional lookups.
            row = tcp_analysis_df.iloc[x]
            print('\33[91m',tcp_analysis_df_col[0],'\33[0m',"\t-\t",'\33[95m',row.iloc[0],'\33[0m')
            print('\33[92m',tcp_analysis_df_col[1],'\33[0m',"\t\t-\t",'\33[94m',row.iloc[1],'\33[0m')
            for k in range(2, 7):
                print('\33[92m',tcp_analysis_df_col[k],'\33[0m',"\t-\t",'\33[94m',row.iloc[k],'\33[0m')
            print(rule)

    # Receive time is divided by 1000 and content size by 1024**2 so the
    # ratio below matches the 's', 'MB' and 'MB/s' labels.
    total_network_receive_time = tcp_analysis_df['Total Receive Time'].sum()/1000
    total_network_content_size = tcp_analysis_df['Total Content Size'].sum()/1024**2

    # Same convention as the per-connection goodput: 0 when nothing was
    # downloaded, inf when the download took no measurable time.
    if total_network_content_size == 0:
        average_network_goodput = 0
    elif total_network_receive_time == 0:
        average_network_goodput = float('inf')
    else:
        average_network_goodput = total_network_content_size/total_network_receive_time

    max_goodputs = tcp_analysis_df['Maximum Goodput']
    finite_goodputs = max_goodputs[max_goodputs < float('inf')]
    # np.max raises on an empty selection (no connection with finite goodput).
    if len(finite_goodputs) > 0:
        maximum_network_goodput = np.max(finite_goodputs*(1000/(1024**2)))
    else:
        maximum_network_goodput = float('nan')

    print('\33[92mTotal Number of Connections\33[0m - \33[95m',tcp_analysis_df.shape[0],'\33[0m')
    print('\33[92mTotal Content Downloaded\33[0m - \33[95m{0:.2f} MB\33[0m'.format(total_network_content_size))
    print('\33[92mTotal Receive Time\33[0m - \33[95m{0:.2f} s\33[0m'.format(total_network_receive_time))
    # TODO - Check Average Network Goodput Def.
    print('\33[92mAverage Network Goodput\33[0m - \33[95m{0:.2f} MB/s\33[0m'.format(average_network_goodput))
    print('\33[92mMaximum Network Goodput\33[0m - \33[95m{0:.2f} MB/s\33[0m'.format(maximum_network_goodput))
#################################################

In [7]:
###### Testing ######
# Parse one HAR capture, build both analysis tables, then print them.
# The names bound here (data, essential_df, complete_df, analysis_df,
# tcp_analysis_df) are module-level and are read again by later cells.
json_filepath = 'data/HAR Dumps/Indian Express/unthrottled_windows.har'
data, essential_df, complete_df = har_parser(json_filepath)
analysis_df = domain_wise_analyser(data,complete_df)
tcp_analysis_df = connection_wise_analyser(data,complete_df)
# Both print helpers prompt on stdin before showing the detailed tables.
print_domain_wise_analysis(analysis_df)
print_connection_wise_analysis(tcp_analysis_df)
#####################

Enter 'y' to see Domain-wise analysis: \
[92mTotal Number of Domains[0m - [95m 71 [0m
[92mTotal Number of Objects Downloaded[0m - [95m 344 [0m
[92mTotal Number of Non-Zero Objects Downloaded[0m - [95m 299 [0m
[92mTotal Content Downloaded[0m - [95m5.47 MB[0m
[92mPage Load Time[0m - [95m11.203 s[0m
Enter 'y' to see TCP Connection-wise analysis: 
[92mTotal Number of Connections[0m - [95m 101 [0m
[92mTotal Content Downloaded[0m - [95m5.47 MB[0m
[92mTotal Receive Time[0m - [95m3.53 s[0m
[92mAverage Network Goodput[0m - [95m1.55 MB/s[0m
[92mMaximum Network Goodput[0m - [95m22.98 MB/s[0m


In [8]:
############# Object Download Method ############
def _content_length(header_bytes):
    """Return the Content-Length declared in raw response headers, or None."""
    for line in header_bytes.split(b"\r\n")[1:]:
        name, separator, value = line.partition(b":")
        if separator and name.strip().lower() == b"content-length":
            try:
                return int(value.strip().decode('ascii'))
            except ValueError:
                return None
    return None


def get_object(sck,request_dict,timeout=10.0):
    """Replay a HAR GET request over ``sck`` and return the response.

    Parameters
    ----------
    sck : socket.socket
        An unconnected TCP socket. It is connected to port 80 of the host
        named in the request's ``Host`` header. The socket is not closed.
    request_dict : dict
        HAR ``request`` object, as accepted by ``make_request_str``.
    timeout : float or None, optional
        Timeout in seconds applied to the socket when it has none set, so a
        server that keeps the connection open cannot block forever. Pass
        ``None`` to leave the socket untouched.

    Returns
    -------
    tuple
        ``(header_text, content)``. ``content`` is ``bytes`` when the header
        terminator was found and ``""`` otherwise. Failures are reported as
        ``("Error occured", "")`` and unsupported requests as
        ``("Invalid request encountered", "")``.
    """
    request = make_request_str(request_dict)
    if request == "Invalid":
        return ("Invalid request encountered","")

    # Header names are case-insensitive.
    domain = [headers['value'] for headers in request_dict['headers'] if headers['name'].lower() == 'host']
    terminator = b"\r\n\r\n"
    try:
        if timeout is not None and sck.gettimeout() is None:
            sck.settimeout(timeout)
        sck.connect((domain[0],80))
        # sendall: a single send() may transmit only part of the request.
        sck.sendall(request)

        received = bytearray()
        header_end = -1
        expected_length = None
        while True:
            try:
                chunk = sck.recv(1024)
            except socket.timeout:
                # Keep whatever arrived before the server went quiet.
                break
            if not chunk:
                break
            received += chunk
            if header_end < 0:
                position = received.find(terminator)
                if position < 0:
                    continue
                header_end = position + len(terminator)
                expected_length = _content_length(bytes(received[:header_end]))
            # Stop once the declared body has arrived instead of waiting for
            # a keep-alive connection to be closed by the server.
            if expected_length is not None and len(received) - header_end >= expected_length:
                break

        if header_end < 0:
            return (bytes(received).decode('utf-8', errors='replace'),"")
        header_text = bytes(received[:header_end]).decode('utf-8', errors='replace')
        return (header_text,bytes(received[header_end:]))
    except Exception:
        # Narrower than a bare except: KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return("Error occured","")
#################################################

In [9]:
# Per domain, open up to `max_tcp` sockets and serialise one request per
# socket, provided the domain has at most max_tcp * max_obj objects.
# NOTE(review): the serialised requests are currently discarded and
# `out_folder_domain` is unused - this cell looks like work in progress.
max_tcp = 4
max_obj = 5
json_filepath = 'data/HAR Dumps/Indian Express/unthrottled_windows.har'
out_folder = 'out/' + json_filepath.split('/')[2] + ' - ' + json_filepath.split('/')[3].split('.')[0] + '/'
domain_grouping = complete_df.groupby('domain')
for idx, x in enumerate(domain_grouping):
    out_folder_domain = out_folder + x[0]
    sorted_data = x[1].sort_values(by='contentSize',axis=0,ascending=True)
    obj_count = len(sorted_data)
    scks = [socket.socket(socket.AF_INET,socket.SOCK_STREAM) for i in range(min(max_tcp,obj_count))]
    try:
        if obj_count <= max_tcp * max_obj:
            for idk, sck in enumerate(scks):
                make_request_str(sorted_data['request_dict'].iloc[idk])
        else:
            print("Exceeded")
    finally:
        # Release this domain's sockets; otherwise every iteration leaks up
        # to `max_tcp` open file descriptors.
        for sck in scks:
            sck.close()
        

Exceeded
Exceeded
Exceeded
Exceeded


In [10]:
# Download the object described by HAR entry `i`.
i = 5
req_dict = data['log']['entries'][i]['request']
# The context manager closes the socket even if the lookup above or the
# download raises or is interrupted.
with socket.socket(socket.AF_INET,socket.SOCK_STREAM) as s:
    response_header, content = get_object(s,req_dict)

In [11]:
# Show the parsed response headers, the extension chosen for the payload
# and the number of content bytes actually received.
response_dict = parse_response_header(response_header)
# Plain loop instead of a list comprehension run for its side effects;
# binding the result to `_` also shadowed IPython's last-output variable.
for key, value in response_dict.items():
    print(key,": ",value)
ext = get_file_extension(response_dict['Content-Type'])
print('Extension: ',ext)
print('Content-Length (received): ',len(content))

Accept-Ranges :  bytes
X-ac :  4.hkg _bur
Cache-Control :  max-age=31536000
Last-Modified :  Tue, 12 Sep 2017 13:23:50 GMT
Connection :  keep-alive
Server :  nginx
Date :  Sun, 24 Sep 2017 15:23:53 GMT
ETag :  "59b7dfe6-2b6b"
Access-Control-Allow-Methods :  GET, HEAD
Vary :  Accept-Encoding
Content-Length :  11115
Content-Type :  image/svg+xml
X-nc :  HIT hkg 32
Access-Control-Allow-Origin :  *
Expires :  Wed, 12 Sep 2018 13:25:53 GMT
Extension:  .svg
Content-Length (received):  11115
