In [None]:

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import seaborn as sns


# Load the raw benchmark log. The parsing below expects every line to hold four
# ' - '-separated fields carrying the textual markers 'Core: ', 'Size: ... MB'
# and 'Elapsed time:... sec'.
path = 'result.txt'
result_df = pd.read_csv(path, sep=' - ', header=None, engine='python')
result_df.columns = ['core', 'path', 'size', 'elapsed_time']
result_df = result_df.drop(columns='path')

# Strip the textual markers and convert to numbers. regex=False is passed
# explicitly: every pattern here is a literal, and '*' in particular must never be
# interpreted as a regular expression (the default of `regex` differs between
# pandas versions).
result_df['core'] = (
    result_df['core']
    .str.replace('Core: ', '', regex=False)
    .str.replace('*', '17', regex=False)  # a '*' core entry is encoded as 17
    .astype('int')
)
result_df['size'] = (
    result_df['size']
    .str.replace('Size: ', '', regex=False)
    .str.replace(' MB', '', regex=False)
    .astype('float')
)
result_df['elapsed_time'] = (
    result_df['elapsed_time']
    .str.replace('Elapsed time:', '', regex=False)
    .str.replace(' sec', '', regex=False)
    .astype('float')
)

# Distinct values, in order of first appearance in the file.
cores = result_df['core'].unique()
datasize = result_df['size'].unique()

# One row per data size, one column per remaining measurement.
# NOTE(review): this assumes the file lists all runs of one size contiguously and
# with the same core sequence for every size -- confirm against result.txt.
elapsed_time = result_df['elapsed_time'].values.reshape(len(datasize), -1)

# log_datasize
ldatasize = np.log(datasize)
ylabel = 'log(datasize)'

# log_elapsedtime
lelapsed_time = np.log(elapsed_time)
zlabel = 'log(elapsed time(sec))'

def plot_3dsurface(x,y,z, params):
    """Draw `z` as a 3D surface over the grid spanned by `x` and `y`.

    Parameters
    ----------
    x, y : 1-D sequences giving the coordinates along the two horizontal axes.
    z : 2-D array of heights; it has to match the arrays produced by
        ``np.meshgrid(x, y)``.
    params : dict with the keys 'xlabel', 'ylabel', 'zlabel' and 'title'.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so that callers can chain ``.savefig(...)``
    or ``.show()`` on the figure that was just created.
    """
    plt.figure(figsize=(13, 13))
    axes3d = plt.axes(projection='3d')

    # Apply the labels and the title in a fixed order.
    labelled_setters = (
        (axes3d.set_xlabel, 'xlabel'),
        (axes3d.set_ylabel, 'ylabel'),
        (axes3d.set_zlabel, 'zlabel'),
        (axes3d.set_title, 'title'),
    )
    for apply_text, key in labelled_setters:
        apply_text(params[key])

    grid_x, grid_y = np.meshgrid(x, y)
    axes3d.plot_surface(
        grid_x,
        grid_y,
        z,
        rstride=1,
        cstride=1,
        cmap='viridis',
        edgecolor='none',
    )
    return plt

def plot_2dsurface(x,y, params):
    """Plot `y` against `x` as a thick red line on a new 12x7 figure.

    Parameters
    ----------
    x : sequence of x coordinates.
    y : sequence or array of y values; it is transposed before plotting.
    params : dict with the keys 'xlabel', 'ylabel' and 'title'.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so that callers can chain ``.savefig(...)``
    or ``.show()`` on the figure that was just created.
    """
    plt.figure(figsize=(12, 7))

    y_values = np.transpose(y)
    plt.plot(x, y_values, 'r', linewidth=6.0)

    x_text = params['xlabel']
    plt.xlabel(x_text)
    y_text = params['ylabel']
    plt.ylabel(y_text)
    heading = params['title']
    plt.title(heading)

    plt.grid(True)
    return plt


def coredata(a, df=None):
    """Return the data sizes and elapsed times measured with exactly `a` cores.

    The previous implementation tested ``core % a == 0`` and therefore mixed in
    every core count divisible by `a` (with ``a == 1`` it returned all rows),
    although callers label the result as the series for "`a` core".

    Parameters
    ----------
    a : int
        Core count to select.
    df : pandas.DataFrame, optional
        Frame with 'core', 'size' and 'elapsed_time' columns. Defaults to the
        notebook-level ``result_df``.

    Returns
    -------
    (sizes, times) : tuple of two lists
        Values of the 'size' and 'elapsed_time' columns for the matching rows, in
        row order. Both lists are empty when no row uses `a` cores.
    """
    if df is None:
        df = result_df
    # A boolean mask avoids the manual row counter and does not depend on the
    # frame having a default 0..n-1 index.
    matching = df[df['core'] == a]
    return matching['size'].tolist(), matching['elapsed_time'].tolist()




# Correlation of the elapsed time with the core count and with the data size
# (Series.corr with its default settings).
elapsed_series = result_df['elapsed_time']
core_correlation = elapsed_series.corr(result_df['core'])
print("Correlation between elapse time and core  :", core_correlation)
size_correlation = elapsed_series.corr(result_df['size'])
print("Correlation between elapse time and size :", size_correlation)


In [None]:
# Log-log overview: matplotlib draws one line per column of lelapsed_time, all
# sharing ldatasize as x values. Labels, a title and a legend are added so the
# figure can be read on its own.
fig, ax = plt.subplots()
lines = ax.plot(ldatasize, lelapsed_time)
ax.set_xlabel('log(datasize)')
ax.set_ylabel('log(elapsed time(sec))')
ax.set_title('spark benchmark (log datasize vs log elapsed time)')
# Handles and labels are paired positionally; the columns are expected to follow
# the order of `cores` (see the reshape that builds elapsed_time).
ax.legend(lines, [f'core={c}' for c in cores])
ax.grid(True)
plt.show()

In [None]:

# 3D surface of the raw elapsed times; the title doubles as the output file name.
surface_title = 'spark benchmark (datasize vs #cores)'
surface_params = dict(
    xlabel='# core',
    ylabel='datasize',
    zlabel='elapse time',
    title=surface_title,
)
surface_plot = plot_3dsurface(x=cores, y=datasize, z=elapsed_time, params=surface_params)
surface_plot.savefig(surface_title)

In [None]:

# 3D surface of the log-transformed sizes and times; the title doubles as the
# output file name.
log_surface_title = 'spark benchmark (datasize vs #cores) with log'
log_surface_params = dict(
    xlabel='# core',
    ylabel='log(datasize)',
    zlabel='log(elapse time)',
    title=log_surface_title,
)
log_surface_plot = plot_3dsurface(x=cores, y=ldatasize, z=lelapsed_time, params=log_surface_params)
log_surface_plot.savefig(log_surface_title)

# speed-core benchmark for given data

## 1 Mb

In [None]:

# Row 0 of the timing matrices: raw timings first, then their logarithm.
raw_params = dict(
    title='spark benchmark (datasize=1 Mb vs #cores)',
    xlabel='#core',
    ylabel='elapse time',
)
plot_2dsurface(cores, np.transpose(elapsed_time[0]), params=raw_params).show()

log_params = dict(
    title='spark benchmark (datasize=1 Mb vs #cores) with log',
    xlabel='#core',
    ylabel='log(elapse time)',
)
plot_2dsurface(cores, np.transpose(lelapsed_time[0]), params=log_params).show()


## 10 Mb

In [None]:

# Row 1 of the timing matrices: raw timings first, then their logarithm.
raw_params = dict(
    title='spark benchmark (datasize=10 Mb vs #cores)',
    xlabel='#core',
    ylabel='elapse time',
)
plot_2dsurface(cores, np.transpose(elapsed_time[1]), params=raw_params).show()

log_params = dict(
    title='spark benchmark (datasize=10 Mb vs #cores) with log',
    xlabel='#core',
    ylabel='log(elapse time)',
)
plot_2dsurface(cores, np.transpose(lelapsed_time[1]), params=log_params).show()

## 100 Mb

In [None]:

# Row 2 of the timing matrices: raw timings first, then their logarithm.
raw_params = dict(
    title='spark benchmark (datasize=100 Mb vs #cores)',
    xlabel='#core',
    ylabel='elapse time',
)
plot_2dsurface(cores, np.transpose(elapsed_time[2]), params=raw_params).show()

log_params = dict(
    title='spark benchmark (datasize=100 Mb vs #cores) with log',
    xlabel='#core',
    ylabel='log(elapse time)',
)
plot_2dsurface(cores, np.transpose(lelapsed_time[2]), params=log_params).show()

## 1k Mb

In [None]:
# Row 3 of the timing matrices: raw timings first, then their logarithm.
raw_params = dict(
    title='spark benchmark (datasize=1k Mb vs #cores)',
    xlabel='#core',
    ylabel='elapse time',
)
plot_2dsurface(cores, np.transpose(elapsed_time[3]), params=raw_params).show()

log_params = dict(
    title='spark benchmark (datasize=1k Mb vs #cores) with log',
    xlabel='#core',
    ylabel='log(elapse time)',
)
plot_2dsurface(cores, np.transpose(lelapsed_time[3]), params=log_params).show()

## 10k Mb

In [None]:
# Row 4 of the timing matrix: raw timings only.
raw_params = dict(
    title='spark benchmark (datasize=10k Mb vs #cores)',
    xlabel='#core',
    ylabel='elapse time',
)
plot_2dsurface(cores, np.transpose(elapsed_time[4]), params=raw_params).show()

In [None]:
# Save one raw and one log-scaled figure per data-size row.
# The previous loop ran over range(0, 6) while only five labels exist, so it
# raised IndexError on the sixth pass. zip() stops at the shortest input, which
# keeps the labels and the rows of both timing matrices in step.
size_labels = ['1', '10', '100', '1k', '10k']
for size_label, row_times, row_log_times in zip(size_labels, elapsed_time, lelapsed_time):
    plot_2dsurface(cores, np.transpose(row_times),
                   params={'title': f'spark benchmark (datasize={size_label} Mb vs #cores)',
                           'xlabel': '#core',
                           'ylabel': 'elapse time'}).savefig(f'spark benchmark (datasize={size_label} Mb vs #cores')
    plot_2dsurface(cores, np.transpose(row_log_times),
                   params={'title': f'spark benchmark (datasize={size_label} Mb vs #cores) with log',
                           'xlabel': '#core',
                           'ylabel': 'log(elapse time)'}).savefig(f'spark benchmark (datasize={size_label} Mb vs #cores with log')

# speed-data benchmark for given core

In [None]:
# Show, for each a in 1..8, the series returned by coredata(a): first on the
# original scale, then with both axes log-transformed.
for a in range(1, 9):
    x, y = coredata(a)

    raw_params = dict(
        title=f'spark benchmark ({a} core vs #datasizes)',
        xlabel='data size',
        ylabel='elapse time',
    )
    plot_2dsurface(x, y, params=raw_params).show()

    log_params = dict(
        title=f'spark benchmark ({a} core vs #datasizes) with log',
        xlabel='data size ',
        ylabel='log(elapse time)',
    )
    plot_2dsurface(np.log(x), np.log(y), params=log_params).show()

In [None]:
# Save, for each a in 1..8, the series returned by coredata(a): first on the
# original scale, then with both axes log-transformed.
for a in range(1, 9):
    x, y = coredata(a)

    raw_params = dict(
        title=f'spark benchmark ({a} core vs #datasizes)',
        xlabel='data size',
        ylabel='elapse time',
    )
    raw_plot = plot_2dsurface(x, y, params=raw_params)
    raw_plot.savefig(f'spark benchmark ({a} core vs #datasizes')

    log_params = dict(
        title=f'spark benchmark ({a} core vs #datasizes) with log',
        xlabel='data size ',
        ylabel='log(elapse time)',
    )
    log_plot = plot_2dsurface(np.log(x), np.log(y), params=log_params)
    log_plot.savefig(f'spark benchmark ({a} core vs #datasizes with log')

# speed-up