# Experiment 1 - Results


In [1]:
!pip install plotly > 5.2 -U
!pip list | grep -i plotly
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

plotly                        5.4.0


In [2]:
# Mount Google Drive, which holds the experiment result files (and is used to save results)
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory so that later cells can refer to
# the result files by bare file name
%cd /content/drive/MyDrive/MSc/2020-21/Research\ Project/Colab/
# List the folder contents as a quick check of which files are available
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/MSc/2020-21/Research Project/Colab
 5.2                                       risdal_1250.parquet.yml
 bharadwaj.csv                             risdal_2500.parquet
 domain_count_df.csv                       risdal_2500.parquet.yml
 experiment_1_res_times_only.csv           risdal_5000.parquet
 experiment_1_res_times_stage_1_only.csv   risdal_5000.parquet.yml
 experiment_1_res_times_stage_2_only.csv   risdal_7500.parquet
'fake_news_eda.ipynb - Colaboratory.pdf'   risdal_7500.parquet.yml
 label_count_df.csv                        risdal.csv
 reuse_content_df.csv                      risdal.parquet
 reuse_content_domain_df.csv               risdal.parquet.yml
 reuse_content_url_df.csv                  stage_1_combi_res_df.csv
 risdal_10000.parquet                      stage_2_combi_res_df.csv
 risdal_10000.parquet_10000.yml            tem

In [3]:
# Authenticate the current Colab user so the session can connect to the GCP bucket
from google.colab import auth
auth.authenticate_user()

In [4]:
# Set GCP project ID and region to Europe West 2 - London
PROJECT = 'fake-news-bs-detector'
!gcloud config set project $PROJECT
REGION = 'europe-west2'
CLUSTER = '{}-cluster'.format(PROJECT)
!gcloud config set compute/region $REGION
!gcloud config list # show some information

Updated property [core/project].
Updated property [compute/region].
[component_manager]
disable_update_check = True
[compute]
gce_metadata_read_timeout_sec = 0
region = europe-west2
[core]
account = aaron.altrock@gmail.com
project = fake-news-bs-detector

Your active configuration is: [default]


## Read in the results from CSV files

In [5]:
# Load the per-stage timing results. The file names are relative, so they are
# resolved against the current working directory (the mounted Google Drive folder).

# --- Stage 1 ---
stage_1_res_file_nm = 'experiment_1_res_times_stage_1_only.csv'
stage_1_res_df = pd.read_csv(stage_1_res_file_nm)
# DataFrame.shape is (rows, columns); unpack it straight into the message
print('Stage 1: Dimension of {}: {} x {}'.format(stage_1_res_file_nm, *stage_1_res_df.shape))

# --- Stage 2 ---
stage_2_res_file_nm = 'experiment_1_res_times_stage_2_only.csv'
stage_2_res_df = pd.read_csv(stage_2_res_file_nm)
print('Stage 2: Dimension of {}: {} x {}'.format(stage_2_res_file_nm, *stage_2_res_df.shape))

Stage 1: Dimension of experiment_1_res_times_stage_1_only.csv: 25 x 6
Stage 2: Dimension of experiment_1_res_times_stage_2_only.csv: 25 x 6


In [6]:
# Stage 1: display the raw results table as loaded from the CSV
# (one row per run, with the batch size and a value column for each of 1a-1d)
stage_1_res_df

Unnamed: 0,Size,Stage,1a,1b,1c,1d
0,1250,Stage 1,45.64,108.75,324,475
1,1250,Stage 1,34.59,111.45,352,456
2,1250,Stage 1,35.49,107.28,356,458
3,1250,Stage 1,40.17,114.64,352,460
4,1250,Stage 1,36.37,109.3,357,491
5,2500,Stage 1,54.78,179.32,410,461
6,2500,Stage 1,54.74,180.87,366,460
7,2500,Stage 1,56.8,177.27,324,476
8,2500,Stage 1,55.48,184.47,370,441
9,2500,Stage 1,56.78,184.47,338,469


In [7]:
# Stage 2: display the raw results table as loaded from the CSV
# (one row per run, with the batch size and a value column for each of 1a-1d)
stage_2_res_df

Unnamed: 0,Size,Stage,1a,1b,1c,1d
0,1250,Stage 2,819.29,1015.82,545,991
1,1250,Stage 2,824.67,1148.38,550,1051
2,1250,Stage 2,828.73,911.12,500,990
3,1250,Stage 2,817.5,883.08,511,1053
4,1250,Stage 2,863.63,893.0,541,996
5,2500,Stage 2,1081.6,1710.55,666,1545
6,2500,Stage 2,1102.6,1698.07,672,1347
7,2500,Stage 2,1117.81,1733.36,645,1312
8,2500,Stage 2,1145.76,1692.81,630,2217
9,2500,Stage 2,1144.45,1723.29,656,1475


In [8]:
# Reshape the Stage 1 and Stage 2 results from wide to tidy (long) form:
# 'Size' and 'Stage' stay as identifiers, every other column becomes a
# ('variable', 'value') pair
stage_1_res_melted_df = stage_1_res_df.melt(id_vars=['Size', 'Stage'])
stage_2_res_melted_df = stage_2_res_df.melt(id_vars=['Size', 'Stage'])

# Median (50th percentile) of the values for each Size / Stage / variable group
stage_1_res_q50_df = (
    stage_1_res_melted_df
    .groupby(['Size', 'Stage', 'variable'])
    .quantile(q=0.5)
    .reset_index()
    .sort_values(by=['Stage', 'Size', 'variable'])
)
stage_2_res_q50_df = (
    stage_2_res_melted_df
    .groupby(['Size', 'Stage', 'variable'])
    .quantile(q=0.5)
    .reset_index()
    .sort_values(by=['Stage', 'Size', 'variable'])
)

In [9]:
# Stage 1: median value for each batch size and variable (1a-1d)
stage_1_res_q50_df

Unnamed: 0,Size,Stage,variable,value
0,1250,Stage 1,1a,36.37
1,1250,Stage 1,1b,109.3
2,1250,Stage 1,1c,352.0
3,1250,Stage 1,1d,460.0
4,2500,Stage 1,1a,55.48
5,2500,Stage 1,1b,180.87
6,2500,Stage 1,1c,366.0
7,2500,Stage 1,1d,461.0
8,5000,Stage 1,1a,93.74
9,5000,Stage 1,1b,316.33


In [10]:
# Stage 2: median value for each batch size and variable (1a-1d)
stage_2_res_q50_df

Unnamed: 0,Size,Stage,variable,value
0,1250,Stage 2,1a,824.67
1,1250,Stage 2,1b,911.12
2,1250,Stage 2,1c,541.0
3,1250,Stage 2,1d,996.0
4,2500,Stage 2,1a,1117.81
5,2500,Stage 2,1b,1710.55
6,2500,Stage 2,1c,656.0
7,2500,Stage 2,1d,1475.0
8,5000,Stage 2,1a,2361.9
9,5000,Stage 2,1b,3376.45


In [17]:
# Stage 1: scatter plot of time taken against batch size, coloured by
# configuration, with an Ordinary Least Squares (OLS) trendline per colour.
# Ref https://plotly.com/python/linear-fits/
fig = px.scatter(
    stage_1_res_melted_df,
    x='Size',
    y='value',
    color='variable',
    trendline='ols',
    trendline_options={'log_x': False},
    title='Stage 1 pipeline time taken to process varying batch size',
    width=800,
    height=800,
    labels={
        'Size': 'batch size (no. of publications)',
        'value': 'time taken (sec)',
        'variable': 'configuration',
    },
)
# Keep the fitted trendline results so the regression details can be printed afterwards
res = px.get_trendline_results(fig)

fig.show()

In [18]:
# Print the regression details for each configuration.
# NOTE: `res` is whatever the most recently executed trendline cell assigned, and the
# Stage 2 plot cell reassigns it - run this cell straight after the Stage 1 plot cell.
for cfg in ('1a', '1b', '1c', '1d'):
    print(f'Regression results - {cfg}:')
    # First (only expected) fit result for this configuration
    fit = res.query(f"configuration == '{cfg}'").px_fit_results.iloc[0]
    print(fit.summary())

Regression results - 1a:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.980
Model:                            OLS   Adj. R-squared:                  0.979
Method:                 Least Squares   F-statistic:                     1115.
Date:                Sat, 20 Nov 2021   Prob (F-statistic):           5.42e-21
Time:                        23:52:43   Log-Likelihood:                -78.512
No. Observations:                  25   AIC:                             161.0
Df Residuals:                      23   BIC:                             163.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         25.7134      

In [19]:
# Stage 2: scatter plot of time taken against batch size, coloured by
# configuration, with an Ordinary Least Squares (OLS) trendline per colour.
fig = px.scatter(
    stage_2_res_melted_df,
    x='Size',
    y='value',
    color='variable',
    trendline='ols',
    title='Stage 2 pipeline time taken to process varying batch size',
    width=800,
    height=800,
    labels={
        'Size': 'batch size (no. of publications)',
        'value': 'time taken (sec)',
        'variable': 'configuration',
    },
)
# Keep the fitted trendline results so the regression details can be printed afterwards
res = px.get_trendline_results(fig)

fig.show()

In [20]:
# Print the regression details for each configuration.
# NOTE: `res` is whatever the most recently executed trendline cell assigned, and the
# Stage 1 plot cell also assigns it - run this cell straight after the Stage 2 plot cell.
for cfg in ('1a', '1b', '1c', '1d'):
    print(f'Regression results - {cfg}:')
    # First (only expected) fit result for this configuration
    fit = res.query(f"configuration == '{cfg}'").px_fit_results.iloc[0]
    print(fit.summary())

Regression results - 1a:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.994
Method:                 Least Squares   F-statistic:                     3729.
Date:                Sat, 20 Nov 2021   Prob (F-statistic):           5.93e-27
Time:                        23:52:56   Log-Likelihood:                -153.24
No. Observations:                  25   AIC:                             310.5
Df Residuals:                      23   BIC:                             312.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        147.4440     4