# 4.2b - Export PCA table

In [3]:
import os
import sys
import datetime
import tensorflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict

import models
import report
import block_sampler
import batch_encoder
from dataset import Dataset
from report import Reporter
from trainer import Trainer
from batch_encoder import Dataset
from block_sampler import count_sectors, BlockSamplerByCategory, RandomSampler
from batch_encoder import xs_encoder_8bits_11, BatchEncoder
from filter_random import gen_rndchk_models, evaluate_rnd_model, filter_dataset

import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dense, Activation, TimeDistributed, Flatten, Dot, Softmax, Lambda, RepeatVector, Multiply, Permute, Reshape, BatchNormalization

In [4]:
# Run configuration. `minimum`/`maximum` are both fixed at 200, matching the
# "sample200" dataset folder name.
raw_dataset_folder = 'govdocs1/sample200'
minimum = maximum = 200

# All tables read and written by this notebook live under this directory;
# exist_ok=True makes re-running the cell harmless.
result_dir = 'results/4.2b-pairs'
os.makedirs(result_dir, exist_ok=True)

# Load results

In [5]:
# Load the tab-separated per-pair results (one row per cat1/cat2 pair).
pairs_path = f'{result_dir}/2classes.tsv'
data = pd.read_csv(pairs_path, sep='\t')
data

Unnamed: 0,cat1,cat2,elapsed,Epochs,val_binary_accuracy,val_categorical_accuracy
0,csv,dbase3,397.032949,16,0.996786,0.996786
1,csv,doc,575.376340,23,0.991786,0.991786
2,csv,dwf,526.062636,21,0.999643,0.999643
3,csv,eps,327.980584,13,0.975714,0.975714
4,csv,f,382.297024,15,0.972143,0.972143
...,...,...,...,...,...,...
373,txt,xls,996.888335,31,0.983214,0.983214
374,txt,xml,1090.040561,38,0.955714,0.955714
375,wp,xls,1226.306003,47,0.941786,0.941786
376,wp,xml,634.880889,18,0.966071,0.966071


In [6]:
# Reorder the pairs by validation accuracy (ascending is the default), then
# show only the pairs that scored below 0.7.
data = data.sort_values(by=['val_categorical_accuracy'])
data.loc[data['val_categorical_accuracy'].lt(0.7)]

Unnamed: 0,cat1,cat2,elapsed,Epochs,val_binary_accuracy,val_categorical_accuracy
323,pps,ppt,588.003708,12,0.517857,0.517857
176,gz,png,752.804796,23,0.593214,0.593214
90,dwf,png,851.85307,26,0.602143,0.602143
81,dwf,gz,661.326502,24,0.602857,0.602857
318,png,swf,1141.320112,31,0.603571,0.603571
324,pps,pptx,648.752734,14,0.613929,0.613929
97,dwf,swf,743.158201,21,0.618571,0.618571
91,dwf,pps,639.537662,19,0.624286,0.624286
312,png,pps,921.881599,24,0.624286,0.624286
314,png,pptx,816.892612,20,0.630357,0.630357


In [13]:
# Export the pair results as a three-column TSV: the class count (always 2
# here), the comma-joined category pair, and its validation accuracy.
nclasses_path = result_dir + '/nclasses.tsv'
with open(nclasses_path, 'w') as out:
    out.write('n\tcats\tval_categorical_accuracy\n')
    out.writelines(
        '2\t{}\t{}\n'.format(pair.cat1 + ',' + pair.cat2,
                             pair.val_categorical_accuracy)
        for _, pair in data.iterrows()
    )

# PCA

In [8]:
# Every category name that appears on either side of a pair.
cats = set(data['cat1']).union(data['cat2'])

In [9]:
# Two-way lookup between a category name and its position in sorted order.
catsnam = {pos: name for pos, name in enumerate(sorted(cats))}
catsidx = {name: pos for pos, name in catsnam.items()}

In [10]:
# Inspect the position -> category name lookup.
catsnam

{0: 'csv',
 1: 'dbase3',
 2: 'doc',
 3: 'dwf',
 4: 'eps',
 5: 'f',
 6: 'gif',
 7: 'gz',
 8: 'hlp',
 9: 'html',
 10: 'java',
 11: 'jpg',
 12: 'kml',
 13: 'kmz',
 14: 'log',
 15: 'pdf',
 16: 'png',
 17: 'pps',
 18: 'ppt',
 19: 'pptx',
 20: 'ps',
 21: 'rtf',
 22: 'sql',
 23: 'swf',
 24: 'txt',
 25: 'wp',
 26: 'xls',
 27: 'xml'}

In [11]:
# Inspect the category name -> position lookup.
catsidx

{'csv': 0,
 'dbase3': 1,
 'doc': 2,
 'dwf': 3,
 'eps': 4,
 'f': 5,
 'gif': 6,
 'gz': 7,
 'hlp': 8,
 'html': 9,
 'java': 10,
 'jpg': 11,
 'kml': 12,
 'kmz': 13,
 'log': 14,
 'pdf': 15,
 'png': 16,
 'pps': 17,
 'ppt': 18,
 'pptx': 19,
 'ps': 20,
 'rtf': 21,
 'sql': 22,
 'swf': 23,
 'txt': 24,
 'wp': 25,
 'xls': 26,
 'xml': 27}

In [23]:
# Build the square category-by-category table of pairwise validation
# accuracies. The size comes from the category lookup instead of a hard-coded
# 28, so the cell keeps working if the set of categories in the results changes.
cat_names = list(catsidx.keys())
n_cats = len(cat_names)

# Any cell without a measured pair (including the whole diagonal) keeps 0.5.
pair_acc = np.full((n_cats, n_cats), 0.5)
for c1, c2, acc in zip(data['cat1'], data['cat2'],
                       data['val_categorical_accuracy']):
    i, j = catsidx[c1], catsidx[c2]
    # Write both orientations so the matrix is symmetric.
    pair_acc[i, j] = acc
    pair_acc[j, i] = acc

# Label rows and columns with the category names in one step (no in-place
# rename, and `data5` is only ever bound to the finished DataFrame).
data5 = pd.DataFrame(pair_acc, index=cat_names, columns=cat_names)
data5

Unnamed: 0,csv,dbase3,doc,dwf,eps,f,gif,gz,hlp,html,...,ppt,pptx,ps,rtf,sql,swf,txt,wp,xls,xml
csv,0.5,0.996786,0.991786,0.999643,0.975714,0.972143,0.999286,0.998929,0.977857,0.978571,...,0.996071,0.998214,0.990357,0.986429,0.984286,0.998571,0.921786,0.986786,0.991429,0.976786
dbase3,0.996786,0.5,0.987857,0.995357,0.998929,0.995714,0.998214,0.998929,0.991429,0.985714,...,0.992143,0.997143,0.995,0.9975,0.983571,1.0,0.9975,0.998214,0.987143,0.9975
doc,0.991786,0.987857,0.5,0.9025,0.98,0.972143,0.880357,0.928929,0.971429,0.977857,...,0.860357,0.906071,0.970714,0.987143,0.981786,0.889286,0.980714,0.925,0.858571,0.977143
dwf,0.999643,0.995357,0.9025,0.5,0.996786,0.994286,0.8,0.602857,0.999286,0.998929,...,0.671071,0.652143,0.984286,0.996071,0.988929,0.618571,0.999286,0.986786,0.978214,1.0
eps,0.975714,0.998929,0.98,0.996786,0.5,0.914643,0.995357,0.989286,0.990714,0.984643,...,0.972857,0.991429,0.710714,0.961071,0.958929,0.995,0.975357,0.986786,0.983214,0.98
f,0.972143,0.995714,0.972143,0.994286,0.914643,0.5,0.998571,1.0,0.887857,0.925357,...,0.991429,0.990714,0.956071,0.985357,0.879643,0.999286,0.744286,0.9575,0.9825,0.954643
gif,0.999286,0.998214,0.880357,0.8,0.995357,0.998571,0.5,0.768571,0.998571,1.0,...,0.778929,0.766429,0.994286,0.997143,0.993929,0.826071,0.999643,0.983929,0.974643,0.996786
gz,0.998929,0.998929,0.928929,0.602857,0.989286,1.0,0.768571,0.5,0.999643,0.998571,...,0.7125,0.657857,0.985,0.989643,1.0,0.648929,1.0,0.993571,0.994643,0.999286
hlp,0.977857,0.991429,0.971429,0.999286,0.990714,0.887857,0.998571,0.999643,0.5,0.925714,...,0.989643,0.988571,0.992857,0.991786,0.948929,0.998929,0.885714,0.971071,0.974286,0.964643
html,0.978571,0.985714,0.977857,0.998929,0.984643,0.925357,1.0,0.998571,0.925714,0.5,...,0.994286,0.998571,0.9775,0.984643,0.936429,0.999286,0.921071,0.960357,0.98,0.850357


In [30]:
# Save the accuracy matrix as TSV, formatting every float with two decimals.
pca_table_path = f'{result_dir}/pca-table.tsv'
data5.to_csv(pca_table_path, sep='\t', float_format='%.2f')

In [27]:
# NOTE(review): leftover scratch expression. It only evaluates to the bound
# `str.format` method of the format string and is not used by any export cell;
# candidate for removal.
'{:,.2f}'.format

<function str.format>

In [23]:
# Hand-written export of the accuracy matrix to pca-table.tsv.
# The previous version wrote the category header but then emitted
# "2<TAB>cat1,cat2<TAB>acc" rows (the nclasses.tsv layout), leaving a malformed
# table in place of the matrix. Each row is now a category name followed by its
# accuracies, in the same column order as the header and with two decimals
# (the same '%.2f' format the `to_csv` export of `data5` uses).
cat_names = list(catsidx.keys())
with open(result_dir + '/pca-table.tsv', 'w') as f:
    # Leading empty cell is the header for the row-label column.
    f.write('\t' + '\t'.join(cat_names) + '\n')
    for cat in cat_names:
        # Look cells up by label so values always line up with the header.
        cells = ('%.2f' % data5.loc[cat, other] for other in cat_names)
        f.write(cat + '\t' + '\t'.join(cells) + '\n')

Unnamed: 0,csv,dbase3,doc,dwf,eps,f,gif,gz,hlp,html,...,"(24, 26)","(26, 24)","(24, 27)","(27, 24)","(25, 26)","(26, 25)","(25, 27)","(27, 25)","(26, 27)","(27, 26)"
csv,0.5,0.996786,0.991786,0.999643,0.975714,0.972143,0.999286,0.998929,0.977857,0.978571,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
dbase3,0.996786,0.5,0.987857,0.995357,0.998929,0.995714,0.998214,0.998929,0.991429,0.985714,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
doc,0.991786,0.987857,0.5,0.9025,0.98,0.972143,0.880357,0.928929,0.971429,0.977857,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
dwf,0.999643,0.995357,0.9025,0.5,0.996786,0.994286,0.8,0.602857,0.999286,0.998929,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
eps,0.975714,0.998929,0.98,0.996786,0.5,0.914643,0.995357,0.989286,0.990714,0.984643,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
f,0.972143,0.995714,0.972143,0.994286,0.914643,0.5,0.998571,1.0,0.887857,0.925357,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
gif,0.999286,0.998214,0.880357,0.8,0.995357,0.998571,0.5,0.768571,0.998571,1.0,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
gz,0.998929,0.998929,0.928929,0.602857,0.989286,1.0,0.768571,0.5,0.999643,0.998571,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
hlp,0.977857,0.991429,0.971429,0.999286,0.990714,0.887857,0.998571,0.999643,0.5,0.925714,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
html,0.978571,0.985714,0.977857,0.998929,0.984643,0.925357,1.0,0.998571,0.925714,0.5,...,0.983214,0.983214,0.955714,0.955714,0.941786,0.941786,0.966071,0.966071,0.983214,0.983214
