In [1]:
from pathlib import Path
import pandas as pd

In [2]:
def asint(x):
    """Convert ``x`` to ``int``, falling back to ``0`` when it is not convertible.

    Parameters
    ----------
    x : object
        Any value accepted by the built-in ``int`` (e.g. a numeric string
        or a float), or an arbitrary value that is not.

    Returns
    -------
    int
        ``int(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(x)
    # Only conversion failures are mapped to 0; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0

# Ground-truth table: read with the default NaN detection switched off,
# ordered by term and indexed by term.
gt_file = '../data/out/SemD/13428_2012_278_MOESM1_ESM.csv'
gt = (
    pd.read_csv(gt_file, keep_default_na=False)
    .sort_values('term')
    .set_index('term')
)
# Entries of BNC_contexts that cannot be parsed as integers become 0 (see asint).
gt['BNC_contexts'] = gt['BNC_contexts'].map(asint).astype(int)
gt

Unnamed: 0_level_0,mean_cos,SemD,BNC_wordcount,BNC_contexts,BNC_freq,lg_BNC_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa,0.020000,1.69,577,314,6.6,0.88
aah,0.085245,1.07,92,58,1.1,0.31
aback,0.025864,1.59,294,293,3.4,0.64
abacus,0.022685,1.64,51,40,0.6,0.20
abandon,0.008199,2.09,1257,1193,14.4,1.19
...,...,...,...,...,...,...
zoom,0.051591,1.29,241,161,2.8,0.58
zoomed,0.029300,1.53,64,59,0.7,0.24
zooming,0.023874,1.62,61,54,0.7,0.23
zoos,0.079977,1.10,144,87,1.7,0.42


In [20]:
def semd_correlation(directory, file_name, local_gt=gt):
    """Correlate a computed SemD table with a ground-truth table, column by column.

    Parameters
    ----------
    directory : str or Path
        Directory that contains ``file_name`` and, optionally, a
        tab-separated ``entropy_transform.csv`` with per-term statistics.
    file_name : str
        Name of the tab-separated SemD file inside ``directory``. Its first
        column is used as the index; a ``semd`` column is renamed to ``SemD``.
    local_gt : pandas.DataFrame
        Ground-truth table indexed by term. The default is the module-level
        ``gt`` object as it was bound when this function was defined.

    Returns
    -------
    pandas.Series
        Result of ``DataFrame.corrwith`` between the SemD table and the
        ground truth, named ``'correlation'``.

    Raises
    ------
    AssertionError
        If the statistics table and the SemD table differ in length, or if
        the filtered SemD table and the filtered ground truth differ in length.

    Side effects
    ------------
    Prints the dtypes of the SemD table and the shared vocabulary size, and
    writes the filtered table as CSV (floats with three decimals) next to the
    input file, with the input's last suffix replaced by ``.stats_semd``.
    """
    base_dir = Path(directory)

    # Optional corpus statistics; a missing file is reported, not fatal.
    stats_file = base_dir / 'entropy_transform.csv'
    try:
        stats = pd.read_csv(stats_file, sep='\t', keep_default_na=False)
    except FileNotFoundError:
        print(f"Cannot find {stats_file}")
        stats = None
    else:
        # Align the statistics' column names with those of the ground truth.
        column_map = {
            'context_freq': 'BNC_contexts',
            'corpus_freq': 'BNC_wordcount',
            'freq': 'BNC_freq',
            'log_freq': 'lg_BNC_freq',
        }
        stats = (
            stats.sort_values('term')
            .rename(columns=column_map)
            .set_index('term')
        )
        # Keep only columns the ground truth also has, in ground-truth order.
        shared_columns = [col for col in local_gt.columns if col in stats.columns]
        stats = stats[shared_columns]

    semd_file = base_dir / file_name
    semd = pd.read_csv(
        semd_file,
        sep='\t',
        keep_default_na=False,
        index_col=0,
        na_values=['', 'NA'],
    )
    semd.index.name = 'term'
    semd = semd.rename(columns={'semd': 'SemD'}).sort_index()
    print(semd.dtypes)

    if stats is not None:
        assert len(semd) == len(stats)
        semd = semd.join(stats)
        # Rows with a missing value are dropped only in this branch, i.e.
        # only when the statistics file was available.
        semd = semd[semd.notna().all(axis=1)]

    # Restrict both tables to the terms they have in common.
    in_ground_truth = semd.index.isin(local_gt.index)
    semd = semd[in_ground_truth]

    semd.to_csv(semd_file.with_suffix('.stats_semd'), float_format='%.3f')

    gt_subset = local_gt[local_gt.index.isin(semd.index)]

    assert len(semd) == len(gt_subset)
    print('vocab size:', len(semd))

    return semd.corrwith(gt_subset, axis=0).rename('correlation')


### Correlation between BNC-groundtruth and corpus_w_gt_terms

`corpus_w_gt_terms` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 50
- lowercase: True
- #chunks => 113785

**Term-Document-Matrix:**
- vocab: BNC ground truth vocab
- min-contexts: 1
- min-word-freq: 1
- #terms => 30597

**Entropy Normalization:**
- epsilon: 0.0
- log base: 10

**LSI Projection:**
- dimensions: 300
- implementation: sklearn

**SemD calculation:**
- implementation: semsim

In [10]:
# Correlate the SemD values of the `corpus_w_gt_terms` run with the ground
# truth `gt`; the returned Series is displayed as the cell output.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this directory exists.
semd_correlation(
    '/media/andreas/Raptor/topiclabeling/data/out/SemD/corpus_w_gt_terms',
    'bnc_corpus_w_gt_terms.semd',
    gt,
)

vocab size: 30597


mean_cos         0.153477
SemD             0.208731
BNC_wordcount    0.997916
BNC_contexts     0.995518
BNC_freq         0.997916
lg_BNC_freq      0.988126
Name: correlation, dtype: float64

### Correlation between BNC-groundtruth and bnc_cs1000_lc_filtered

`bnc_cs1000_lc_filtered` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 1
- lowercase: True
- filtered: punct etc.
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: 40
- min-word-freq: 50
- #terms => 

**Entropy Normalization:**
- epsilon: 1.0
- log base: e

**LSI Projection:**
- dimensions: 300
- implementation: gensim

**SemD calculation:**
- implementation: semsim

In [11]:
# Correlate the SemD values of the `bnc_cs1000_lc_filtered` run with the
# ground truth `gt`; the returned Series is displayed as the cell output.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this directory exists.
semd_correlation(
    '/media/andreas/Raptor/topiclabeling/data/out/SemD/bnc_cs1000_lc_filtered',
    'bnc_default.semd',
    gt,
)

vocab size: 31335


mean_cos         0.436173
SemD             0.460770
BNC_wordcount    0.997916
BNC_contexts     0.996900
BNC_freq         0.997916
lg_BNC_freq      0.994125
Name: correlation, dtype: float64

### Correlation between BNC-groundtruth and bnc_cs1000_minsz100_lc_natlog

`bnc_cs1000_minsz100_lc_natlog` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 100
- lowercase: True
- filtered: none
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: 40
- min-word-freq: 50
- #terms => 42067

**Entropy Normalization:**
- epsilon: 0.0
- log base: e

**LSI Projection:**
- dimensions: 300
- implementation: gensim

**SemD calculation:**
- implementation: semsim

In [12]:
# Correlate the SemD values of the `bnc_cs1000_minsz100_lc_natlog` run with
# the ground truth `gt`; the returned Series is displayed as the cell output.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this directory exists.
semd_correlation(
    '/media/andreas/Raptor/topiclabeling/data/out/SemD/bnc_cs1000_minsz100_lc_natlog',
    'bnc_default.semd',
    gt,
)

vocab size: 30597


mean_cos         0.157351
SemD             0.211628
BNC_wordcount    0.997917
BNC_contexts     0.995533
BNC_freq         0.997917
lg_BNC_freq      0.988127
Name: correlation, dtype: float64

### Correlation between BNC-groundtruth and Test-data

Hoffman's test data was calculated on the following parameters:

**Corpus:**
- corpus: ?
- chunk_size: ?
- min-doc-size: ?
- lowercase: True
- filtered: ?
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: ?
- min-word-freq: ?
- #terms => 

**Entropy Normalization:**
- epsilon: ?
- log base: ?

**LSI Projection:**
- dimensions: 300
- implementation: ?

**SemD calculation:**
- implementation: hoffman

In [21]:
# Correlate the SemD test values in `semd_test_values.csv` with the ground
# truth `gt`; the returned Series is displayed as the cell output.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this directory exists.
semd_correlation(
    '/home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD',
    'semd_test_values.csv',
    gt,
)

Cannot find /home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD/entropy_transform.csv
SemD    float64
dtype: object
vocab size: 31739


SemD             0.71442
BNC_contexts         NaN
BNC_freq             NaN
BNC_wordcount        NaN
lg_BNC_freq          NaN
mean_cos             NaN
Name: correlation, dtype: float64