In [1]:
from pathlib import Path
import pandas as pd

In [2]:
def asint(x):
    """Convert ``x`` to ``int``, falling back to 0 when it cannot be converted.

    Parameters
    ----------
    x : object
        Any value accepted by ``int()``, e.g. a numeric string or a float.

    Returns
    -------
    int
        ``int(x)``, or ``0`` if the conversion raises ``TypeError``
        (e.g. ``None``), ``ValueError`` (e.g. ``''``, ``'NA'``, NaN) or
        ``OverflowError`` (e.g. infinity).
    """
    try:
        return int(x)
    # Only conversion failures are mapped to 0; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return 0

In [31]:
# Load the reference table, sorted by and indexed on the `term` column.
gt_file = '../data/out/SemD/13428_2012_278_MOESM1_ESM.csv'
gt = (
    pd.read_csv(gt_file, keep_default_na=False)
    .sort_values('term')
    .set_index('term')
)
# Entries that `asint` cannot convert to an integer become 0.
gt['BNC_contexts'] = gt['BNC_contexts'].map(asint).astype(int)
# Keep only the rows that contain no missing value.
gt = gt[gt.notna().all(axis=1)]
gt

Unnamed: 0_level_0,mean_cos,SemD,BNC_wordcount,BNC_contexts,BNC_freq,lg_BNC_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa,0.020000,1.69,577,314,6.6,0.88
aah,0.085245,1.07,92,58,1.1,0.31
aback,0.025864,1.59,294,293,3.4,0.64
abacus,0.022685,1.64,51,40,0.6,0.20
abandon,0.008199,2.09,1257,1193,14.4,1.19
...,...,...,...,...,...,...
zoom,0.051591,1.29,241,161,2.8,0.58
zoomed,0.029300,1.53,64,59,0.7,0.24
zooming,0.023874,1.62,61,54,0.7,0.23
zoos,0.079977,1.10,144,87,1.7,0.42


In [32]:
# Load the second reference table (tab-separated); only the literal 'NA' is
# treated as missing because the pandas default NA markers are disabled.
gt2_file = '../tmp/Psycho-Paper/semD/gt_terms_from_hoffman_new_semd.semd'
gt2 = (
    pd.read_csv(gt2_file, sep='\t', keep_default_na=False, na_values='NA')
    .sort_values('term')
    .set_index('term')
)
# Keep only the rows that contain no missing value.
gt2 = gt2[gt2.notna().all(axis=1)]
gt2

Unnamed: 0_level_0,mean_cos,SemD
term,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,0.025144,1.599565
aback,0.056398,1.248734
abacus,0.053545,1.271280
abandon,0.013255,1.877634
abandoned,0.011400,1.943106
...,...,...
zoology,0.077217,1.112288
zoom,0.075840,1.120104
zooming,0.088965,1.050780
zoos,0.169562,0.770671


In [39]:
def semd_correlation(directory, file_name, local_gt=gt):
    """Correlate a computed SemD table with a reference table, column by column.

    Reads the tab-separated file ``file_name`` from ``directory`` (first
    column is used as the term index). If ``entropy_transform.csv`` exists in
    the same directory and has as many rows as the SemD table, its frequency
    columns are renamed to the ``BNC_*`` names and joined in. Rows with
    missing values and terms that do not occur in ``local_gt`` are dropped.
    The filtered table is written next to the input file with the suffix
    ``.stats_semd`` and shown side by side with the reference rows.

    Parameters
    ----------
    directory : str or pathlib.Path
        Directory containing ``file_name`` and, optionally,
        ``entropy_transform.csv``.
    file_name : str
        Name of the SemD file inside ``directory``.
    local_gt : pandas.DataFrame
        Reference table indexed by term. The default is the ``gt`` object
        that existed when this function was defined.

    Returns
    -------
    pandas.Series
        Result of ``DataFrame.corrwith`` between the filtered SemD table and
        the matching reference rows, named ``'correlation'``. Columns that
        exist in only one of the two tables come out as NaN.

    Raises
    ------
    AssertionError
        If the filtered SemD table and the filtered reference table differ
        in length.
    """
    base_dir = Path(directory)
    stats_path = base_dir / 'entropy_transform.csv'
    bnc_names = {
        'context_freq': 'BNC_contexts',
        'corpus_freq': 'BNC_wordcount',
        'freq': 'BNC_freq',
        'log_freq': 'lg_BNC_freq',
    }

    # The statistics file is optional; without it only the SemD columns are compared.
    try:
        stats = (
            pd.read_csv(stats_path, sep='\t', keep_default_na=False)
            .sort_values('term')
            .rename(columns=bnc_names)
            .set_index('term')
        )
        # Keep only columns that also exist in the reference, in reference order.
        shared = [col for col in local_gt.columns if col in stats.columns]
        stats = stats[shared]
    except FileNotFoundError:
        print(f"Cannot find {stats_path}")
        stats = None

    semd_path = base_dir / file_name
    semd = pd.read_csv(
        semd_path,
        sep='\t',
        index_col=0,
        keep_default_na=False,
        na_values=['', 'NA'],
    )
    print('vocab size:', len(semd))
    semd = (
        semd
        .rename_axis('term')
        .rename(columns={'semd': 'SemD'})
        .sort_index()
    )

    # Statistics are only attached when both tables have the same number of rows.
    stats_usable = stats is not None and len(stats) == len(semd)
    if stats_usable:
        semd = semd.join(stats)

    complete_rows = semd.notna().all(axis=1)
    semd = semd[complete_rows]
    semd = semd[semd.index.isin(local_gt.index)]
    semd.to_csv(semd_path.with_suffix('.stats_semd'), float_format='%.3f')

    # Restrict the reference to the terms that survived the filtering above.
    reference = local_gt[local_gt.index.isin(semd.index)]

    assert len(semd) == len(reference)
    print('vocab size:', len(semd))
    display(semd.join(reference, rsuffix='_gt'))

    return semd.corrwith(reference, axis=0).rename('correlation')


----

### Correlation BNC-groundtruth => BNC-new / Sklearn-LSI / SemSim-SemD

`bnc_w_gt_terms` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 50
- lowercase: True
- #chunks => 113785

**Term-Document-Matrix:**
- vocab: BNC ground truth vocab
- min-contexts: 1
- min-word-freq: 1
- #terms => 30597

**Entropy Normalization:**
- epsilon: 0.0
- log-base: 10

**LSI Projection:**
- dimensions: 300
- implementation: sklearn

**SemD calculation:**
- implementation: semsim

**Correlation: 0.46**

In [40]:
# Correlate the `bnc_w_gt_terms` SemD values with the `gt` reference table.
semd_correlation(
    directory='../data/out/SemD/bnc_w_gt_terms',
    file_name='bnc_w_gt_terms.semd',
    local_gt=gt,
)

vocab size: 30597
vocab size: 29518


Unnamed: 0_level_0,mean_cos,SemD,BNC_wordcount,BNC_contexts,BNC_freq,lg_BNC_freq,mean_cos_gt,SemD_gt,BNC_wordcount_gt,BNC_contexts_gt,BNC_freq_gt,lg_BNC_freq_gt
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
aa,0.613469,0.212207,481,268,5.508624,0.741043,0.020000,1.69,577,314,6.6,0.88
aah,0.652629,0.185333,219,127,2.508084,0.399342,0.085245,1.07,92,58,1.1,0.31
aback,0.751403,0.124127,303,299,3.470089,0.540341,0.025864,1.59,294,293,3.4,0.64
abacus,0.683831,0.165051,53,42,0.606979,-0.216826,0.022685,1.64,51,40,0.6,0.20
abandon,0.626641,0.202982,1294,1226,14.819458,1.170832,0.008199,2.09,1257,1193,14.4,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...
zoom,0.602654,0.219932,259,176,2.966182,0.472198,0.051591,1.29,241,161,2.8,0.58
zoomed,0.647750,0.188592,66,62,0.755861,-0.121558,0.029300,1.53,64,59,0.7,0.24
zooming,0.734565,0.133970,69,62,0.790218,-0.102253,0.023874,1.62,61,54,0.7,0.23
zoos,0.697206,0.156639,116,68,1.328483,0.123356,0.079977,1.10,144,87,1.7,0.42


mean_cos         0.457673
SemD             0.462075
BNC_wordcount    0.997916
BNC_contexts     0.995516
BNC_freq         0.997916
lg_BNC_freq      0.988179
Name: correlation, dtype: float64

In [41]:
# Correlate the `bnc_w_gt_terms` SemD values with the `gt2` reference table.
semd_correlation(
    directory='../data/out/SemD/bnc_w_gt_terms',
    file_name='bnc_w_gt_terms.semd',
    local_gt=gt2,
)

vocab size: 30597
vocab size: 26811


Unnamed: 0_level_0,mean_cos,SemD,mean_cos_gt,SemD_gt
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa,0.613469,0.212207,0.025144,1.599565
aback,0.751403,0.124127,0.056398,1.248734
abacus,0.683831,0.165051,0.053545,1.271280
abandon,0.626641,0.202982,0.013255,1.877634
abandoned,0.629700,0.200866,0.011400,1.943106
...,...,...,...,...
zoology,0.654296,0.184226,0.077217,1.112288
zoom,0.602654,0.219932,0.075840,1.120104
zooming,0.734565,0.133970,0.088965,1.050780
zoos,0.697206,0.156639,0.169562,0.770671


mean_cos    0.298594
SemD        0.321411
Name: correlation, dtype: float64

-----

### Correlation BNC-groundtruth => BNC-new/filtered / Gensim-LSI / SemSim-SemD

`bnc_cs1000_lc_filtered` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 1
- lowercase: True
- filtered: punct etc.
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: 40
- min-word-freq: 50
- #terms => 

**Entropy Normalization:**
- epsilon: 1.0
- log base: e

**LSI Projection:**
- dimensions: 300
- implementation: gensim

**SemD calculation:**
- implementation: semsim

**Correlation: 0.43**

In [42]:
# Correlate the `bnc_cs1000_lc_filtered` SemD values with the `gt` reference table.
semd_correlation(
    directory='../data/out/SemD/bnc_cs1000_lc_filtered',
    file_name='bnc_cs1000_lc_filtered.semd',
    local_gt=gt,
)

vocab size: 30597
vocab size: 30566


Unnamed: 0_level_0,mean_cos,SemD,mean_cos_gt,SemD_gt,BNC_wordcount,BNC_contexts,BNC_freq,lg_BNC_freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aa,0.590655,0.228666,0.020000,1.69,577,314,6.6,0.88
aah,0.719699,0.142849,0.085245,1.07,92,58,1.1,0.31
aback,0.720500,0.142366,0.025864,1.59,294,293,3.4,0.64
abacus,0.636651,0.196098,0.022685,1.64,51,40,0.6,0.20
abandon,0.608573,0.215687,0.008199,2.09,1257,1193,14.4,1.19
...,...,...,...,...,...,...,...,...
zoom,0.599449,0.222248,0.051591,1.29,241,161,2.8,0.58
zoomed,0.619235,0.208145,0.029300,1.53,64,59,0.7,0.24
zooming,0.639070,0.194451,0.023874,1.62,61,54,0.7,0.23
zoos,0.657189,0.182310,0.079977,1.10,144,87,1.7,0.42


mean_cos         0.418355
SemD             0.425845
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
Name: correlation, dtype: float64

In [43]:
# Correlate the `bnc_cs1000_lc_filtered` SemD values with the `gt2` reference table.
semd_correlation(
    directory='../data/out/SemD/bnc_cs1000_lc_filtered',
    file_name='bnc_cs1000_lc_filtered.semd',
    local_gt=gt2,
)

vocab size: 30597
vocab size: 27371


Unnamed: 0_level_0,mean_cos,SemD,mean_cos_gt,SemD_gt
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa,0.590655,0.228666,0.025144,1.599565
aback,0.720500,0.142366,0.056398,1.248734
abacus,0.636651,0.196098,0.053545,1.271280
abandon,0.608573,0.215687,0.013255,1.877634
abandoned,0.609739,0.214856,0.011400,1.943106
...,...,...,...,...
zoology,0.639991,0.193826,0.077217,1.112288
zoom,0.599449,0.222248,0.075840,1.120104
zooming,0.639070,0.194451,0.088965,1.050780
zoos,0.657189,0.182310,0.169562,0.770671


mean_cos    0.226680
SemD        0.233791
Name: correlation, dtype: float64

----

### Correlation BNC-groundtruth => BNC-new / gensim-LSI / SemSim-SemD

`bnc_cs1000_minsz100_lc_natlog` was calculated with the following parameters:

**Corpus:**
- corpus: BNC
- chunk_size: 1000
- min-doc-size: 100
- lowercase: True
- filtered: none
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: 40
- min-word-freq: 50
- #terms => 42067

**Entropy Normalization:**
- epsilon: 0.0
- log base: e

**LSI Projection:**
- dimensions: 300
- implementation: gensim

**SemD calculation:**
- implementation: semsim

**Correlation: 0.21**

In [12]:
# Correlate `bnc_default.semd` from the `bnc_cs1000_minsz100_lc_natlog` run with `gt`.
semd_correlation(
    directory='../data/out/SemD/bnc_cs1000_minsz100_lc_natlog',
    file_name='bnc_default.semd',
    local_gt=gt,
)

vocab size: 30597


mean_cos         0.157351
SemD             0.211628
BNC_wordcount    0.997917
BNC_contexts     0.995533
BNC_freq         0.997917
lg_BNC_freq      0.988127
Name: correlation, dtype: float64

----

### Correlation BNC-groundtruth => Hoffman Test-Data

Hoffman's test data was calculated with the following parameters:

**Corpus:**
- corpus: ?
- chunk_size: ?
- min-doc-size: ?
- lowercase: True
- filtered: ?
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: ?
- min-word-freq: ?
- #terms => 

**Entropy Normalization:**
- epsilon: ?
- log base: ?

**LSI Projection:**
- dimensions: 300
- implementation: hoffman?

**SemD calculation:**
- implementation: **hoffman**

**Correlation: 0.71**

In [21]:
# Correlate `semd_test_values.csv` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='semd_test_values.csv',
    local_gt=gt,
)

Cannot find /home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD/entropy_transform.csv
SemD    float64
dtype: object
vocab size: 31739


SemD             0.71442
BNC_contexts         NaN
BNC_freq             NaN
BNC_wordcount        NaN
lg_BNC_freq          NaN
mean_cos             NaN
Name: correlation, dtype: float64

----

### Correlation BNC-groundtruth => Test-data / hoffman-LSI / SemSim-SemD

`semd_from_hoffmann_V_gt_terms` was calculated with the following parameters:

**Corpus:**
- corpus: ?
- chunk_size: ?
- min-doc-size: ?
- lowercase: True
- filtered: ?
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: ?
- min-word-freq: ?
- #terms => 

**Entropy Normalization:**
- epsilon: ?
- log base: ?

**LSI Projection:**
- dimensions: 300
- implementation: hoffman?

**SemD calculation:**
- implementation: **semsim**

**Correlation: 0.65**

In [22]:
# Correlate `semd_from_hoffmann_V_gt_terms.semd` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='semd_from_hoffmann_V_gt_terms.semd',
    local_gt=gt,
)

Cannot find /home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD/entropy_transform.csv
mean_cos    float64
SemD        float64
dtype: object
vocab size: 30597


mean_cos         0.562441
SemD             0.654494
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
Name: correlation, dtype: float64

----

### Correlation BNC-groundtruth => Test-data / sklearn-LSI / Hoffman-SemD

`semd_from_hoffmann_V_newlsi_gt_term`

The LSI matrix was recalculated with scikit-learn

**Corpus:**
- corpus: ?
- chunk_size: ?
- min-doc-size: ?
- lowercase: True
- filtered: ?
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: ?
- min-word-freq: ?
- #terms => 

**Entropy Normalization:**
- epsilon: ?
- log base: ?

**LSI Projection:**
- dimensions: 300
- implementation: **sklearn**

**SemD calculation:**
- implementation: hoffman

**Correlation: 0.41**

In [23]:
# Correlate `semd_test_newlsi_values.csv` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='semd_test_newlsi_values.csv',
    local_gt=gt,
)

Cannot find /home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD/entropy_transform.csv
SemD    float64
dtype: object
vocab size: 31739


SemD             0.407025
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
mean_cos              NaN
Name: correlation, dtype: float64

### Correlation BNC-groundtruth => Test-data / sklearn-LSI / Semsim-SemD

`semd_from_semsim_V_newlsi_gt_terms`

The LSI matrix was recalculated with scikit-learn

**Corpus:**
- corpus: ?
- chunk_size: ?
- min-doc-size: ?
- lowercase: True
- filtered: ?
- #chunks: 

**Term-Document-Matrix:**
- vocab: none
- min-contexts: ?
- min-word-freq: ?
- #terms => 

**Entropy Normalization:**
- epsilon: ?
- log base: ?

**LSI Projection:**
- dimensions: 300
- implementation: **sklearn**

**SemD calculation:**
- implementation: semsim

**Correlation: 0.44**

In [24]:
# Correlate `semd_from_semsim_V_newlsi_gt_terms.semd` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='semd_from_semsim_V_newlsi_gt_terms.semd',
    local_gt=gt,
)

Cannot find /home/andreas/Workspace/github/semsim/tmp/Psycho-Paper/semD/entropy_transform.csv
mean_cos    float64
SemD        float64
dtype: object
vocab size: 30597


mean_cos         0.426320
SemD             0.441224
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
Name: correlation, dtype: float64

In [20]:
# Correlate `gt_terms_w_V.semd` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='gt_terms_w_V.semd',
    local_gt=gt,
)

Cannot find ../tmp/Psycho-Paper/semD/entropy_transform.csv
mean_cos    float64
SemD        float64
dtype: object
vocab size: 30597


mean_cos         0.595585
SemD             0.717737
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
Name: correlation, dtype: float64

In [21]:
# Correlate `gt_terms_from_hoffman_new_semd.semd` with the `gt` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='gt_terms_from_hoffman_new_semd.semd',
    local_gt=gt,
)

Cannot find ../tmp/Psycho-Paper/semD/entropy_transform.csv
mean_cos    float64
SemD        float64
dtype: object
vocab size: 31739


mean_cos         0.595958
SemD             0.714420
BNC_contexts          NaN
BNC_freq              NaN
BNC_wordcount         NaN
lg_BNC_freq           NaN
Name: correlation, dtype: float64

In [22]:
# Correlate `gt_terms_w_V.semd` with the `gt2` reference table.
semd_correlation(
    directory='../tmp/Psycho-Paper/semD',
    file_name='gt_terms_w_V.semd',
    local_gt=gt2,
)

Cannot find ../tmp/Psycho-Paper/semD/entropy_transform.csv
mean_cos    float64
SemD        float64
dtype: object
vocab size: 30597


mean_cos    0.999992
SemD        0.999966
Name: correlation, dtype: float64

----

### Fragen:

- Was ist der Unterschied der Implementierungen: Hoffman vs. Funke? -> resolved
- Warum ist die Korrelation unterschiedlich?
- Kann die Randomisierung eine Rolle spielen?
- Warum ist die Vocab-Größe unterschiedlich? -> Frage unklar
- Sind die NaN values korrekt?