In [1]:
import pandas as pd
import glob
import logging
import os
import numpy as np

In [2]:
# Notebook-wide logging: DEBUG level, timestamped records with logger name and level.
_log_settings = {
    "level": logging.DEBUG,
    "format": '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**_log_settings)

# Logger named after the current module.
logger = logging.getLogger(__name__)

In [3]:
def calc_significance(df, COLUMNS, mode, n_bootstrap=None, alpha=0.05):
    """Write one-sided bootstrap significance tables per aspect/sentiment pair.

    For every value in ``df['aspect']`` and each sentiment in
    ``('positive', 'negative')`` the observed mean of each axis column is
    compared with the bootstrap values read from
    ``{mode}/bootstrap_{aspect}_{sentiment}.tsv`` (any "/" in the aspect is
    replaced by "-" in file names). Bootstrap rows containing NaN are dropped.

    The p-value of an axis is the fraction of bootstrap values strictly above
    the observed mean when that mean is >= 0, and strictly below it otherwise.

    Two tab-separated files are written per pair, both with the columns
    ``axis``, ``diff_a_b`` (observed mean minus bootstrap mean) and ``p``:

    * ``{mode}/significant_axes_{aspect}_{sentiment}.tsv`` - every axis,
      sorted by ascending ``p``;
    * ``{mode}/effect_size_significant_axes_{aspect}_{sentiment}.tsv`` - only
      axes with ``p <= alpha``, sorted by descending ``abs(diff_a_b)``.

    Pairs that have no observed rows, or whose bootstrap file has no complete
    rows, are skipped with a warning instead of being written with a
    meaningless ``p`` of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed data with ``aspect`` and ``sentiment`` columns plus every
        column named in ``COLUMNS``.
    COLUMNS : iterable of str
        Axis column names present in both ``df`` and the bootstrap files.
    mode : str
        Directory holding the bootstrap files; outputs are written there too.
    n_bootstrap : int, optional
        Denominator of the p-value. Defaults to the number of bootstrap rows
        left after dropping NaN rows, so the fraction always matches the
        samples that were actually counted.
    alpha : float, optional
        Threshold for the effect-size file (default 0.05).

    Raises
    ------
    ValueError
        If ``n_bootstrap`` is given and is not positive.
    """
    BOOTSTRAP_TEMPLATE = "{}/bootstrap_{}_{}.tsv"
    OUT_TEMPLATE = "{}/significant_axes_{}_{}.tsv"
    OUT_TEMPLATE2 = "{}/effect_size_significant_axes_{}_{}.tsv"
    RESULT_COLUMNS = ["axis", "diff_a_b", "p"]

    if n_bootstrap is not None and n_bootstrap <= 0:
        raise ValueError("n_bootstrap must be a positive number, got {!r}".format(n_bootstrap))

    log = logging.getLogger(__name__)

    for aspect in df['aspect'].unique():
        # File names cannot contain "/", so it is replaced once per aspect.
        file_aspect = aspect.replace("/", "-")

        for sentiment in ['positive', 'negative']:
            df_actual = df[(df['aspect'] == aspect) & (df['sentiment'] == sentiment)]
            if df_actual.empty:
                # An empty selection has a NaN mean, which would fall into the
                # "negative" branch below and produce a spurious p of 0.
                log.warning("No rows for aspect %r / sentiment %r; skipping.", aspect, sentiment)
                continue

            df_bootstrap = pd.read_csv(
                BOOTSTRAP_TEMPLATE.format(mode, file_aspect, sentiment), sep="\t"
            ).dropna()
            if df_bootstrap.empty:
                log.warning("Bootstrap table for aspect %r / sentiment %r has no complete rows; skipping.",
                            aspect, sentiment)
                continue

            # Use the rows actually loaded rather than a notebook-level global,
            # unless the caller pins the denominator explicitly.
            n_samples = len(df_bootstrap) if n_bootstrap is None else n_bootstrap

            results = []
            for axis in COLUMNS:
                actual = df_actual[axis].mean()
                bootstrap_values = df_bootstrap[axis]

                # One-sided test in the direction of the observed mean.
                if actual >= 0:
                    n_extreme = (bootstrap_values > actual).sum()
                else:
                    n_extreme = (bootstrap_values < actual).sum()

                results.append([axis, actual - bootstrap_values.mean(), n_extreme / float(n_samples)])

            # Python's stable sort keeps the COLUMNS order among ties.
            by_p = pd.DataFrame(sorted(results, key=lambda row: row[2]), columns=RESULT_COLUMNS)
            by_p.to_csv(OUT_TEMPLATE.format(mode, file_aspect, sentiment), sep="\t", index=False)

            by_effect = pd.DataFrame(sorted(results, key=lambda row: abs(row[1]), reverse=True),
                                     columns=RESULT_COLUMNS)
            by_effect[by_effect["p"] <= alpha].to_csv(
                OUT_TEMPLATE2.format(mode, file_aspect, sentiment), sep="\t", index=False
            )

In [4]:
# Settings for the "average" run.
mode = "average"
N = 1000  # presumably the number of bootstrap resamples - TODO confirm against the bootstrap files

# Load the tab-separated table for this mode and discard rows with missing values.
df = pd.read_csv("big_table_by_{}.tsv".format(mode), sep="\t")
df = df.dropna()

# Axis columns are the ones whose header contains an opening parenthesis.
COLUMNS = list(filter(lambda column_name: '(' in column_name, df.columns))
len(COLUMNS)

1621

In [5]:
# Run the significance analysis on the table and axis columns currently loaded.
calc_significance(df=df, COLUMNS=COLUMNS, mode=mode)

2021-11-23 21:29:10 numexpr.utils INFO     NumExpr defaulting to 4 threads.


In [6]:
# Settings for the "second_moment" run.
mode = "second_moment"
N = 1000  # presumably the number of bootstrap resamples - TODO confirm against the bootstrap files

# Load the tab-separated table for this mode and discard rows with missing values.
df = pd.read_csv("big_table_by_{}_with_corpus_mean.tsv".format(mode), sep="\t")
df = df.dropna()

# Axis columns are the ones whose header contains an opening parenthesis.
COLUMNS = list(filter(lambda column_name: '(' in column_name, df.columns))
len(COLUMNS)

1621

In [7]:
# Run the significance analysis on the table and axis columns currently loaded.
calc_significance(df=df, COLUMNS=COLUMNS, mode=mode)