In [1]:
import json
from scipy import stats

In [2]:
# Significance level: `summary` passes this as `alpha` to the reject-null helpers,
# which compare it against the halved (one-sided) p-value.
ALPHA = 0.05

In [3]:
def less_than_reject_null(t, p, alpha):
    """Decide a one-sided "less than" test from a t statistic and a p-value.

    Args:
        t: t statistic; must be negative for the alternative to be supported.
        p: p-value that is halved before being compared to ``alpha`` (the
            caller in this notebook passes the two-sided p-value).
        alpha: significance level.

    Returns:
        True when ``p / 2 < alpha`` and ``t < 0``, otherwise False.
    """
    one_sided_p = p / 2
    # Both conditions are required: a small p-value with the wrong sign of t
    # is evidence for the opposite direction, not for this alternative.
    return bool(one_sided_p < alpha and t < 0)

def greater_than_reject_null(t, p, alpha):
    """Decide a one-sided "greater than" test from a t statistic and a p-value.

    Args:
        t: t statistic; must be positive for the alternative to be supported.
        p: p-value that is halved before being compared to ``alpha`` (the
            caller in this notebook passes the two-sided p-value).
        alpha: significance level.

    Returns:
        True when ``p / 2 < alpha`` and ``t > 0``, otherwise False.
    """
    one_sided_p = p / 2
    # Both conditions are required: a small p-value with the wrong sign of t
    # is evidence for the opposite direction, not for this alternative.
    return bool(one_sided_p < alpha and t > 0)

In [4]:
# Comparisons evaluated from summary statistics. Each entry carries:
#   experiment    - label printed by `summary`
#   research_hypo - statement printed when the null hypothesis is rejected
#   null_hypo     - statement printed when it is not
#   exp_1 / exp_2 - mean, std and number of observations ("obs") of each arm
#   test_type     - "greater" tests exp_1 > exp_2, "less" tests exp_1 < exp_2
data = [
    dict(
        experiment="CoNLL CRF vs Greedy",
        research_hypo="The CRF is stronger than the greedy tagger",
        null_hypo="There is no difference between a CRF and a greedy tagger",
        exp_1=dict(note="CRF", mean=91.61, std=0.25, obs=10),
        exp_2=dict(note="Greedy", mean=90.28, std=0.21, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="CoNLL CD vs Greedy",
        research_hypo="The CD is stronger than the greedy tagger",
        null_hypo="There is no difference between a CD and a greedy tagger",
        exp_1=dict(note="CD", mean=91.44, std=0.23, obs=10),
        exp_2=dict(note="Greedy", mean=90.28, std=0.21, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="CoNLL-CRF",
        research_hypo="The Constraints+CRF is stronger than unconstrained",
        null_hypo="There is no difference between a constrained and unconstrained CRF",
        exp_1=dict(note="Constrained CRF", mean=91.61, std=0.25, obs=10),
        exp_2=dict(note="Unconstrained CRF", mean=91.55, std=0.26, obs=10),
        test_type="greater",
    ),
    # The only "less" test: the research hypothesis is about being faster,
    # so a smaller exp_1 mean supports it.
    dict(
        experiment="CoNLL-Convergence",
        research_hypo="The Constraints+CRF is faster than unconstrained",
        null_hypo="There is no difference between a constrained and unconstrained CRF convergence speed",
        exp_1=dict(note="Constrained CRF", mean=60.6, std=23.3, obs=30),
        exp_2=dict(note="Unconstrained CRF", mean=72.4, std=21.0, obs=30),
        test_type="less",
    ),
    dict(
        experiment="CoNLL",
        research_hypo="The CRF is stronger than CD",
        null_hypo="There is no difference between the CRF and CD",
        exp_1=dict(note="CRF", mean=91.61, std=0.25, obs=10),
        exp_2=dict(note="CD", mean=91.44, std=0.23, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="WNUT-17",
        research_hypo="The CRF is stronger than CD",
        null_hypo="There is no difference between the CRF and CD",
        exp_1=dict(note="CRF", mean=40.33, std=1.13, obs=10),
        exp_2=dict(note="CD", mean=40.59, std=1.06, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="OntoNotes",
        research_hypo="The CRF is stronger than CD",
        null_hypo="There is no difference between the CRF and CD",
        exp_1=dict(note="CRF", mean=87.43, std=0.25, obs=10),
        exp_2=dict(note="CD", mean=86.13, std=0.17, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="OntoNotes-Estimate",
        research_hypo="The CRF is stronger than Estimated",
        null_hypo="There is no difference between the CRF and Estimated",
        exp_1=dict(note="CRF", mean=87.43, std=0.25, obs=10),
        exp_2=dict(note="Estimated", mean=86.21, std=0.52, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="Snips",
        research_hypo="The CRF is stronger than CD",
        null_hypo="There is no difference between the CRF and CD",
        exp_1=dict(note="CRF", mean=96.04, std=0.28, obs=10),
        exp_2=dict(note="CD", mean=96.07, std=0.17, obs=10),
        test_type="greater",
    ),
]

In [5]:
def summary(experiments):
    """Run a one-sided Welch's t-test per experiment and print the outcome.

    For every experiment the t statistic and p-value are computed from the
    summary statistics of its two arms (``equal_var=False``, i.e. Welch's
    t-test), then the direction named by ``test_type`` decides whether the
    null hypothesis is rejected at the notebook-level ``ALPHA``.

    Args:
        experiments: iterable of dicts with the keys ``experiment``,
            ``research_hypo``, ``null_hypo``, ``test_type`` and the two arms
            ``exp_1`` / ``exp_2``, each holding ``mean``, ``std`` and ``obs``.

    Raises:
        ValueError: if an experiment's ``test_type`` is neither ``'greater'``
            nor ``'less'``. Previously any other value was silently evaluated
            as a ``'less'`` test, so a typo produced a verdict for the wrong
            hypothesis.
    """
    decision_rules = {
        'greater': greater_than_reject_null,
        'less': less_than_reject_null,
    }
    for exp in experiments:
        test_type = exp['test_type']
        if test_type not in decision_rules:
            raise ValueError(
                f"Unknown test_type {test_type!r} for experiment "
                f"{exp.get('experiment')!r}; expected 'greater' or 'less'"
            )
        first_arm, second_arm = exp['exp_1'], exp['exp_2']
        t, p = stats.ttest_ind_from_stats(
            first_arm['mean'],
            first_arm['std'],
            first_arm['obs'],
            second_arm['mean'],
            second_arm['std'],
            second_arm['obs'],
            equal_var=False
        )
        reject_null = decision_rules[test_type](t, p, ALPHA)
        # NOTE: this header is printed for every experiment; it is a section
        # label, not the verdict. The verdict is the last line printed below.
        print(f"Statistically Significant: {exp['experiment']}")
        print(f"\tt: {t}")
        # NOTE: the printed p is the two-sided value returned by scipy; the
        # decision rules compare p / 2 against ALPHA.
        print(f"\tp: {p}")
        if reject_null:
            print(f"\tWe reject the null hypothesis, therefore:\n\t\t \"{exp['research_hypo']}\"")
        else:
            print(f"\tWe cannot reject the null hypothesis, therefore:\n\t\t \"{exp['null_hypo']}\"")

In [6]:
# Print t, the two-sided p-value and the one-sided verdict for each comparison in `data`.
summary(data)

Statistically Significant: CoNLL CRF vs Greedy
	t: 12.881695785258227
	p: 2.3521523222767477e-10
	We reject the null hypothesis, therefore:
		 "The CRF is stronger than the greedy tagger"
Statistically Significant: CoNLL CD vs Greedy
	t: 11.778015515549948
	p: 7.511387036769597e-10
	We reject the null hypothesis, therefore:
		 "The CD is stronger than the greedy tagger"
Statistically Significant: CoNLL-CRF
	t: 0.5260325302730605
	p: 0.6052966569990543
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between a constrained and unconstrained CRF"
Statistically Significant: CoNLL-Convergence
	t: -2.0604845450988285
	p: 0.04389586065601297
	We reject the null hypothesis, therefore:
		 "The Constraints+CRF is faster than unconstrained"
Statistically Significant: CoNLL
	t: 1.5825083745853075
	p: 0.13106455323984742
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CRF and CD"
Statistically Significant: WNUT-17
	t: -0.530667502

In [7]:
# Reversed-direction comparisons: here the CD / Estimated arm is exp_1 and the
# CRF is exp_2, so "greater" tests whether CD / Estimated beats the CRF.
# Entry layout: experiment label, the two hypothesis statements printed by
# `summary`, mean / std / number of observations ("obs") per arm, test_type.
data = [
    dict(
        experiment="WNUT-17",
        research_hypo="The CD is stronger than CRF",
        null_hypo="There is no difference between the CD and CRF",
        exp_1=dict(note="CD", mean=40.59, std=1.06, obs=10),
        exp_2=dict(note="CRF", mean=40.33, std=1.13, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="WNUT-17 Estimated",
        research_hypo="The Estimated is stronger than CRF",
        null_hypo="There is no difference between the Estimated and CRF",
        exp_1=dict(note="Estimated", mean=40.67, std=0.91, obs=10),
        exp_2=dict(note="CRF", mean=40.33, std=1.13, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="Snips",
        research_hypo="The CD is stronger than CRF",
        null_hypo="There is no difference between the CD and CRF",
        exp_1=dict(note="CD", mean=96.07, std=0.17, obs=10),
        exp_2=dict(note="CRF", mean=96.04, std=0.28, obs=10),
        test_type="greater",
    ),
    dict(
        experiment="Snips Estimated",
        research_hypo="The Estimated is stronger than CRF",
        null_hypo="There is no difference between the Estimated and CRF",
        exp_1=dict(note="Estimated", mean=96.15, std=0.15, obs=10),
        exp_2=dict(note="CRF", mean=96.04, std=0.28, obs=10),
        test_type="greater",
    ),
]

In [8]:
# Print t, the two-sided p-value and the one-sided verdict for each comparison in `data`.
summary(data)

Statistically Significant: WNUT-17
	t: 0.5306675026169321
	p: 0.6021619679399539
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CD and CRF"
Statistically Significant: WNUT-17 Estimated
	t: 0.7410591380455214
	p: 0.46864519977396
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the Estimated and CRF"
Statistically Significant: Snips
	t: 0.28961522824021435
	p: 0.7761173467554696
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CD and CRF"
Statistically Significant: Snips Estimated
	t: 1.095083163863816
	p: 0.2922615154800935
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the Estimated and CRF"


In [9]:
# Internal datasets, CRF as exp_1 and CD as exp_2 ("greater": CRF > CD).
# Each arm is read from a JSON file; the files are presumably dicts with the
# mean / std / obs keys that `summary` reads -- verify against the data files.
internal = []
for experiment_name, file_slug in [
    ("Generic NER", "generic-ner"),
    ("Customer Service", "customer-service"),
    ("Automotive", "automotive"),
    ("CyberSecurity", "cyber-security"),
]:
    # `with` closes each file deterministically; json.load(open(...)) left
    # the handles open until garbage collection.
    with open(f"../data/{file_slug}-crf.json") as crf_file:
        crf_stats = json.load(crf_file)
    with open(f"../data/{file_slug}-cd.json") as cd_file:
        cd_stats = json.load(cd_file)
    internal.append({
        "experiment": experiment_name,
        "research_hypo": "The CRF is stronger than CD",
        "null_hypo": "There is no difference between the CRF and CD",
        "exp_1": crf_stats,
        "exp_2": cd_stats,
        "test_type": "greater",
    })

In [10]:
# Print t, the two-sided p-value and the one-sided verdict for each comparison in `internal`.
summary(internal)

Statistically Significant: Generic NER
	t: -2.87448627445784
	p: 0.010094852740935426
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CRF and CD"
Statistically Significant: Customer Service
	t: -0.842348324465564
	p: 0.41181676329953576
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CRF and CD"
Statistically Significant: Automotive
	t: 1.1382776818420675
	p: 0.27260959180065747
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CRF and CD"
Statistically Significant: CyberSecurity
	t: -5.6574429838697755
	p: 2.304372564336198e-05
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CRF and CD"


In [11]:
# Internal datasets with the direction reversed: CD as exp_1 and CRF as exp_2
# ("greater": CD > CRF).
# Each arm is read from a JSON file; the files are presumably dicts with the
# mean / std / obs keys that `summary` reads -- verify against the data files.
internal = []
for experiment_name, file_slug in [
    ("Generic NER", "generic-ner"),
    ("Customer Service", "customer-service"),
    ("Automotive", "automotive"),
    ("CyberSecurity", "cyber-security"),
]:
    # `with` closes each file deterministically; json.load(open(...)) left
    # the handles open until garbage collection.
    with open(f"../data/{file_slug}-cd.json") as cd_file:
        cd_stats = json.load(cd_file)
    with open(f"../data/{file_slug}-crf.json") as crf_file:
        crf_stats = json.load(crf_file)
    internal.append({
        "experiment": experiment_name,
        "research_hypo": "The CD is stronger than CRF",
        "null_hypo": "There is no difference between the CD and CRF",
        "exp_1": cd_stats,
        "exp_2": crf_stats,
        "test_type": "greater",
    })

In [12]:
# Print t, the two-sided p-value and the one-sided verdict for each comparison in `internal`.
summary(internal)

Statistically Significant: Generic NER
	t: 2.87448627445784
	p: 0.010094852740935426
	We reject the null hypothesis, therefore:
		 "The CD is stronger than CRF"
Statistically Significant: Customer Service
	t: 0.842348324465564
	p: 0.41181676329953576
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CD and CRF"
Statistically Significant: Automotive
	t: -1.1382776818420675
	p: 0.27260959180065747
	We cannot reject the null hypothesis, therefore:
		 "There is no difference between the CD and CRF"
Statistically Significant: CyberSecurity
	t: 5.6574429838697755
	p: 2.304372564336198e-05
	We reject the null hypothesis, therefore:
		 "The CD is stronger than CRF"
