# Chi-squared Test
The chi-squared test is used to determine whether there is a statistically significant difference between the expected frequencies and the observed frequencies in one or more categories of a contingency table. We use this test to compare the proportions of A and B results from our tests across different network types.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

%load_ext lab_black

In [3]:
# Every network kind for which a results file exists under data/
NETWORKKINDS = [
    "complete",
    "cycle",
    "random",
    "star",
    "wattsstrogatz",
    "barabasialbert",
]

# Map each network kind to the DataFrame read from data/<kind>.csv
results = {kind: pd.read_csv(f"data/{kind}.csv") for kind in NETWORKKINDS}

## Calculate Proportions

In [7]:
# Network kinds included in the proportions table.
# NOTE(review): "random" is loaded into `results` but is not listed here —
# confirm that leaving it out is intentional.
KINDS = ["complete", "cycle", "star", "wattsstrogatz", "barabasialbert"]

# Columns identifying one experiment configuration, plus the recorded action
GROUP_COLUMNS = ["density", "size", "trials", "epsilon", "action"]

# Columns that come out of the reshaping as non-integers and are cast back
INTEGER_COLUMNS = ["A", "B", "size", "trials"]


def _action_counts(raw, kind):
    """Return one row per configuration with the A and B counts side by side.

    Parameters
    ----------
    raw : pandas.DataFrame
        Raw results of a single network kind; must contain GROUP_COLUMNS.
    kind : str
        Network kind, stored in the added "network" column.

    Returns
    -------
    pandas.DataFrame
        Columns density, size, trials, epsilon, one column per action value
        and "network".
    """
    # count() yields one count per remaining column, so after unstack() every
    # counted column contributes its own set of action columns.
    counts = raw.groupby(GROUP_COLUMNS).count()

    # Put the action counts on a single row per configuration and remove the
    # repeated copies of the action columns.
    # NOTE(review): drop_duplicates() on the transpose compares column
    # *values*, not names, so two different columns that happen to hold
    # identical values would also be merged — confirm this cannot occur.
    wide = (
        counts.unstack()
        .fillna(0)
        .droplevel(0, axis=1)
        .reset_index()
        .rename_axis(None, axis=1)
        .T.drop_duplicates()
        .T
    )

    # Convert the count and size columns back to integers because the
    # reshaping above leaves them as non-integer values
    for column in INTEGER_COLUMNS:
        wide[column] = wide[column].astype(int)

    # Add the network type column
    wide["network"] = kind
    return wide


# Build every per-network table first and concatenate once, instead of
# re-copying the accumulated frame on every loop iteration.
proportions = pd.concat(
    [_action_counts(results[kind], kind) for kind in KINDS],
    ignore_index=True,
)

# Drop the "?" column produced by the reshaping (presumably a third value of
# the action column in the raw data — TODO confirm). errors="ignore" keeps
# the cell working when no network produced such a column.
proportions = proportions.drop(columns="?", errors="ignore")

proportions

Unnamed: 0,density,size,trials,epsilon,A,B,network
0,1,2,8,0,59,41,complete
1,1,2,8,0,48,52,complete
2,1,2,8,0,37,63,complete
3,1,2,16,0,47,53,complete
4,1,2,16,0,47,53,complete
...,...,...,...,...,...,...,...
1478,0,4,64,0,112,388,barabasialbert
1479,0,4,256,0,121,379,barabasialbert
1480,0,4,256,0,121,379,barabasialbert
1481,0,4,256,0,92,408,barabasialbert


In [5]:
# Save the proportions table to proportions.csv in the working directory
# (with pandas defaults the row index is written as the first column).
proportions.to_csv("proportions.csv")

## Chi-squared Test
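We use the proportions table to compare pairs of network types. Two results are compared only when they share the same network size, number of trials and epsilon; for network types that have several densities per size, each density is tested separately.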

In [4]:
# Column layout of the chi-squared results table
CHI2_COLUMNS = [
    "network",
    "size",
    "trials",
    "epsilon",
    "density",
    "A",
    "B",
    "compared_network",
    "compared_density",
    "compared_A",
    "compared_B",
    "chi2",
    "pvalue",
]

# Parameters two results must share before they are compared
MATCH_COLUMNS = ["size", "trials", "epsilon"]

# Networks with single densities
networks = ["complete", "cycle", "star"]

# Networks to directly compare (where there are multiple densities per size)
direct_comparisons = ["wattsstrogatz", "barabasialbert"]


def _matching_rows(kind, size_name, trials_name, eps_name):
    """Return the rows of ``proportions`` for *kind* with the given size, trials and epsilon."""
    return proportions.query(
        "network == @kind &"
        "size == @size_name &"
        "trials == @trials_name &"
        "epsilon == @eps_name"
    )


def _side(kind, density, group):
    """Bundle (network, density, A, B), taking A and B from the first row of *group*."""
    return kind, density, group["A"].iloc[0], group["B"].iloc[0]


def _chi2_row(params, left, right):
    """Run the chi-squared test for one pair of results and return one result row.

    Parameters
    ----------
    params : tuple
        (size, trials, epsilon) shared by both sides.
    left, right : tuple
        (network, density, A, B) for each side, as built by ``_side``.

    Returns
    -------
    dict
        One record keyed by CHI2_COLUMNS. "chi2" and "pvalue" are NaN when
        the test cannot be computed.
    """
    size_name, trials_name, eps_name = params
    kind, density, count_a, count_b = left
    other_kind, other_density, other_a, other_b = right

    try:
        chi2, p, _dof, _expected = stats.chi2_contingency(
            [[count_a, count_b], [other_a, other_b]]
        )
    except ValueError:
        # chi2_contingency rejects tables whose expected frequencies contain
        # a zero (e.g. neither side ever produced "A"). Record the test as
        # undefined: a p-value of 0 would read as a highly significant
        # difference that was never actually computed.
        chi2 = p = np.nan

    return {
        "network": kind,
        "size": size_name,
        "trials": trials_name,
        "epsilon": eps_name,
        "density": density,
        "A": count_a,
        "B": count_b,
        "compared_network": other_kind,
        "compared_density": other_density,
        "compared_A": other_a,
        "compared_B": other_b,
        "chi2": chi2,
        "pvalue": p,
    }


# Result rows are collected here and turned into a DataFrame once at the end
rows = []

for idx, kind in enumerate(networks):
    network = proportions.query("network == @kind")

    # One group per (size, trials, epsilon) combination, in sorted key order
    for params, eps_group in network.groupby(MATCH_COLUMNS):
        left = _side(kind, eps_group["density"].iloc[0], eps_group)

        # Compare with the single-density networks listed after the current
        # one, so every unordered pair is tested exactly once
        for test_net_kind in networks[idx + 1 :]:
            test_data = _matching_rows(test_net_kind, *params)
            if test_data.empty:
                continue
            right = _side(test_net_kind, test_data["density"].iloc[0], test_data)
            rows.append(_chi2_row(params, left, right))

        # Compare with every density of the multi-density networks
        # (grouping an empty selection yields no groups, so nothing is added)
        for direct_net_kind in direct_comparisons:
            test_data = _matching_rows(direct_net_kind, *params)
            for direct_name, direct_group in test_data.groupby("density"):
                right = _side(direct_net_kind, direct_name, direct_group)
                rows.append(_chi2_row(params, left, right))

# Compare Watts Strogatz with Barabasi Albert Networks at each density
network = proportions.query("network == 'wattsstrogatz'")

# One group per (size, trials, epsilon, density) combination
for keys, den_group in network.groupby(MATCH_COLUMNS + ["density"]):
    params, den_name = keys[:3], keys[3]
    left = _side("wattsstrogatz", den_name, den_group)

    # Get matching data from Barabasi Albert Networks, one test per density
    test_data = _matching_rows("barabasialbert", *params)
    for test_den_name, test_den_group in test_data.groupby("density"):
        right = _side("barabasialbert", test_den_name, test_den_group)
        rows.append(_chi2_row(params, left, right))

# Dataframe to store Chi2 results (explicit columns keep the layout stable
# even when no comparison could be made)
chi2_results = pd.DataFrame(rows, columns=CHI2_COLUMNS)

In [78]:
# Save the chi-squared results table to chi2_test.csv in the working directory
# (with pandas defaults the row index is written as the first column).
chi2_results.to_csv('chi2_test.csv')

In [75]:
# Show only the comparisons whose p-value is above 0.05
chi2_results.query("pvalue > 0.05")

Unnamed: 0,network,size,trials,epsilon,density,A,B,compared_network,compared_density,compared_A,compared_B,chi2,pvalue
2,complete,2,8,0.010,1.000000,48,52,cycle,1.0000,54,46,0.500200,0.479412
3,complete,2,8,0.010,1.000000,48,52,star,1.0000,51,49,0.080008,0.777287
4,complete,2,8,0.100,1.000000,37,63,cycle,1.0000,37,63,0.000000,1.000000
5,complete,2,8,0.100,1.000000,37,63,star,1.0000,40,60,0.084468,0.771331
6,complete,2,16,0.001,1.000000,47,53,cycle,1.0000,46,54,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,cycle,16,4096,0.001,0.133333,0,100,star,0.1250,1,99,0.000000,1.000000
523,cycle,16,8192,0.001,0.133333,0,100,star,0.1250,1,99,0.000000,1.000000
524,cycle,32,4,0.002,0.064516,0,500,star,0.0625,1,499,0.000000,1.000000
525,cycle,32,64,0.001,0.064516,0,600,star,0.0625,2,598,0.500835,0.479134
