In [19]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy.stats as stats

In [20]:
# Main parameters

# Input table, read with pd.read_csv further down.
items_in = "demo_items_in.csv"
# Output: the input table plus the 'groups' column.
items_out = "demo_items_grouped.csv"
# Output: one row per group.
groups_out = "demo_groups.csv"

In [21]:
### graphmx_from_test ###
# items_in columns: id, mean, sd, n
# returns 2D numpy array

def graphmx_from_test(df_items_in, test_type, p_crit = 0.05):
    """Build a symmetric 0/1 matrix linking items that do NOT differ significantly.

    Every unordered pair of rows is compared with the requested test. A pair
    gets the value 1 when its two-sided p-value is >= ``p_crit`` (the
    difference is not significant) and 0 otherwise. The diagonal is always 0.

    Parameters
    ----------
    df_items_in : pandas.DataFrame
        One row per item. For ``'ttest'`` the columns ``mean``, ``sd`` and
        ``n`` are read.
    test_type : str
        ``'ttest'`` runs a pooled-variance two-sample t-test computed from the
        summary statistics. ``'chisq'`` is a placeholder that links no pair
        (all-zero matrix).
    p_crit : float, optional
        Significance threshold, 0.05 by default.

    Returns
    -------
    numpy.ndarray or int
        Float array of shape ``(len(df_items_in), len(df_items_in))``, or
        ``-1`` (after printing a message) when ``test_type`` is not recognised.
    """
    # Validate once, up front, so the outcome does not depend on whether the
    # frame is large enough for at least one pair to be evaluated.
    if test_type not in ('ttest', 'chisq'):
        print("Unidentified significance test requested. Quitting...")
        return -1

    n_items = len(df_items_in)
    graphmx = np.zeros(shape=(n_items, n_items))
    # 'chisq' sets no edges, and fewer than two items means there are no pairs.
    if test_type == 'chisq' or n_items < 2:
        return graphmx

    # Pull the columns out once instead of materialising rows inside a loop.
    means = df_items_in['mean'].to_numpy(dtype=float)
    sds = df_items_in['sd'].to_numpy(dtype=float)
    ns = df_items_in['n'].to_numpy(dtype=float)

    # Index arrays of every pair (i, j) with i < j.
    rows, cols = np.triu_indices(n_items, k=1)

    dof = ns[rows] + ns[cols] - 2
    pooled_var = ((ns[rows] - 1) * sds[rows]**2 + (ns[cols] - 1) * sds[cols]**2) / dof
    std_err = np.sqrt(pooled_var * ((1 / ns[rows]) + (1 / ns[cols])))
    tt = (means[rows] - means[cols]) / std_err
    pval = stats.t.sf(np.abs(tt), dof) * 2

    # A NaN p-value (e.g. both sd == 0) compares False and therefore yields 0.
    graphmx[rows, cols] = pval >= p_crit
    # Mirror the upper triangle to make the matrix symmetric.
    return graphmx + graphmx.T

In [22]:
### rm_redundant_cliques ###
# removes the cliques that are entirely contained in another clique

def rm_redundant_cliques(cliques):
    """Drop every clique whose members are all contained in another clique.

    A clique is redundant when its member set is a proper subset of any other
    clique's member set, wherever that other clique sits in the list. Of
    several cliques with identical members only the last one is kept. Empty
    cliques are always dropped.

    Parameters
    ----------
    cliques : list of list
        Cliques as lists of hashable node ids. The list is modified in place.

    Returns
    -------
    list of list
        The same list object as ``cliques``, now holding only the
        non-redundant cliques in their original relative order.
    """
    # Build each member set once instead of on every comparison.
    clique_sets = [set(clique) for clique in cliques]
    total = len(clique_sets)
    progress_step = int(total / 100 + 1)

    keep = []
    for i, current in enumerate(clique_sets):
        # `current < other` is a proper-subset test; equal sets only count as
        # redundant against a later duplicate so exactly one copy survives.
        redundant = (not current) or any(
            current < other or (j > i and current == other)
            for j, other in enumerate(clique_sets)
        )
        keep.append(not redundant)
        if i % progress_step == 0:
            print(f"Removing redundant cliques... {int((i/total*100))} %", end='\r')
    print("Removing redundant cliques... done")
    print("Finishing...", end='\r')
    # Slice assignment keeps the caller's list object, so the update is in place.
    cliques[:] = [clique for clique, kept in zip(cliques, keep) if kept]
    print("Finishing... done")
    return cliques

In [23]:
### name_groups ###
# assigns the names of the groups the item belongs to and adds them as a new column to the original data frame

def name_groups(df_items_in, groups):
    """Label each item with the names of the groups it belongs to.

    Groups are named by position: a..z, then aa..zz, then aaa..zzz and so on
    (one letter repeated), so any number of groups can be named. Each item's
    label is the concatenation of its group names, each followed by a comma
    (for example ``"a,b,"``). Items in no group get an empty string.

    Parameters
    ----------
    df_items_in : pandas.DataFrame
        Item table. A ``groups`` column is added to this frame in place.
    groups : sequence of sequence of int
        Each group lists the positional row numbers of its members.

    Returns
    -------
    pandas.DataFrame
        The same frame object as ``df_items_in``, with the ``groups`` column set.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    group_column = [""] * len(df_items_in)

    for index, group in enumerate(groups):
        # index 0..25 -> 'a'..'z', 26..51 -> 'aa'..'zz', 52.. -> 'aaa'.. etc.
        repeats, letter_pos = divmod(index, len(alphabet))
        label = alphabet[letter_pos] * (repeats + 1)
        for element in group:
            group_column[element] += label + ","

    df_items_in['groups'] = group_column

    return df_items_in

In [24]:
# Load the item table; graphmx_from_test reads its mean, sd and n columns.
df_items = pd.read_csv(filepath_or_buffer=items_in)
df_items.head(n=5)

Unnamed: 0,id,mean,sd,n
0,A,1.288684,0.261579,20
1,B,1.378158,0.269093,20
2,C,1.345976,0.294123,20
3,D,1.124563,0.315958,20
4,E,0.64889,0.267577,20


In [25]:
# Matrix of "not significantly different" pairs for the first 20 items.
a = graphmx_from_test(df_items[:20], test_type='ttest', p_crit=0.05)
# Treat the matrix as an adjacency matrix and list every clique of the graph.
G = nx.Graph(a)
cliques = list(nx.enumerate_all_cliques(G))

print(f"Number of all cliques: {len(cliques)}")

Number of all cliques: 32


In [26]:
# NOTE: rm_redundant_cliques edits `cliques` in place and returns that same list.
cliques_cleaned = rm_redundant_cliques(cliques=cliques)
print(f"Number of non-redundant cliques (groups): {len(cliques_cleaned)}")

Removing redundant cliques... done
Finishing... done
Number of non-redundant cliques (groups): 4


In [27]:
# Sort the groups first: name_groups assigns letters by position in this list.
cliques_cleaned = sorted(cliques_cleaned)
df_items_out = name_groups(df_items_in=df_items, groups=cliques_cleaned)
df_items_out.head(n=5)

Unnamed: 0,id,mean,sd,n,groups
0,A,1.288684,0.261579,20,"a,b,"
1,B,1.378158,0.269093,20,"a,"
2,C,1.345976,0.294123,20,"a,"
3,D,1.124563,0.315958,20,"b,c,"
4,E,0.64889,0.267577,20,"d,"


In [28]:
# Write the labelled item table without the row index.
df_items_out.to_csv(path_or_buf=items_out, index=False)

In [29]:
# One row per group. `np.float` was only an alias for the builtin `float` and
# has been removed from NumPy (1.24+), so the builtin is used directly.
# NOTE(review): the float dtype presumably exists so that shorter groups can be
# padded with NaN -- confirm against the written CSV.
df_groups = pd.DataFrame(cliques_cleaned, dtype=float)

In [30]:
# Write the group table; the row index is written as well.
df_groups.to_csv(path_or_buf=groups_out)