In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy.stats as stats

In [36]:
# Main parameters: file names consumed by the load/export cells of this notebook

# Excel workbook holding the item table (see graphmx_from_test for the columns it reads)
items_in = "items_in_ttest.xlsx"
# CSV that receives the item table once group labels have been added
items_out = "items_grouped.csv"
# CSV that receives the table of groups
groups_out = "groups.csv"

In [3]:
### graphmx_from_test ###
# items_in columns: id, mean, sd, n
# returns 2D numpy array

def graphmx_from_test(df_items_in, test_type, p_crit = 0.05):
    """Build the adjacency matrix of items that do NOT differ significantly.

    Every unordered pair of rows is compared with the requested test; the
    matrix entry is 1 when the p-value is >= ``p_crit`` and 0 otherwise.
    The diagonal is always 0.

    Parameters
    ----------
    df_items_in : pandas.DataFrame
        One row per item. For ``'ttest'`` the columns ``mean``, ``sd`` and
        ``n`` are read; no other column is used.
    test_type : str
        ``'ttest'`` runs a two-sided, pooled-variance two-sample t-test on the
        summary statistics. ``'chisq'`` is accepted but not implemented: it
        yields an all-zero matrix.
    p_crit : float, optional
        Significance threshold (default 0.05).

    Returns
    -------
    numpy.ndarray or int
        Symmetric ``(len(df), len(df))`` float array of 0/1 flags, or ``-1``
        (after printing a message) when ``test_type`` is not recognised.
    """
    # Validate once, up front, so a bad test type is reported regardless of
    # how many items there are (it is not data dependent).
    if test_type not in ('ttest', 'chisq'):
        print("Unidentified significance test requested. Quitting...")
        return -1

    n_items = len(df_items_in)
    graphmx = np.zeros(shape=(n_items, n_items))
    # Nothing to compare: placeholder chi-square test, or fewer than two items.
    if test_type == 'chisq' or n_items < 2:
        return graphmx

    # Pull the columns out once; per-cell .iloc lookups inside the pair loop
    # dominate the run time on anything but tiny tables.
    means = df_items_in['mean'].to_numpy(dtype=float)
    sds = df_items_in['sd'].to_numpy(dtype=float)
    sizes = df_items_in['n'].to_numpy(dtype=float)
    sum_sq = (sizes - 1) * sds**2  # (n - 1) * sd^2 for every item

    # Fill the strict upper triangle one row at a time (item i vs. all j > i).
    for i in range(n_items - 1):
        rest = slice(i + 1, None)
        dof = sizes[i] + sizes[rest] - 2
        pooled_var = (sum_sq[i] + sum_sq[rest]) / dof
        std_err = np.sqrt(pooled_var * (1/sizes[i] + 1/sizes[rest]))
        tt = (means[i] - means[rest]) / std_err
        pval = stats.t.sf(np.abs(tt), dof) * 2
        # NaN p-values (e.g. zero variance in both items) compare False -> 0.
        graphmx[i, rest] = pval >= p_crit

    # Mirror the upper triangle to obtain the symmetric matrix.
    return graphmx + graphmx.T

In [12]:
### rm_redundant_cliques ###
# removes the cliques that are entirely contained in another clique

def rm_redundant_cliques(cliques):
    """Drop every clique whose members are all contained in another clique.

    A clique is redundant when it is a proper subset of any other clique in
    the list, whatever their relative order. When several cliques have exactly
    the same members, only the last one is kept. Empty cliques are dropped.
    The surviving cliques keep their original relative order.

    Parameters
    ----------
    cliques : list
        List of cliques, each an iterable of hashable node ids. The list is
        updated in place.

    Returns
    -------
    list
        The same list object, now holding only the non-redundant cliques.
    """
    total = len(cliques)
    # Build each membership set once instead of once per compared pair.
    member_sets = [set(clique) for clique in cliques]
    step = int(total/100+1)  # report progress roughly every 1 %
    keep = [True] * total

    for i, current in enumerate(member_sets):
        if not current:
            keep[i] = False
        else:
            for j, other in enumerate(member_sets):
                if j == i:
                    continue
                # Proper subset of any clique, or duplicate of a later one.
                if current < other or (j > i and current == other):
                    keep[i] = False
                    break
        if (i % step) == 0:
            print(f"Removing redundant cliques... {int((i/total*100))} %", end='\r')
    print("Removing redundant cliques... done")
    print("Finishing...", end='\r')
    # Single-pass filter; slice assignment keeps the caller's list object.
    cliques[:] = [clique for clique, kept in zip(cliques, keep) if kept]
    print("Finishing... done")
    return cliques

In [7]:
### name_groups ###
# assigns the names of the groups the item belongs to and adds them as a new column to the original data frame

def name_groups(df_items_in, groups):
    """Label each item with the groups it belongs to.

    Groups are named in order: ``a`` .. ``z``, then ``aa`` .. ``zz``, then
    ``aaa`` .. ``zzz`` and so on, so any number of groups can be labelled.
    Each item's labels are concatenated, every label followed by a comma
    (e.g. ``"a,c,"``); items in no group get an empty string.

    Parameters
    ----------
    df_items_in : pandas.DataFrame
        Item table. A ``'groups'`` column is added to (or overwritten on)
        this very frame.
    groups : sequence of sequences of int
        Each group lists the row positions (0-based) of its member items.

    Returns
    -------
    pandas.DataFrame
        ``df_items_in`` itself, with the ``'groups'`` column set.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    group_column = [""] * len(df_items_in)

    for index, group in enumerate(groups):
        # index 0-25 -> one letter, 26-51 -> doubled letter, 52-77 -> tripled...
        repeats, letter_pos = divmod(index, len(alphabet))
        label = alphabet[letter_pos] * (repeats + 1)
        for element in group:
            # element is used as a position into the rows, not as an index label
            group_column[element] += label + ","
    df_items_in['groups'] = group_column

    return df_items_in

In [8]:
# Load the item table from the Excel workbook named by `items_in`
df_items = pd.read_excel(items_in)
# Preview the first rows of the loaded table
df_items.head()

Unnamed: 0,id,mean,sd,n
0,MALE_1-2-3 stars_A_0222,4.47,2.76,118
1,MALE_1-2-3 stars_A_2022,4.8,2.59,118
2,MALE_1-2-3 stars_A_2202,8.95,1.5,118
3,MALE_1-2-3 stars_A_2220,4.18,2.79,118
4,MALE_1-2-3 stars_T_2111,8.08,1.64,118


In [10]:
# Adjacency matrix for the first 20 items: an edge joins two items whose
# t-test p-value is at least 0.05 (i.e. no significant difference).
a = graphmx_from_test(df_items[:20], 'ttest', 0.05)
G = nx.Graph(a)
# Materialise every clique of the graph (not only the maximal ones).
cliques = list(nx.enumerate_all_cliques(G))

print(f"Number of all cliques: {len(cliques)}")

Number of all cliques: 95


In [13]:
# rm_redundant_cliques edits the list it receives in place, so hand it a
# shallow copy: `cliques` then still holds every clique counted above and
# this cell gives the same result when it is re-run.
cliques_cleaned = rm_redundant_cliques(list(cliques))
print(f"Number of non-redundant cliques (groups): {len(cliques_cleaned)}")

Removing redundant cliques... done
Finishing... done
Number of non-redundant cliques (groups): 12


In [14]:
# Sort the groups so that group letters are handed out in a reproducible order
cliques_cleaned = sorted(cliques_cleaned)
# name_groups writes its 'groups' column into the frame it is given and
# returns that same frame; label a copy so df_items stays as loaded and
# df_items_out is an independent table.
df_items_out = name_groups(df_items.copy(), cliques_cleaned)
df_items_out.head()

Unnamed: 0,id,mean,sd,n,Groups
0,MALE_1-2-3 stars_A_0222,4.47,2.76,118,"a,b,c,"
1,MALE_1-2-3 stars_A_2022,4.8,2.59,118,"a,b,d,"
2,MALE_1-2-3 stars_A_2202,8.95,1.5,118,"e,"
3,MALE_1-2-3 stars_A_2220,4.18,2.79,118,"a,c,f,"
4,MALE_1-2-3 stars_T_2111,8.08,1.64,118,"g,"


In [46]:
# Write the labelled item table to the file named by `items_out`, without the index column
df_items_out.to_csv(items_out, index=False)

In [32]:
# One row per group, one column per member position.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
# NOTE(review): the float dtype is presumably there because groups differ in
# size and the shorter rows get padded with missing values - confirm.
df_groups = pd.DataFrame(cliques_cleaned, dtype=float)

In [37]:
# Write the group table to the file named by `groups_out` (index column included)
df_groups.to_csv(groups_out)