In [1]:
#necessary imports
import pandas as pd
import editdistance
import scipy
import numpy as np

In [2]:
## Reading the data and removing columns that are not important. 
df = pd.read_csv("AppNetSci_ELP_Data.csv", sep = ',', encoding = 'latin-1', usecols = ['Word','I_Zscore','I_Mean_Accuracy','I_NMG_Zscore', 'I_NMG_Mean_Accuracy'])

In [3]:
#conversion and creation of word list to build dictionary set
objNames = df.iloc[:,0]
names = []
for x in range(len(objNames)):
    names.append(str(objNames[x]))
names[0:10]

['zenith',
 'zephyr',
 'zeppelin',
 'zero',
 'zeroed',
 'zeros',
 'zest',
 'zigzag',
 'zigzagging',
 'zimbabwe']

In [4]:
def buildOrthogonalityNetwork(nameList):
    #initializing dictionary
    orthoNetwork = dict(zip(nameList, [None]*len(nameList)))
    #for each name
    for name in nameList:
        for compareName in nameList:
            if editdistance.eval(name,compareName) == 1:
                if orthoNetwork[name] == None:
                    orthoNetwork[name] = [compareName]
                else:
                    orthoNetwork[name] += [compareName]
    return orthoNetwork

In [5]:
#AVOID RUNNNING: takes >30 minutes to process the (43,000)^2 entries with NP-hard edit distance
#Load pickle file in next section if you are interested in raw data
#pickle file represents dict of all words that have at least a single connection
network = buildOrthogonalityNetwork(names)

In [6]:
#verify edges and nodes are approximately what they should be
totalEdges = 0
totalHermits = 0
for word in network.keys():
    if network[word] == None:
        totalHermits += 1
        continue
    totalEdges += len(network[word])
print((len(network.keys()),totalEdges/2, totalHermits/len(network.keys())))

(40468, 41517.0, 0.40748245527330235)


In [7]:
#removing out the hermits
hermitlessDict = {}
for word in network.keys():
    if network[word] != None:
        hermitlessDict[word] = network[word]
len(hermitlessDict.keys())

23978

In [31]:
#saving either hermitless dict
import pickle
# create a binary pickle file 
f = open("hermitlessSimilarDict.pkl","wb")
# write the python object (dict) to pickle file
pickle.dump(hermitlessSimilarDict,f)
# close file
f.close()

In [2]:
#loading either pickle if needed
import pickle
#load pickle file
file = open('hermitlessDict.pkl', 'rb')
#get data back
hermitlessDict = pickle.load(file)
file.close()

In [3]:
#using bfs with a known common word to generate LCC for the undirected graph
LCCDict = {'top':hermitlessDict['top']}
#intializer
queue = hermitlessDict['top'].copy()
#cycle through
while len(queue) > 0:
    #remove first word in list
    potentialWord = queue.pop(0)
    #add to LCC
    LCCDict[potentialWord] = hermitlessDict[potentialWord]
    #find all words connected to this one
    potentialNewWords = hermitlessDict[potentialWord]
    #cycle through those words
    for potentialWord in potentialNewWords:
        #if we haven't added it in yet
        if potentialWord not in LCCDict.keys() and potentialWord not in queue:
            #add to queue
            queue.append(potentialWord)
len(LCCDict.keys())

11363

In [4]:
#finding degree of each node and average degree in LCC
numEdges = 0
for word in LCCDict.keys():
    numEdges += len(LCCDict[word])
numEdges/len(LCCDict.keys())

5.7671389597817475

In [5]:
#finding mean clustering coefficient 
import networkx as nx
nxDict = nx.Graph(LCCDict)

In [6]:
nx.transitivity(nxDict)

0.27377735607763865

In [7]:
#takes a few minutes, avoid if possible
nx.average_shortest_path_length(nxDict)

8.782726505453184

In [22]:
#back to data, grab data from original datasource file
recog = df.iloc[:,0:5]
print(recog)

                Word  I_Zscore  I_Mean_Accuracy  I_NMG_Zscore  \
0             zenith     -0.01             0.79          0.02   
1             zephyr      0.36             0.39          0.15   
2           zeppelin      0.51             0.61          0.29   
3               zero     -0.76             0.97         -0.53   
4             zeroed      0.48             0.76          0.00   
...              ...       ...              ...           ...   
40463  agglutination      1.14             0.29          2.31   
40464     aggravated     -0.10             0.94         -0.17   
40465     aggravates      0.53             0.77         -0.04   
40466    aggravation      0.13             0.91          0.16   
40467      aggregate     -0.08             0.84          0.08   

       I_NMG_Mean_Accuracy  
0                     0.79  
1                     0.69  
2                     0.96  
3                     1.00  
4                     1.00  
...                    ...  
40463           

In [15]:
#make large dataset with [word] x [recog. stats: first level features: ortho measures]
masterOrthoData = np.zeros((len(LCCDict.keys())-1,11))

In [18]:
#get recog stats from dataset
RTrow = 0
row = 0
masterWordList = []
for word in recog['Word']:
    if word in LCCDict.keys():
        #get RT
        masterWordList.append(word)
        RT = recog['I_Zscore'][RTrow]
        Acc = recog['I_Mean_Accuracy'][RTrow]
        RT_Name = recog['I_NMG_Zscore'][RTrow]
        Acc_Name = recog['I_NMG_Mean_Accuracy'][RTrow]
        masterOrthoData[row][0] = RT
        masterOrthoData[row][1] = Acc
        masterOrthoData[row][2] = RT_Name
        masterOrthoData[row][3] = Acc_Name
        row += 1
    RTrow += 1
masterOrthoData

array([[-0.76,  0.97, -0.53, ...,  0.  ,  0.  ,  0.  ],
       [-0.61,  1.  , -0.31, ...,  0.  ,  0.  ,  0.  ],
       [-0.39,  0.94, -0.39, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [-0.54,  1.  , -0.63, ...,  0.  ,  0.  ,  0.  ],
       [ 0.13,  0.59,  0.13, ...,  0.  ,  0.  ,  0.  ],
       [ 0.69,  0.25,  0.44, ...,  0.  ,  0.  ,  0.  ]])

In [26]:
#download first level predictors
## Reading the data and removing columns that are not important. 
firstLevelArray = pd.read_csv("FirstLevelPredictorStorage.csv", sep = ',', encoding = 'latin-1')

In [27]:
#flipping to match other datasets
firstLevelArray = firstLevelArray.iloc[::-1]

In [22]:
#now add in the predictors like we did the RT stats
row = 0
#for each word in established order from before
for word in masterWordList:
    #find row in data set
    nextRow = firstLevelArray.loc[firstLevelArray['Word'] == word]
    #capitalize if necessary and rerun
    if nextRow['Length'].values.size == 0:
        nextRow = firstLevelArray.loc[firstLevelArray['Word'] == word.capitalize()]
    #fill out master
    masterOrthoData[row][4] = nextRow['Length'].values[0]
    masterOrthoData[row][7] = nextRow['Log_Freq_HAL'].values[0]
    if nextRow['NPhon'].values not in ['#']:
        masterOrthoData[row][5] = nextRow['NPhon'].values[0]
        masterOrthoData[row][6] = nextRow['NSyll'].values[0]
    else:
        masterOrthoData[row][5] = 0
        masterOrthoData[row][6] = 0
    row = row + 1
masterOrthoData

array([[-0.76,  0.97, -0.53, ...,  0.  ,  0.  ,  0.  ],
       [-0.61,  1.  , -0.31, ...,  0.  ,  0.  ,  0.  ],
       [-0.39,  0.94, -0.39, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [-0.54,  1.  , -0.63, ...,  0.  ,  0.  ,  0.  ],
       [ 0.13,  0.59,  0.13, ...,  0.  ,  0.  ,  0.  ],
       [ 0.69,  0.25,  0.44, ...,  0.  ,  0.  ,  0.  ]])

In [23]:
#now to generate the ortho network stats
row = 0
for word in masterWordList:
    wordDegree = len(LCCDict[word])
    wordCloseCentrality = nx.closeness_centrality(nxDict,u=word)
    wordClustering = nx.clustering(nxDict,nodes=word)
    masterOrthoData[row][8] = wordDegree
    masterOrthoData[row][9] = wordClustering
    masterOrthoData[row][10] = wordCloseCentrality
    row = row + 1

In [24]:
import statsmodels.api as sm

In [63]:
print(masterOrthoData[0])
print(masterOrthoData[0,4:8])

[-0.76        0.97       -0.53        1.          4.          4.
  2.          9.965       3.          0.33333333  0.1228311 ]
[4.    4.    2.    9.965]


In [65]:
firstLevel = masterOrthoDataCopy[:,5:9].copy()
secondLevel = masterOrthoDataCopy[:,5:12].copy()
firstLevel = sm.tools.tools.add_constant(firstLevel, prepend=True)
secondLevel = sm.tools.tools.add_constant(secondLevel, prepend=True)
for x in range(4):
    mod = sm.OLS(masterOrthoData[:,x],firstLevel)
    res = mod.fit()
    print(res.summary())
    mod = sm.OLS(masterOrthoData[:,x],secondLevel)
    res = mod.fit()
    print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.437
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     2202.
Date:                Wed, 12 Apr 2023   Prob (F-statistic):               0.00
Time:                        18:44:04   Log-Likelihood:                -292.07
No. Observations:               11362   AIC:                             594.1
Df Residuals:                   11357   BIC:                             630.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4653      0.016     29.537      0.0

In [4]:
#building gloveDict
glove_dictionary = {}
with open('glove.6B.200d.txt',encoding="utf8") as file:
    for each_message in file:
        words_in_message, coeff_cients = each_message.split(maxsplit=1)
        coeff_cients = np.array(coeff_cients.split(),dtype = float)
        glove_dictionary[words_in_message] = coeff_cients

In [5]:
#assigning all relevant words to their vectorized form in a dict
word_to_glove_dict = {}
for word in names:
    if word in glove_dictionary.keys():
        word_to_glove_dict[word] = glove_dictionary[word]

In [7]:
#cosine similarity tester similar to orthogonality of 1 tester.
#tested ideal threshold of 0.55
similarity_network = {}
for word in word_to_glove_dict.keys():
    neighbors = []
    for otherWord in word_to_glove_dict.keys():
        cosine = np.dot(word_to_glove_dict[word],word_to_glove_dict[otherWord])/(np.linalg.norm(word_to_glove_dict[word])*np.linalg.norm(word_to_glove_dict[otherWord]))
        if cosine > 0.6 and cosine < 0.99:
            neighbors.append(otherWord)
    if neighbors == []:
        similarity_network[word] = []
    else:
        similarity_network[word] = neighbors

In [8]:
#verify edges and nodes are approximately what they should be
totalEdges = 0
totalHermits = 0
for word in similarity_network.keys():
    if similarity_network[word] == []:
        totalHermits += 1
        continue
    totalEdges += len(similarity_network[word])
print(totalEdges/len(similarity_network.keys()), totalHermits/len(similarity_network.keys()))

4.3372667036245085 0.40365882632454264


In [12]:
#removing out the hermits
hermitlessSimilarDict = {}
for word in similarity_network.keys():
    if similarity_network[word] != []:
        hermitlessSimilarDict[word] = similarity_network[word]
len(hermitlessSimilarDict.keys())

22590

In [15]:
#using bfs with a known common word to generate LCC for the undirected graph
LCCSimilarDict = {'top':hermitlessSimilarDict['top']}
#intializer
queue = hermitlessSimilarDict['top'].copy()
#cycle through
while len(queue) > 0:
    #remove first word in list
    potentialWord = queue.pop(0)
    #add to LCC
    LCCSimilarDict[potentialWord] = hermitlessSimilarDict[potentialWord]
    #find all words connected to this one
    potentialNewWords = hermitlessSimilarDict[potentialWord]
    #cycle through those words
    for potentialWord in potentialNewWords:
        #if we haven't added it in yet
        if potentialWord not in LCCSimilarDict.keys() and potentialWord not in queue:
            #add to queue
            queue.append(potentialWord)
len(LCCSimilarDict.keys())

17626

In [16]:
#finding degree of each node and average degree in LCC
numEdges = 0
for word in LCCSimilarDict.keys():
    numEdges += len(LCCSimilarDict[word])
numEdges/len(LCCSimilarDict.keys())

8.921820038579371

In [17]:
#finding mean clustering coefficient 
import networkx as nx
nxSimilarDict = nx.Graph(LCCSimilarDict)

In [18]:
nx.transitivity(nxSimilarDict)

0.43026549484798804

In [20]:
#takes a few minutes, avoid if possible
nx.average_shortest_path_length(nxSimilarDict)

8.24569964583268

In [24]:
#now that we have functioning network we try to build the testing matrix again
masterSimilarData = np.zeros((len(LCCSimilarDict.keys()),11))
RTrow = 0
row = 0
masterWordList = []
for word in recog['Word']:
    if word in LCCSimilarDict.keys():
        #get RT
        masterWordList.append(word)
        RT = recog['I_Zscore'][RTrow]
        Acc = recog['I_Mean_Accuracy'][RTrow]
        RT_Name = recog['I_NMG_Zscore'][RTrow]
        Acc_Name = recog['I_NMG_Mean_Accuracy'][RTrow]
        masterSimilarData[row][0] = RT
        masterSimilarData[row][1] = Acc
        masterSimilarData[row][2] = RT_Name
        masterSimilarData[row][3] = Acc_Name
        row += 1
    RTrow += 1
masterSimilarData

array([[-0.39,  0.94, -0.39, ...,  0.  ,  0.  ,  0.  ],
       [ 0.38,  0.91,  0.27, ...,  0.  ,  0.  ,  0.  ],
       [-0.45,  0.91, -0.21, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.  ,  0.86,  0.1 , ...,  0.  ,  0.  ,  0.  ],
       [-0.1 ,  0.94, -0.17, ...,  0.  ,  0.  ,  0.  ],
       [ 0.53,  0.77, -0.04, ...,  0.  ,  0.  ,  0.  ]])

In [28]:
#first level predictors again
row = 0
#for each word in established order from before
for word in masterWordList:
    #find row in data set
    nextRow = firstLevelArray.loc[firstLevelArray['Word'] == word]
    #capitalize if necessary and rerun
    if nextRow['Length'].values.size == 0:
        nextRow = firstLevelArray.loc[firstLevelArray['Word'] == word.capitalize()]
    #fill out master
    masterSimilarData[row][4] = nextRow['Length'].values[0]
    masterSimilarData[row][7] = nextRow['Log_Freq_HAL'].values[0]
    if nextRow['NPhon'].values not in ['#']:
        masterSimilarData[row][5] = nextRow['NPhon'].values[0]
        masterSimilarData[row][6] = nextRow['NSyll'].values[0]
    else:
        masterSimilarData[row][5] = 0
        masterSimilarData[row][6] = 0
    row = row + 1
masterSimilarData

array([[-0.39,  0.94, -0.39, ...,  0.  ,  0.  ,  0.  ],
       [ 0.38,  0.91,  0.27, ...,  0.  ,  0.  ,  0.  ],
       [-0.45,  0.91, -0.21, ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.  ,  0.86,  0.1 , ...,  0.  ,  0.  ,  0.  ],
       [-0.1 ,  0.94, -0.17, ...,  0.  ,  0.  ,  0.  ],
       [ 0.53,  0.77, -0.04, ...,  0.  ,  0.  ,  0.  ]])

In [29]:
#finding semantic network stats
#now to generate the ortho network stats
row = 0
for word in masterWordList[0:20]:
    if word not in similarity_network.keys():
        print(word)
        row = row + 1
        continue
    wordDegree = len(similarity_network[word])
    wordCloseCentrality = nx.closeness_centrality(nxSimilarDict,u=word)
    wordClustering = nx.clustering(nxSimilarDict,nodes=word)
    masterSimilarData[row][8] = wordDegree
    masterSimilarData[row][9] = wordClustering
    masterSimilarData[row][10] = wordCloseCentrality
    row = row + 1
masterSimilarData

array([[-0.39      ,  0.94      , -0.39      , ...,  4.        ,
         0.5       ,  0.10663077],
       [ 0.38      ,  0.91      ,  0.27      , ...,  7.        ,
         0.52380952,  0.1333631 ],
       [-0.45      ,  0.91      , -0.21      , ...,  7.        ,
         0.47619048,  0.1079031 ],
       ...,
       [ 0.        ,  0.86      ,  0.1       , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1       ,  0.94      , -0.17      , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.53      ,  0.77      , -0.04      , ...,  0.        ,
         0.        ,  0.        ]])

In [32]:
masterSimilarData[0]

array([-0.39      ,  0.94      , -0.39      ,  1.        ,  4.        ,
        4.        ,  1.        ,  5.663     ,  4.        ,  0.5       ,
        0.10663077])

In [35]:
import statsmodels.api as sm
firstLevel = masterSimilarData[:,4:8].copy()
secondLevel = masterSimilarData[:,4:12].copy()
firstLevel = sm.tools.tools.add_constant(firstLevel, prepend=True)
secondLevel = sm.tools.tools.add_constant(secondLevel, prepend=True)
for x in range(4):
    mod = sm.OLS(masterSimilarData[:,x],firstLevel)
    res = mod.fit()
    print(res.summary())
    mod = sm.OLS(masterSimilarData[:,x],secondLevel)
    res = mod.fit()
    print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.594
Model:                            OLS   Adj. R-squared:                  0.594
Method:                 Least Squares   F-statistic:                     6454.
Date:                Fri, 14 Apr 2023   Prob (F-statistic):               0.00
Time:                        16:37:33   Log-Likelihood:                -909.85
No. Observations:               17626   AIC:                             1830.
Df Residuals:                   17621   BIC:                             1869.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1012      0.012      8.402      0.0