In [1]:
import os
import zipfile
import pandas as pd
import csv
import json
import nltk
from nltk import pos_tag, word_tokenize
import statistics
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import matplotlib as mpl
from random import shuffle

# Not runnable as of submission: requires data. See readme for content description. Contact me if anything is needed

In [2]:
def get_dwug_data(target_words=tuple(), target_pos=tuple()):
    '''
    returns worddata from dwug annotations
    args:
        target_words: iterable of words to return. If empty, returns all words
        target_pos: iterable of parts of speech to filter for. If empty, does not filter for part of speech
    '''
    
    dwug_datapath = './annotation_data/dwug_en/data/'
    worddata = {}
    #worddata['skipped_words'] = []
    skipped_words = []
    for word in os.listdir(dwug_datapath):
        if target_words:
            if not word in target_words:
                continue
        if target_pos:
            if not word.split('_')[1] in target_pos:
                continue
        wordpath = os.path.join(dwug_datapath, word)
        #print(word)
        try:
            # The files for some words do not get read properly by pandas, I think because they contain mid-string apostrophes.
            # Could be fixed with preprocessing, but my purpose with this data is to select a few appropriate words
            # with which to compare annotations, so I do not need all the words anyway.

            # Could set read_csv(on_bad_lines=skip) for the same effect as below, but this way I can track what I'm skipping.
            worddata[word] = {}
            worddata[word]['judgments'] = pd.read_csv(os.path.join(wordpath, 'judgments.csv'), sep='\t')
            worddata[word]['uses'] = pd.read_csv(os.path.join(wordpath, 'uses.csv'), sep='\t')
        except:
            print('Skipping', word)
            worddata.pop(word)
            skipped_words.append(word) # record of what has been skipped
    return worddata, skipped_words

In [3]:
def get_uses_and_pairs(worddata, skipped_words, prespecified_words=False, specifiers={'min_judgments': 3, 'num_words': 2, 'num_usepairs': 25}):

    '''
    returns pandas dataframes of uses and usepairs/pre-annotated judgment files in DURel readable format
    args:
        worddata: output format of get_dwug_data
        skipped_words: array-like of manually skipped lemmas
        prespecified_words: array-like of specific words to get. If truthy, other specifiers except 'num_usepairs' will be ignored
        specifiers:
            min_judgments: only get words with at least this many annotations
            num_words: return data for this many words, sorted to include those with the most annotations
            num_usepairs: number of edges included in usepairs file
        
    '''
    # Commented out but kept some code used only to familiarise myself with the data
    
    # Get counts of judgment pairs
    # judgment pair = unique combination of uses that cooccur in the annotation data, where pair-order does not matter

    if prespecified_words:
        print("Getting prespecified words. The only specifier filter that will be applied is 'num_usepairs'.")
        for word in skipped_words:
            if word in prespecified_words:
                print(word, 'was skipped in the get_data step and will not be returned')
    #groupingdict = {}
    perword_jpair_count = {}
    for word, usejudge in list(worddata.items()):
        if prespecified_words:
            if not word in prespecified_words:              
                continue

        judgments = usejudge['judgments']
        judgment_pairs = {}
        for i in range(len(judgments)): #could loop over judgments, but iterating over dataframes is sloooow
            row = judgments.iloc[i]
            id1 = row['identifier1']
            id2 = row['identifier2']
            identifiers = frozenset([id1, id2]) # set because order of pairs do not matter; frozenset to make it hashable

            # Block used to determine if uses are paired with uses of the same period
            # or only the other period:

            #usedata = worddata[word]['uses']
            #userow1 = usedata.loc[usedata['identifier'] == id1]
            # userow2 = usedata.loc[usedata['identifier'] == id2]
            # 'grouping' = the time period the use is associated with.

            #grouping1 = userow1['grouping'].iloc[0]
            #grouping2 = userow2['grouping'].iloc[0]
            #groupingstr = " ".join([str(grouping1), str(grouping2)])
            #if groupingstr not in groupingdict:
            #    groupingdict[groupingstr] = 0
            #groupingdict[groupingstr] +=1
            

            if row['judgment'] == 0: # 0-judgments are categorically different from the rest;
                                    # they indicate something like 'nonjudgment' (due to nonunderstanding etc.), not a similarity score.
                                    # Thus, they should not be used for the average, and I will not validate against it
                continue

            if not identifiers in judgment_pairs:
                #judgment_pairs[identifiers] = {'annotators': {}}
                judgment_pairs[identifiers] = [0, 0, []] # [count, sum(judgment), judgment list]
            #judgment_pairs[identifiers][row['annotator']] = row['judgment']
            judgment_pairs[identifiers][0]+=1
            judgment_pairs[identifiers][1]+=row['judgment']
            judgment_pairs[identifiers][2].append(row['judgment'])

        perword_jpair_count[word] = {k: v for k, v in sorted(judgment_pairs.items(), key=lambda item: item[1][0], reverse=True)}
        
    for word, jpairs in perword_jpair_count.items():
        for jpair, countsumjudg in jpairs.items():
            c, s, j = countsumjudg
            avrg = s/c
            median = statistics.median(j)
            perword_jpair_count[word][jpair] = [c, avrg, median]
            


    #print(groupingdict)
    # outcome: {'1 1': 16089, '2 1': 12736, '2 2': 16967, '1 2': 18727}
    # Mixed pairing, which is expected from the paper, but this confirms the approach I will use for my sampling

    if not prespecified_words:
        filtered_perword_jpair_count = {} # 
        for word, fsets in list(perword_jpair_count.items()):
            filtered_perword_jpair_count[word] = {}

            filtered_perword_jpair_count[word] = {fset: count_avg for fset, count_avg in fsets.items() if count_avg[0] >= specifiers['min_judgments']}

        sorted_filtered = {k: v for k, v in sorted(filtered_perword_jpair_count.items(), key=lambda item: len(item[1]), reverse=True)}
    
        try:
            selected_words = list(sorted_filtered.items())[:specifiers['num_words']]
        except:
            print('Current filters find fewer than specified {} words. Returning all words.'.format(specifiers['num_words']))
            selected_words = list(sorted_filtered.items())
    else:
        selected_words = list(perword_jpair_count.items())


    final_pairs = {}  
    uses = {}
    for word, fsets in selected_words:
        try:
            selected_pairs = list(fsets.items())[:specifiers['num_usepairs']]
        except:
            print("Current filters find fewer than specified {} usepairs for word '{}'. Returning all usepairs.".format(specifiers['num_usepairs'], word))
            selected_pairs = list(fsets.items())
        final_pairs[word] = {fset: countavg for fset, countavg in selected_pairs}
        
        uses[word] = set()
        for fset in selected_pairs:

            uses[word].update(fset[0])

    usepair_dfs = {} #final pair format

    # Reformat uses
    for word, fsets in final_pairs.items():

        pandas_durel_format = {}
        lemmas = []
        id1s = []
        id2s = []
        avg_judgs = []
        medians = []
        for fset, metrics in fsets.items():
            lemmas.append(word)
            id1s.append(list(fset)[0])
            id2s.append(list(fset)[1])
            avg_judgs.append(metrics[1])
            medians.append(metrics[2])     
        pandas_durel_format['lemma'] = lemmas
        pandas_durel_format['identifier1'] = id1s
        pandas_durel_format['identifier2'] = id2s
        pandas_durel_format['avg_judgment'] = avg_judgs
        pandas_durel_format['median_judgment'] = medians


        usepair_dfs[word] = pd.DataFrame.from_dict(pandas_durel_format)

    

    use_dfs = {} # final use format
    for word, fsets in final_pairs.items(): # 
        usedata = worddata[word]['uses']

        filtered_df = usedata.loc[usedata['identifier'].isin(uses[word])] # filter original dataframe to only include uses from among my selected usepairs
        filtered_df = filtered_df.reset_index(drop=True)
        filtered_df = filtered_df.drop(columns=['context_tokenized', 'indexes_target_token_tokenized', 'indexes_target_sentence_tokenized', 'context_lemmatized', 'context_pos'])

        use_dfs[word] = filtered_df

    return usepair_dfs, use_dfs




In [4]:
def get_sentences(word_dict, return_vocab=False):
    '''
    Gets sentences from CCOHA sample
    word_dict: dict where keys = lemmas and values = {'forms': (word forms to match,), 'tags': (NLTK tags to match,)}
    values of keys 'forms', 'tags' must be array-like even if only singleton member. 'lemma' can be more than just the lemma,
    (e.g. '[lemma]_nn' to differentiate lemmas of different parts of speech with the same form), in which case the dict should have key
    'search_lemma' with value [lemma]
    '''
    
    if return_vocab:
        vocab = {}
    sentence_ids = {}
    sentences = {}
    lemmas = word_dict.keys()
    for lemma in lemmas:
        sentence_ids[lemma] = {}
        sentences[lemma] = {}

    print('Finding lemma locations...')    
    for corpus_i in range(1,3): # corpus_i = 1 or 2, the ids for the timeperiods
        for lemma in lemmas:
            sentence_ids[lemma][str(corpus_i)] = []
        with open(os.path.join(os.getcwd(), 'semeval2020_ulscd_eng', 'corpus'+str(corpus_i), 'lemma', 'ccoha'+str(corpus_i)+'.txt')) as f:
            lines = f.read().split('\n')
            for i, line in enumerate(lines):
                for lemma in lemmas:
                    if 'search_lemma' in word_dict[lemma]:
                        search_lemma = word_dict[lemma]['search_lemma']
                    else:
                        search_lemma = lemma
                    
                    if search_lemma in line:
                        sentence_ids[lemma][str(corpus_i)].append(i)

    c = 0
    easy_puncts = [".", ",", "!", "?", ":", ";"] # punctuation that is nearly universally caught by an align-with-lefthand-token heuristic
                                        # obviously not ideal, but this is only for some slight readibility improvements and is not
                                        # computationally necessary
    used_identifiers = {}
    for lemma in lemmas:
        used_identifiers[lemma] = [0]
        # want the identifier to be unique per lemma, not per lemma per corpus, but I'm also keeping the corpora separate for now,
        # so this is the memory solution
    print('Retrieving data for...')
    for corpus_i in range(1, 3):
        print('\tCorpus group', corpus_i)
        for lemma in lemmas:
            sentences[lemma][str(corpus_i)] = []
        with open(os.path.join(os.getcwd(), 'semeval2020_ulscd_eng', 'corpus'+str(corpus_i), 'token', 'ccoha'+str(corpus_i)+'.txt')) as f:
            lines = f.read().split('\n')
            for lemma in lemmas:
 
                print('\t\t', lemma)
                for id in sentence_ids[lemma][str(corpus_i)]:
                    token_line = lines[id]
                    toktagged = pos_tag(word_tokenize(token_line))
                    toks = [tok for tok, tag in toktagged]
                    tags = [tag for tok, tag in toktagged]

                    punct_ids = {}
                    for i in range(len(tags)):
                        if toks[i] in easy_puncts:
                            punct_ids[i] = toks[i]

                    for i, tok in enumerate(toks):
                        if tok.lower() in word_dict[lemma]['forms']: # if the word occurs multiple times, they will get separate entries with the same overall context
                                                                    # (not exactly the same left/right context)
                                                                    # this doesn't seem like a fundamental problem since they are separate uses, but if a lot of uses are from the
                                                                    # same source, the most prevalently used sense might be overrepresented
                            if tags[i] in word_dict[lemma]['tags'] or not word_dict[lemma]['tags']:
                                if not word_dict[lemma]['tags']:
                                    #input dict['tags'] can be empty, in which case word match is enough
                                    #still needs to be in input dict as a falsy iterable, or there will be an error
                                    pos = ''
                                else:
                                    pos = [tag for tag in word_dict[lemma]['tags'] if tag == tags[i]][0]
                                
                                grouping = str(corpus_i)
                                identifier = used_identifiers[lemma][-1]+1
                                used_identifiers[lemma].append(identifier)
                                #context = lines[id] # technically better context, but complicates getting specific slices of character indices for durel

                                target_len = len(tok)
                                target_start = len(' '.join(toks[:i]))+1 
                                
                                # the following until -------------- is a more convoluted than technically necessary way of getting the context.
                                # It is meant to produce contexts without uncessary spaces produced by tokenisation for at least some punctuation,
                                # which is only for readability.
                                # it loops over the line a lot and is thus (relatively) quite a bit more computationally heavy than an approach that takes
                                # the extra spaces as they are. It's not too much computation for my purposes, but could be a problem for more data.
                                target_movement = 0
                                context = []
                                #print('len toks',len(toks))
                                for j in range(len(toks)):
                                    if toks[j] in easy_puncts:
                                        if j < i:
                                            target_start -=1 # punctuation tokens being merged with previous tokens = 
                                                            # fewer spaces added when joining token lists = 
                                                            # target index moves
                                        continue
                                    k = 1
                                    updated_token = toks[j]

                                    if j+k < len(toks):                                    
                                        while toks[j+k] in easy_puncts:
                                            updated_token = updated_token+toks[j+k]
                                            k+=1
                                            if j+k >= len(toks):
                                                break
                                            
                                    context.append(updated_token)


                                new_i = i-target_movement

                                context = ' '.join(context)
                                # ----------------------------


                                indexes_target_token = '{}:{}'.format(str(target_start), str(target_start+target_len))

                                indexes_target_sentence = '0:{}'.format(len(context))
                                                     
                                # not sure if the empty fields are necessary for durel, but might as well include them since work down the line could want to fill them
                                durel_dict = {'lemma': lemma,
                                              'pos': pos,
                                              'date': '', 
                                              'grouping': grouping,
                                              'identifier': lemma+'_'+str(identifier),
                                              'description': '',
                                              'context': context,
                                              'indexes_target_token': indexes_target_token,
                                              'indexes_target_sentence': indexes_target_sentence
                                              } # format for pandas/durel
                                sentences[lemma][str(corpus_i)].append(durel_dict)

    return sentences

            
        

In [9]:
def get_vocab():
    vocab = {}
    for corpus_i in range(1,3):
        with open(os.path.join(os.getcwd(), 'semeval2020_ulscd_eng', 'corpus'+str(corpus_i), 'lemma', 'ccoha'+str(corpus_i)+'.txt')) as f:
            words = f.read().split()
            for word in words:
                if not word in vocab:
                    vocab[word] = {}
                if str(corpus_i) not in vocab[word]:
                    vocab[word][str(corpus_i)] = 0
                vocab[word][str(corpus_i)] +=1
    for word, groupcount in vocab.items():
        for group in ('1', '2'):
            if group not in groupcount:
                vocab[word][group] = 0
    return vocab    


In [10]:
def filter_vocab(vocab, mincount=0, maxcount = float('inf'), slicer_pattern=None):
    '''
    vocab: vocab with structure as per output of get_vocab
    mincount: return words with higher count than this in BOTH corpora
    maxcount: return words with lower count than this in BOTH corpora
    slicer_pattern: Either 2-place array-like (slice(parts of word to inspect), 'string pattern to match').
                    E.g., (slice(-2, None), 'ed') checks if the last two letters of the word is 'ed' OR
                    Falsy value, in which case no pattern-based filtering is done                    
    '''
    # mincount to get words with high enough sample, maxcount to filter out very common words that might not be sought
    # slicer_pattern can also be just a string, in which case the only returned vocab key will be the one exactly matching that input.
    # However, this is only minorly different from just calling the vocab with that string, so makes the function call a bit redundant

    fil_vocab = {}
    if slicer_pattern:
        if type(slicer_pattern) == str: # lets slicer_pattern basically just 
            slicer = slice(0, None)
            pattern = slicer_pattern
        else:
            slicer = slicer_pattern[0]
            pattern = slicer_pattern[1]
    for word, groupcount in vocab.items():
        if mincount <= groupcount['1'] <= maxcount and mincount <= groupcount['2'] <= maxcount:
            keep = True
            if slicer_pattern:
                if not word[slicer] == pattern:
                    keep = False
            if keep:
                fil_vocab[word] = vocab[word]

    vocab_items = list(fil_vocab.items())
    value_sum_sorted = sorted(vocab_items, key=lambda x: sum(x[1].values()), reverse=True)
    sorted_vocab = {word: values for word, values in value_sum_sorted}

    return sorted_vocab

    

In [11]:
vocab = get_vocab()

In [12]:
filtered_vocab = filter_vocab(vocab, mincount = 5, slicer_pattern=(slice(-2, None), 'ed'))

In [13]:
filtered_vocab

{'need': {'1': 1600, '2': 4590},
 'united': {'1': 1584, '2': 2052},
 'indeed': {'1': 2344, '2': 775},
 'hundred': {'1': 1903, '2': 1030},
 'bed': {'1': 1012, '2': 1785},
 'red': {'1': 1004, '2': 1765},
 'proceed': {'1': 1501, '2': 289},
 'succeed': {'1': 827, '2': 322},
 'speed': {'1': 391, '2': 693},
 'feed': {'1': 344, '2': 666},
 'supposed': {'1': 383, '2': 575},
 'concerned': {'1': 233, '2': 513},
 'interested': {'1': 236, '2': 509},
 'sacred': {'1': 550, '2': 148},
 'married': {'1': 239, '2': 423},
 'deed': {'1': 571, '2': 86},
 'shed': {'1': 397, '2': 197},
 'tired': {'1': 185, '2': 393},
 'seed': {'1': 322, '2': 240},
 'distinguished': {'1': 440, '2': 95},
 'pleased': {'1': 337, '2': 169},
 'naked': {'1': 222, '2': 277},
 'involved': {'1': 125, '2': 364},
 'exceed': {'1': 323, '2': 132},
 'beloved': {'1': 290, '2': 112},
 'increased': {'1': 153, '2': 240},
 'inclined': {'1': 280, '2': 104},
 'limited': {'1': 141, '2': 237},
 'ed': {'1': 68, '2': 301},
 'armed': {'1': 124, '2': 2

In [6]:
def inspect_contexts(per_word_sentences):
    for word, groupdata in per_word_sentences.items():
        print('word:',word)
        print()
        for group, data in groupdata.items():
            print('group:',group)
            print("instances of '{}' in corpus {}: {}".format(word, group, len(data)))
            print()
            for d in data:
                print('identifier:',d['identifier'])
                print('context:')
                print(d['context'])
                print()
            print()
        print()
    print('-----------------')
    print()
    print()
        

In [5]:
def get_use_dfs(per_word_sentences, target_len=None, ignored_ids=None):
    '''
    returns {word: dataframe} for each word in input
    per_word_sentences: output format of get_sentences
    target_len: uses per word per corpus. If target_len is provided: output df has target_len*2 rows unless a corpus has fewer uses of that word,
                in which case all are returned. If not target_len: all uses are returned
    ignored_ids: specific lines to avoid, e.g., in case of uninterpretable use
    '''
    
    print('Getting dataframes of...')
    pws = per_word_sentences.copy()
    print(ignored_ids)
    if ignored_ids: # for if context inspection revealed any problematic examples
        for word, indices in ignored_ids.items():
            for group in ['1', '2']:
                to_pop = []
                for i, data in enumerate(pws[word][group]):
                    print(data['identifier'])
                    if data['identifier'] in indices: # the identifiers will no longer be consecutive if any are removed,
                                                        #but they just need to be unique
                        print('found ', data['identifier'])
                        to_pop.append(i)
                for idx in to_pop:
                    pws[word][group].pop(idx)

    filtered_uses = {}
    for word in pws.keys():
        print('\t', word)
        merged_corpora = []
        for group in ['1', '2']:
            uses = pws[word][group]
            corpus_target_len = target_len
            if corpus_target_len:
                if corpus_target_len > len(uses):
                    corpus_target_len = None
                    print("For '{}', corpus {}: Assigned target length exceeds existing instances. Returning all instances.".format(word, group))
            if corpus_target_len:
                shuffle(uses)
                uses = uses[:target_len]

            print("'{}' group {}: {} uses in dataframe.".format(word, group, len(uses)))
            merged_corpora.extend(uses)
        filtered_uses[word] = merged_corpora
    

    use_dfs = {}
    for word, data in filtered_uses.items():
        use_dfs[word] = pd.DataFrame(data)


    return use_dfs

                


                
        


In [6]:
def write_to_file(df_dict, df_type, dirname):
    rootdir = os.path.join(os.getcwd(), dirname)
    if not os.path.exists(rootdir):
        os.mkdir(rootdir)
    for word, df in df_dict.items():
        worddir = os.path.join(rootdir, word)
        if not os.path.exists(worddir):
            os.mkdir(worddir)
        filepath = os.path.join(worddir, word+'_'+df_type)
        df.to_csv(path_or_buf=filepath, sep='\t', index=False)

In [68]:
def reannotation_update():
    '''
    updates previous annotations with new annotation data. Adds new judgment and sentences for better interpretability
    '''

    reannotations = os.listdir('./reannotations')
    new_data = []
    for word in reannotations:
       
        if word in os.listdir('./validation_annotations'):
            new_annotations = pd.read_csv(os.path.join(os.getcwd(), 'reannotations', word, 'annotations.csv'), sep='\t')
            annotpath = os.path.join(os.getcwd(), 'validation_annotations', word, word+'_instances')
            annotations = pd.read_csv(annotpath, sep='\t')
            # DURel shuffles the pairs, so I can't just make a new column with list(new_annotations['judgment'])

            identifiers_to_judgment = {frozenset((new_annotations.iloc[i]['identifier1'], new_annotations.iloc[i]['identifier2'])):
                             new_annotations.iloc[i]['judgment'] for i in range(len(new_annotations))}
            
            # These are not necessary for computation, but it allows for the inspection of the usepairs in text-format
            # (now that I no longer need to have the DURel format to take into account):
            usedf = pd.read_csv(os.path.join(os.getcwd(), 'validation_annotations', word, word+'_uses'), sep='\t')
            id_to_text = {usedf.iloc[i]['identifier']: usedf.iloc[i]['context'] for i in range(len((usedf)))}
            new_judgments = []
            sentences = []

            for i in range(len(annotations)):
                ids = frozenset((annotations.iloc[i]['identifier1'], annotations.iloc[i]['identifier2']))
                new_judgment = identifiers_to_judgment[ids]
                text_format = [id_to_text[identifier] for identifier in list(ids)]
                new_judgments.append(new_judgment)
                sentences.append(text_format)
                
            annotations['new_judgments'] = new_judgments
            annotations['sentences'] = sentences

            annotations.to_csv(path_or_buf=annotpath, sep='\t', index=False)


In [29]:
def get_rank_correlations():

    words = os.listdir('./validation_annotations')

    correlations = {'Mean': [], 'Median': []}

    combined_means = []
    combined_medians = []
    combined_new = []
    divergence_data = {'combined': {'total':0, 'same':0, 'one': 0, 'more':0}}
    divergence_data_pointfives = {'combined': {'total':0, 'same':0, 'one': 0, 'more':0}}


    num_point5 = {}
    
    for word in words:
        annotations = pd.read_csv(os.path.join(os.getcwd(), 'validation_annotations', word, word+'_instances'), sep='\t')
        means = list(annotations['avg_judgment'])
        medians = list(annotations['median_judgment'])
        new = list(annotations['new_judgments'])
        
        combined_means.extend(means)
        combined_medians.extend(medians)
        combined_new.extend(new)

        num_point5[word] = len([m for m in medians if str(m)[-1] =='5'])

        #meanscorr = round(spearmanr(new, means)[0], 2)
        #medianscorr = round(spearmanr(new, medians)[0],2)
        meanscorr = spearmanr(new, means)[0]
        medianscorr = spearmanr(new, medians)[0]
        correlations['Mean'].append(meanscorr)
        correlations['Median'].append(medianscorr)

        divergence_data_pointfives[word] = {'total':0, 'same':0, 'one': 0, 'more':0}
        divergence_data[word]= {'total':0, 'same':0, 'one': 0, 'more':0}
        
        for m, n in zip(medians, new):
            pointfive= False
            if str(m)[-1] == '5':
                pointfive=True
                #divergence_data_pointfives[word]['total']+=1
                #divergence_data_pointfives['combined']['total']+=1


            if m == n:
                if not pointfive:
                    divergence_data[word]['same']+=1
                    divergence_data['combined']['same']+=1
                divergence_data_pointfives[word]['same']+=1
                divergence_data_pointfives['combined']['same']+=1
            elif -1 <= m-n <= 1:
                if not pointfive:
                    divergence_data[word]['one']+=1
                    divergence_data['combined']['one']+=1
                divergence_data_pointfives[word]['one']+=1
                divergence_data_pointfives['combined']['one']+=1
            else:

                if not pointfive:
                    divergence_data[word]['more']+=1
                    divergence_data['combined']['more']+=1
                divergence_data_pointfives[word]['more']+=1
                divergence_data_pointfives['combined']['more']+=1

            if not pointfive:
                divergence_data[word]['total']+=1
                divergence_data['combined']['total']+=1
            divergence_data_pointfives[word]['total']+=1
            divergence_data_pointfives['combined']['total']+=1
            

    # Divergence data was just some post-hoc inclusions and is only briefly mentioned in the paper

    for k in divergence_data.keys():
        divergence_data[k]['same%'] = divergence_data[k]['same']/divergence_data[k]['total']
        divergence_data[k]['one%'] = divergence_data[k]['one']/divergence_data[k]['total']
        divergence_data[k]['more%'] = divergence_data[k]['more']/divergence_data[k]['total']

        divergence_data_pointfives[k]['same%'] = divergence_data_pointfives[k]['same']/divergence_data_pointfives[k]['total']
        divergence_data_pointfives[k]['one%'] = divergence_data_pointfives[k]['one']/divergence_data_pointfives[k]['total']
        divergence_data_pointfives[k]['more%'] = divergence_data_pointfives[k]['more']/divergence_data_pointfives[k]['total']




        

    

        #correlations[word] = {}
        #correlations[word]['Mean'] = spearmanr(new, means)
        #correlations[word]['Median'] = spearmanr(new, medians)

    if len(words) > 1:
        if len(words) == 2:
            comb='both words'
        else:
            comb='all words'
        words.append(comb)
        #meanscorr = round(spearmanr(combined_new, combined_means)[0], 2)
        #medianscorr = round(spearmanr(combined_new, combined_medians)[0],2)
        meanscorr = spearmanr(combined_new, combined_means)[0]
        medianscorr = spearmanr(combined_new, combined_medians)[0]
        correlations['Mean'].append(meanscorr)
        correlations['Median'].append(medianscorr)
    
        #correlations[comb] = {}
        #correlations[comb]['Mean'] = spearmanr(combined_new, combined_means)
        #correlations[comb]['Median'] = spearmanr(combined_new, combined_medians)

    #for word in words:
    #    correlations['Word'].append(word)

    print(correlations)


    print('both, median', spearmanr(combined_new, combined_medians))
    print('both, means', spearmanr(combined_new, combined_means))

    print(correlations)

    df = pd.DataFrame.from_dict(correlations)
    #print(df)
    #return df, num_point5

    def df_style(styler):
        styler.set_caption("Correlation of new annotations with existing ones")
        styler.format(precision=3)
        styler.format_index(str.upper, axis=1)
        styler.relabel_index(words, axis=0)
        return styler
    

    import dataframe_image as dfi

    #df = pd.DataFrame(np.random.rand(6,4))
    df_styled = df.style \
            .format(precision=3) \
            .format_index(str.upper, axis=1) \
            .relabel_index(words, axis=0) \
            .set_caption("Validation correlations")

    dfi.export(df_styled, 'correlations.png',table_conversion = 'matplotlib')
    return df, divergence_data, divergence_data_pointfives
   

In [30]:
df, dd, ddp = get_rank_correlations()

{'Mean': [0.8826923947615539, 0.5553316587662009, 0.8371273389499156], 'Median': [0.8699774895544058, 0.5385252730345806, 0.8258126178146006]}
both, median SignificanceResult(statistic=0.8258126178146006, pvalue=1.576019201332923e-13)
both, means SignificanceResult(statistic=0.8371273389499156, pvalue=3.6002201300487115e-14)
{'Mean': [0.8826923947615539, 0.5553316587662009, 0.8371273389499156], 'Median': [0.8699774895544058, 0.5385252730345806, 0.8258126178146006]}


In [26]:
df

Unnamed: 0,Mean,Median
0,0.882692,0.869977
1,0.555332,0.538525
2,0.837127,0.825813


In [31]:
ddp

{'combined': {'total': 50,
  'same': 30,
  'one': 20,
  'more': 0,
  'same%': 0.6,
  'one%': 0.4,
  'more%': 0.0},
 'record_nn': {'total': 25,
  'same': 16,
  'one': 9,
  'more': 0,
  'same%': 0.64,
  'one%': 0.36,
  'more%': 0.0},
 'stab_nn': {'total': 25,
  'same': 14,
  'one': 11,
  'more': 0,
  'same%': 0.56,
  'one%': 0.44,
  'more%': 0.0}}

In [32]:
dd

{'combined': {'total': 36,
  'same': 30,
  'one': 6,
  'more': 0,
  'same%': 0.8333333333333334,
  'one%': 0.16666666666666666,
  'more%': 0.0},
 'record_nn': {'total': 19,
  'same': 16,
  'one': 3,
  'more': 0,
  'same%': 0.8421052631578947,
  'one%': 0.15789473684210525,
  'more%': 0.0},
 'stab_nn': {'total': 17,
  'same': 14,
  'one': 3,
  'more': 0,
  'same%': 0.8235294117647058,
  'one%': 0.17647058823529413,
  'more%': 0.0}}

In [173]:
reannotation_update()

In [9]:
worddata, skipped_words = get_dwug_data()
usepair_dfs, use_dfs = get_uses_and_pairs(worddata, skipped_words=skipped_words, prespecified_words=('record_nn','stab_nn'))


Skipping attack_nn
Skipping gas_nn
Skipping lass_nn
Skipping word_nn
Getting prespecified words. The only specifier filter that will be applied is 'num_usepairs'.


In [20]:
write_to_file(usepair_dfs, 'instances')
write_to_file(use_dfs, 'uses')

In [21]:
candidates = {'disabled': {'forms': ('disabled',), 'tags': ('JJ', 'JJR', 'JJS')},
              'crippled': {'forms': ('crippled',), 'tags': ('JJ', 'JJR', 'JJS')},
              'depressed': {'forms': ('depressed',), 'tags': ('JJ', 'JJR', 'JJS')},
              'lame': {'forms': ('lame', 'lamer', 'lamest'), 'tags': ('JJ', 'JJR', 'JJS')},
              'sentence': {'forms': ('sentence', 'sentences'), 'tags': ('NN', 'NNS')},
              'meaning': {'forms': ('meaning', 'meanings'), 'tags': ('NN', 'NNS')},
              'hit': {'forms': ('hit', 'hits', 'hitting'), 'tags': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')},
              'strike_nn': {'forms': ('strike', 'strikes'), 'tags': ('NN', 'NNS'), 'search_lemma': 'strike'},
              'stroke_nn': {'forms': ('stroke', 'strokes'), 'tags': ('NN', 'NNS'), 'search_lemma': 'stroke'},
              'strike_vb': {'forms': ('strike', 'strikes','struck', 'striking'), 'tags': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'search_lemma': 'strike'},
              #'hit': {'forms': ('hit',), 'tags': ('NN', 'NNS')},
              'meter': {'forms': ('meter', 'metre', 'meters', 'metres'), 'tags': ('NN', 'NNS')},
              'hand': {'forms': ('hand', 'hands'), 'tags': ('NN', 'NNS')}, # head as validationautomation 
              'automation': {'forms': ('automation',), 'tags': ('NN', 'NNS')}, # not really NNS (and it likely does not occur), but shouldn't catch unsought meanings             

              }
candidates2 = {'lame': {'forms': ('lame', 'lamer', 'lamest'), 'tags': ('JJ', 'JJR', 'JJS')},
               'depressed': {'forms': ('depressed',), 'tags': ('JJ', 'JJR', 'JJS')},
               'crippled': {'forms': ('crippled',), 'tags': ('JJ', 'JJR', 'JJS')},
               'disabled': {'forms': ('disabled',), 'tags': ('JJ', 'JJR', 'JJS')},
               }

per_word_sentences = get_sentences(candidates2)
ccoha_use_dfs = get_use_dfs(per_word_sentences, ignored_ids={'lame': ('lame_30','lame_3', 'lame_33' ), }, target_len=10) #ignored_ids={'disabled':(1,2,5)

Finding lemma locations...
Retrieving data for...
	Corpus group 1
		 lame
		 depressed
		 crippled
		 disabled
	Corpus group 2
		 lame
		 depressed
		 crippled
		 disabled
Getting dataframes of...
{'lame': ('lame_30', 'lame_3', 'lame_33')}
lame_1
lame_2
lame_3
found  lame_3
lame_4
lame_5
lame_6
lame_7
lame_8
lame_9
lame_10
lame_11
lame_12
lame_13
lame_14
lame_15
lame_16
lame_17
lame_18
lame_19
lame_20
lame_21
lame_22
lame_23
lame_24
lame_25
lame_26
lame_27
lame_28
lame_29
lame_30
found  lame_30
lame_31
lame_32
lame_33
found  lame_33
lame_34
lame_35
	 lame
'lame' group 1: 10 uses in dataframe.
'lame' group 2: 10 uses in dataframe.
	 depressed
'depressed' group 1: 10 uses in dataframe.
'depressed' group 2: 10 uses in dataframe.
	 crippled
For 'crippled', corpus 1: Assigned target length exceeds existing instances. Returning all instances.
'crippled' group 1: 8 uses in dataframe.
For 'crippled', corpus 2: Assigned target length exceeds existing instances. Returning all instances.
'cripple

In [22]:
write_to_file(ccoha_use_dfs, 'uses')

In [94]:
def main():
    worddata, skipped_words = get_dwug_data()#target_words=('stab_nn', 'savage_nn'), target_pos=('vb')
    usepair_dfs, use_dfs = get_uses_and_pairs(worddata, skipped_words=skipped_words, prespecified_words=('word_nn','stab_nn'))

    candidates = {'disabled': {'forms': ('disabled',), 'tags': ('JJ', 'JJR', 'JJS')},
              'crippled': {'forms': ('crippled',), 'tags': ('JJ', 'JJR', 'JJS')},
              'depressed': {'forms': ('depressed',), 'tags': ('JJ', 'JJR', 'JJS')},
              'lame': {'forms': ('lame', 'lamer', 'lamest'), 'tags': ('JJ', 'JJR', 'JJS')}
              }
    per_word_sentences = get_sentences(candidates)
    ccoha_use_dfs = get_use_dfs(per_word_sentences, target_len=10) #ignored_ids={'disabled':(1,2,5)